Confluence Knowledge Base MCP Server

by marcciosilva
confluence_knowledge_base.py (13.6 kB)
#!/usr/bin/env python3
"""
Local Documentation Knowledge Base MCP Server

Reads documentation from local files (including PDFs), creates vector embeddings,
and provides semantic search capabilities for natural language Q&A with Gemini CLI.
"""

import os
import sys
import json
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from mcp.server.fastmcp import FastMCP
import time

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastMCP server
mcp = FastMCP("Local Documentation Knowledge Base")

# Configuration
DOCS_DIRECTORY = os.getenv("DOCS_DIRECTORY", "")  # Directory containing documentation files
INDEX_PATH = Path.home() / ".confluence_mcp" / "index"
CHUNK_SIZE = 1000   # Characters per chunk
CHUNK_OVERLAP = 200  # Overlap between chunks


class DocumentLoader:
    """Handles loading and processing local documentation files."""

    def __init__(self, docs_directory: str):
        self.docs_directory = Path(docs_directory) if docs_directory else None

    def load_all_documents(self) -> List[Dict[str, Any]]:
        """Load all documents from the specified directory."""
        if not self.docs_directory:
            logger.error("DOCS_DIRECTORY not specified")
            return []
        if not self.docs_directory.exists():
            logger.error(f"Directory not found: {self.docs_directory}")
            return []
        if not os.access(self.docs_directory, os.R_OK):
            logger.error(f"No read access to directory: {self.docs_directory}")
            return []

        all_documents = []
        supported_extensions = {'.pdf', '.txt', '.md'}

        logger.info(f"Loading documents from: {self.docs_directory}")

        for file_path in self.docs_directory.rglob('*'):
            if file_path.is_file() and file_path.suffix.lower() in supported_extensions:
                try:
                    content = self._load_file(file_path)
                    if content:
                        all_documents.append({
                            "id": str(file_path.relative_to(self.docs_directory)),
                            "title": file_path.stem,
                            "source": "local",
                            "content": content,
                            "path": str(file_path),
                            "extension": file_path.suffix
                        })
                        logger.info(f"  Loaded: {file_path.name}")
                except Exception as e:
                    logger.warning(f"  Failed to load {file_path.name}: {e}")

        logger.info(f"Total documents loaded: {len(all_documents)}")
        return all_documents

    def _load_file(self, file_path: Path) -> str:
        """Load content from a single file based on its type."""
        if file_path.suffix.lower() == '.pdf':
            return self._load_pdf(file_path)
        else:
            # .txt, .md, or other text files
            return self._load_text(file_path)

    @staticmethod
    def _load_pdf(file_path: Path) -> str:
        """Extract text from PDF file."""
        try:
            import PyPDF2
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                text_parts = []
                for page in reader.pages:
                    text = page.extract_text()
                    if text:
                        text_parts.append(text)
                return '\n'.join(text_parts)
        except ImportError:
            logger.warning(f"PyPDF2 not installed, skipping PDF: {file_path.name}")
            return ""
        except Exception as e:
            logger.warning(f"Error reading PDF {file_path.name}: {e}")
            return ""

    @staticmethod
    def _load_text(file_path: Path) -> str:
        """Load plain text file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with latin-1 encoding as fallback
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logger.warning(f"Error reading text file {file_path.name}: {e}")
                return ""

    @staticmethod
    def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
        """Split text into overlapping chunks."""
        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]
            # Try to break at sentence boundary
            if end < len(text):
                last_period = chunk.rfind('.')
                last_newline = chunk.rfind('\n')
                break_point = max(last_period, last_newline)
                if break_point > chunk_size * 0.5:  # Only break if we're past halfway
                    chunk = chunk[:break_point + 1]
                    end = start + break_point + 1
            chunks.append(chunk.strip())
            start = end - overlap
        return [c for c in chunks if c]  # Filter empty chunks


class VectorStore:
    """Manages vector embeddings and similarity search."""

    def __init__(self, persist_directory: Path):
        self.persist_directory = persist_directory
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        # Initialize ChromaDB
        self.client = chromadb.PersistentClient(
            path=str(persist_directory),
            settings=Settings(anonymized_telemetry=False)
        )

        # Initialize embedding model
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name="confluence_docs",
            metadata={"hnsw:space": "cosine"}
        )

    def index_documents(self, documents: List[Dict[str, Any]], chunk_size: int, overlap: int):
        """Index documents into vector store."""
        logger.info(f"Indexing {len(documents)} documents...")

        # Clear existing collection
        self.client.delete_collection("confluence_docs")
        self.collection = self.client.create_collection(
            name="confluence_docs",
            metadata={"hnsw:space": "cosine"}
        )

        all_chunks = []
        all_metadata = []
        all_ids = []

        for doc in documents:
            # Create chunks from document content
            chunks = DocumentLoader.chunk_text(doc["content"], chunk_size, overlap)

            # Prepare metadata for each chunk
            for i, chunk in enumerate(chunks):
                chunk_id = f"{doc['id']}_chunk_{i}".replace('/', '_').replace('\\', '_')
                all_chunks.append(chunk)
                all_metadata.append({
                    "doc_id": doc["id"],
                    "doc_title": doc["title"],
                    "source": doc["source"],
                    "path": doc["path"],
                    "extension": doc["extension"],
                    "chunk_index": i,
                    "total_chunks": len(chunks)
                })
                all_ids.append(chunk_id)

        logger.info(f"Created {len(all_chunks)} chunks from {len(documents)} documents")
        logger.info("Generating embeddings...")

        # Add to collection in batches
        batch_size = 100
        for i in range(0, len(all_chunks), batch_size):
            batch_chunks = all_chunks[i:i + batch_size]
            batch_metadata = all_metadata[i:i + batch_size]
            batch_ids = all_ids[i:i + batch_size]

            self.collection.add(
                documents=batch_chunks,
                metadatas=batch_metadata,
                ids=batch_ids
            )
            logger.info(f"  Indexed {min(i + batch_size, len(all_chunks))}/{len(all_chunks)} chunks")

        logger.info("Indexing complete!")

    def search(self, query: str, n_results: int = 5) -> List[Dict[str, Any]]:
        """Search for relevant chunks."""
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )

        chunks = []
        for i in range(len(results['documents'][0])):
            chunks.append({
                "content": results['documents'][0][i],
                "metadata": results['metadatas'][0][i],
                "distance": results['distances'][0][i] if 'distances' in results else None
            })
        return chunks


# Initialize components
logger.info("Initializing Local Documentation Knowledge Base...")
doc_loader = DocumentLoader(DOCS_DIRECTORY)
vector_store = VectorStore(INDEX_PATH)

# Load and index documents on startup
logger.info("Loading documents from local directory...")
documents = doc_loader.load_all_documents()

if not documents:
    logger.warning("No documents were loaded. Please check DOCS_DIRECTORY configuration.")
else:
    logger.info(f"Successfully loaded {len(documents)} document(s)")
    logger.info("Building vector index...")
    vector_store.index_documents(documents, CHUNK_SIZE, CHUNK_OVERLAP)

    # Save metadata
    index_metadata_file = INDEX_PATH / "metadata.json"
    with open(index_metadata_file, 'w') as f:
        json.dump({
            "total_documents": len(documents),
            "docs_directory": str(DOCS_DIRECTORY),
            "indexed_at": time.time()
        }, f)

    logger.info(f"Indexed {len(documents)} document(s) from: {DOCS_DIRECTORY}")


# MCP Resources - Provide context on demand
@mcp.resource("confluence://knowledge-base")
def get_knowledge_base_info() -> str:
    """Provides information about the local documentation knowledge base."""
    index_metadata_file = INDEX_PATH / "metadata.json"
    if index_metadata_file.exists():
        with open(index_metadata_file, 'r') as f:
            metadata = json.load(f)
        return f"""Local Documentation Knowledge Base Status:
- Total Documents: {metadata.get('total_documents', 0)}
- Directory: {metadata.get('docs_directory', 'N/A')}
- Last Indexed: {time.ctime(metadata.get('indexed_at', 0))}

This knowledge base contains your local documentation files. You can ask
natural language questions about your systems, and I'll search the
documentation to provide accurate answers.
"""
    return "Knowledge base not initialized yet."


# MCP Tools
@mcp.tool()
def ask_documentation(question: str, num_sources: int = 5) -> str:
    """
    Search the Confluence knowledge base for information relevant to a question.
    This retrieves the most relevant documentation chunks to answer
    domain-specific questions.

    Args:
        question: The question to ask about your systems/documentation
        num_sources: Number of relevant document chunks to retrieve (default 5)

    Returns:
        Relevant documentation content with source references
    """
    try:
        # Search vector store
        results = vector_store.search(question, n_results=num_sources)

        if not results:
            return "No relevant documentation found for this question."

        # Format response
        output = [f"Found {len(results)} relevant documentation sections:\n"]

        for i, result in enumerate(results, 1):
            metadata = result['metadata']
            content = result['content']
            output.append(f"\n--- Source {i}: {metadata['doc_title']} ({metadata['extension']}) ---")
            output.append(f"Path: {metadata['path']}")
            output.append(f"\nContent:\n{content}\n")

        return "\n".join(output)

    except Exception as e:
        return f"Error searching knowledge base: {str(e)}"


@mcp.tool()
def reindex_documents() -> str:
    """
    Re-load and re-index all local documentation files.
    Use this when documentation has been updated.

    Returns:
        Status of the reindexing operation
    """
    try:
        logger.info("\nReindexing local documentation...")
        documents = doc_loader.load_all_documents()

        if not documents:
            return "No documents found to index. Please check DOCS_DIRECTORY configuration."

        vector_store.index_documents(documents, CHUNK_SIZE, CHUNK_OVERLAP)

        # Save metadata
        index_metadata_file = INDEX_PATH / "metadata.json"
        with open(index_metadata_file, 'w') as f:
            json.dump({
                "total_documents": len(documents),
                "docs_directory": str(DOCS_DIRECTORY),
                "indexed_at": time.time()
            }, f)

        return f"Successfully reindexed {len(documents)} document(s) from: {DOCS_DIRECTORY}"

    except Exception as e:
        return f"Error reindexing: {str(e)}"


if __name__ == "__main__":
    # Check for --index-only flag
    index_only = "--index-only" in sys.argv

    # Validate configuration
    if not DOCS_DIRECTORY:
        logger.warning(
            "DOCS_DIRECTORY environment variable not set. "
            "Server will start but no documents will be loaded."
        )

    logger.info("\n" + "=" * 60)
    logger.info("Local Documentation Knowledge Base MCP Server Ready!")
    logger.info("=" * 60)

    # If --index-only flag is set, exit after indexing
    if index_only:
        logger.info("\nIndex-only mode: Exiting after building index.")
        sys.exit(0)

    # Start the MCP server
    mcp.run()
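
Usage

The server reads its configuration from the DOCS_DIRECTORY environment variable and accepts an --index-only flag that builds the vector index and exits without starting the MCP server. Below is a minimal sketch of a one-off index build; the docs path is a placeholder, and it assumes the listing above has been saved as confluence_knowledge_base.py in the current directory.

    import os
    import subprocess
    import sys

    # DOCS_DIRECTORY and --index-only are read by the server exactly as in the
    # listing above; the path below is a placeholder for your own docs folder.
    env = os.environ.copy()
    env["DOCS_DIRECTORY"] = "/path/to/your/docs"  # directory of .pdf/.txt/.md files

    # --index-only builds the vector index under ~/.confluence_mcp/index and
    # exits before the MCP server starts serving requests.
    subprocess.run(
        [sys.executable, "confluence_knowledge_base.py", "--index-only"],
        env=env,
        check=True,
    )

When the script is instead launched by an MCP client (for example through Gemini CLI's MCP server configuration), the same DOCS_DIRECTORY variable must be present in the server process's environment, since documents are loaded and indexed at startup; the reindex_documents tool can then be used to rebuild the index after the documentation changes.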
