"""MCP server with semantic search over documents fetched from GitHub.
Uses ChromaDB for vector storage and OpenAI embeddings for semantic search.
Documents are fetched from the GitHub repo and indexed on startup.
"""
import hashlib
import json
import os
import sys
import urllib.request
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from mcp.server.fastmcp import FastMCP
# GitHub repository ("owner/name") the documents are fetched from.
GITHUB_REPO = "itsphily/mcp_generix"
# Branch whose tree is listed and whose file contents are read.
GITHUB_BRANCH = "main"
# Folder inside the repo that holds the documents; only paths under it are used.
DOCS_PATH = "docs"
# ChromaDB persistence directory, placed next to this source file.
CHROMA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".chroma")
CHUNK_SIZE = 1000  # characters per chunk
# Characters shared by consecutive chunks: the chunk start advances by
# CHUNK_SIZE - CHUNK_OVERLAP on every step.
CHUNK_OVERLAP = 200
def github_api(endpoint: str, timeout: float = 30.0) -> dict | list:
    """Fetch a GitHub REST API endpoint and return the decoded JSON body.

    Args:
        endpoint: Path (and optional query string) relative to
            ``https://api.github.com/``. It is interpolated as-is, so the
            caller is responsible for any percent-encoding.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The parsed JSON payload (an object or an array, depending on the
        endpoint).

    Raises:
        urllib.error.URLError: On connection failures; non-success HTTP
            statuses surface as its ``HTTPError`` subclass.
        TimeoutError: If the server does not answer within ``timeout``.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = f"https://api.github.com/{endpoint}"
    req = urllib.request.Request(url, headers={"Accept": "application/vnd.github.v3+json"})
    # Without an explicit timeout a stalled connection would block forever,
    # which also blocks server start-up because indexing runs at import time.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
def fetch_docs_from_github() -> list[dict]:
    """Fetch all documents from the GitHub repo's docs/ folder.

    The recursive git tree of ``GITHUB_BRANCH`` is listed once, then the
    contents of every blob under ``DOCS_PATH/`` (except ``.gitkeep`` files)
    are fetched one request at a time.

    Returns:
        A list of ``{"filename": ..., "content": ...}`` dicts, where
        ``filename`` is the path relative to ``DOCS_PATH/``.

    Raises:
        Whatever ``github_api`` raises on network/HTTP failures, and
        ``UnicodeDecodeError`` if a base64-encoded file is not valid UTF-8.
    """
    import base64
    from urllib.parse import quote

    prefix = f"{DOCS_PATH}/"
    tree = github_api(f"repos/{GITHUB_REPO}/git/trees/{GITHUB_BRANCH}?recursive=1")

    doc_files = []
    for item in tree.get("tree", []):
        path = item["path"]
        if (
            not path.startswith(prefix)
            or item["type"] != "blob"
            or path.endswith(".gitkeep")
        ):
            continue

        # Percent-encode the path (slashes are kept) so that file names with
        # spaces, '#', '?' or non-ASCII characters still form a valid URL.
        blob = github_api(
            f"repos/{GITHUB_REPO}/contents/{quote(path)}?ref={GITHUB_BRANCH}"
        )
        if blob.get("encoding") == "base64":
            content = base64.b64decode(blob["content"]).decode("utf-8")
        else:
            content = blob.get("content", "")

        doc_files.append({"filename": path[len(prefix):], "content": content})
    return doc_files
def chunk_text(text: str, filename: str) -> list[dict]:
"""Split text into overlapping chunks for better retrieval."""
chunks = []
start = 0
chunk_index = 0
while start < len(text):
end = start + CHUNK_SIZE
chunk = text[start:end]
if chunk.strip():
chunks.append({
"id": f"{filename}::chunk{chunk_index}",
"text": chunk,
"metadata": {"source": filename, "chunk_index": chunk_index},
})
chunk_index += 1
start += CHUNK_SIZE - CHUNK_OVERLAP
return chunks
def content_hash(content: str) -> str:
    """Return a hex fingerprint of ``content`` for change detection.

    Args:
        content: Document text; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character hexadecimal MD5 digest of the encoded text.
    """
    # MD5 only serves as a cheap fingerprint here, not as a security measure.
    # Declaring that keeps the call usable on builds that restrict MD5
    # (e.g. FIPS mode); the resulting digest is unchanged.
    return hashlib.md5(content.encode("utf-8"), usedforsecurity=False).hexdigest()
def index_documents(collection: chromadb.Collection) -> str:
    """Fetch docs from GitHub and index into ChromaDB. Returns status message.

    Files whose stored content hash matches the freshly fetched content are
    skipped; changed files have their old chunks deleted and are re-chunked;
    sources present in the collection but no longer in the repo are removed.

    Args:
        collection: The ChromaDB collection that holds the document chunks.

    Returns:
        A summary of added/updated/removed files, or an error message if the
        GitHub fetch failed (in which case the collection is left untouched).
    """
    try:
        doc_files = fetch_docs_from_github()
    except Exception as e:
        return f"Error fetching from GitHub: {e}"

    # One read of the collection gives both the chunk ids and the stored hash
    # of every source, so no per-file lookup is needed afterwards.
    existing = collection.get(include=["metadatas"])
    ids_by_source: dict[str, list[str]] = {}
    hash_by_source: dict[str, str | None] = {}
    for chunk_id, meta in zip(existing["ids"], existing["metadatas"] or []):
        meta = meta or {}  # tolerate entries stored without metadata
        source = meta.get("source", "")
        ids_by_source.setdefault(source, []).append(chunk_id)
        # Every chunk of a file carries the same hash; keep the first one seen.
        hash_by_source.setdefault(source, meta.get("hash"))

    current_sources = set()
    added = 0
    updated = 0
    for doc in doc_files:
        filename = doc["filename"]
        content = doc["content"]
        current_sources.add(filename)
        c_hash = content_hash(content)

        if filename in ids_by_source:
            if hash_by_source.get(filename) == c_hash:
                continue  # File unchanged, skip
            # File changed: drop the stale chunks before re-adding.
            collection.delete(ids=ids_by_source[filename])
            updated += 1
        else:
            added += 1

        chunks = chunk_text(content, filename)
        if chunks:
            collection.add(
                ids=[c["id"] for c in chunks],
                documents=[c["text"] for c in chunks],
                metadatas=[{**c["metadata"], "hash": c_hash} for c in chunks],
            )

    # Remove sources that no longer exist in the repo.
    removed = 0
    for source in ids_by_source.keys() - current_sources:
        collection.delete(ids=ids_by_source[source])
        removed += 1

    return f"Indexed: {added} added, {updated} updated, {removed} removed. Total files: {len(doc_files)}."
# --- Initialize ChromaDB and embedding function ---
# Embeddings are produced by OpenAI; the key is read from OPENAI_API_KEY and
# falls back to an empty string when the variable is not set.
embedding_fn = OpenAIEmbeddingFunction(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    model_name="text-embedding-3-small",
)
# Persistent client rooted at CHROMA_DIR.
client = chromadb.PersistentClient(path=CHROMA_DIR)
# Reuse the "docs" collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(
    name="docs",
    embedding_function=embedding_fn,
)
# Index documents on startup. This runs at import time; the status message is
# written to stderr (presumably to keep stdout free for the stdio transport
# used below; confirm).
print(index_documents(collection), file=sys.stderr)
# --- MCP Server ---
# Server instance on which the tool functions below are registered.
mcp = FastMCP("generix-docs")
@mcp.tool()
def list_docs() -> str:
    """List all documents available in the GitHub documentation repo."""
    # Any fetch failure is reported back as text instead of raising.
    try:
        fetched = fetch_docs_from_github()
    except Exception as exc:
        return f"Error fetching from GitHub: {exc}"

    # One filename per line, in the order the fetch returned them.
    names = [entry["filename"] for entry in fetched]
    return "\n".join(names) if names else "No documents found."
@mcp.tool()
def read_doc(filename: str) -> str:
    """Read the full contents of a specific document from the GitHub repo.

    Args:
        filename: The document filename (e.g. 'Basile.txt' or 'subfolder/doc.md')
    """
    # Any fetch failure is reported back as text instead of raising.
    try:
        fetched = fetch_docs_from_github()
    except Exception as exc:
        return f"Error fetching from GitHub: {exc}"

    # First entry whose filename matches exactly, if any.
    match = next((entry for entry in fetched if entry["filename"] == filename), None)
    if match is None:
        return f"Error: Document '{filename}' not found."
    return match["content"]
@mcp.tool()
def search_docs(query: str, n_results: int = 5) -> str:
    """Semantic search across all documents. Returns the most relevant passages.

    Args:
        query: The question or topic to search for
        n_results: Number of results to return (default 5)
    """
    total = collection.count()
    if total == 0:
        return "No documents indexed yet. Try running reindex_docs first."

    # Never ask for more results than there are chunks, and never for fewer
    # than one: a zero or negative count is rejected by the query call.
    n = max(1, min(n_results, total))
    results = collection.query(query_texts=[query], n_results=n)
    if not results["documents"] or not results["documents"][0]:
        return f"No results found for '{query}'."

    # Only one query text is sent, so the hits are in the first inner list.
    output = []
    for doc, meta, distance in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    ):
        # Chunks stored without metadata fall back to an "unknown" source.
        source = (meta or {}).get("source", "unknown")
        score = round(1 - distance, 3)  # Convert distance to similarity
        output.append(f"--- {source} (relevance: {score}) ---\n{doc.strip()}")
    return "\n\n".join(output)
@mcp.tool()
def reindex_docs() -> str:
    """Re-fetch and re-index all documents from GitHub. Run this after adding or removing files."""
    # Delegates to the same routine that runs at start-up and relays its
    # status message unchanged.
    status = index_documents(collection)
    return status
# Start the MCP server over the stdio transport when run as a script.
if __name__ == "__main__":
    mcp.run(transport="stdio")