MCP Generix

server.py•7.26 KiB

"""MCP server with semantic search over documents fetched from GitHub.

Uses ChromaDB for vector storage and OpenAI embeddings for semantic search.
Documents are fetched from the GitHub repo and indexed on startup.
"""

import base64
import hashlib
import json
import os
import sys
import urllib.parse
import urllib.request

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from mcp.server.fastmcp import FastMCP

GITHUB_REPO = "itsphily/mcp_generix"
GITHUB_BRANCH = "main"
DOCS_PATH = "docs"
CHROMA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".chroma")
CHUNK_SIZE = 1000  # characters per chunk
CHUNK_OVERLAP = 200
GITHUB_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the server


def github_api(endpoint: str) -> dict | list:
    """Fetch from GitHub API.

    Args:
        endpoint: Path (plus optional query string) relative to
            ``https://api.github.com/``. The caller is responsible for
            percent-encoding any path segments.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On network failure, timeout, or HTTP error status.
    """
    url = f"https://api.github.com/{endpoint}"
    req = urllib.request.Request(url, headers={"Accept": "application/vnd.github.v3+json"})
    with urllib.request.urlopen(req, timeout=GITHUB_TIMEOUT) as resp:
        return json.loads(resp.read().decode())


def _list_doc_paths() -> list[str]:
    """Return the repo-relative path of every document blob under DOCS_PATH."""
    tree = github_api(f"repos/{GITHUB_REPO}/git/trees/{GITHUB_BRANCH}?recursive=1")
    prefix = f"{DOCS_PATH}/"
    return [
        item["path"]
        for item in tree.get("tree", [])
        if item["type"] == "blob"
        and item["path"].startswith(prefix)
        and not item["path"].endswith(".gitkeep")
    ]


def _fetch_file_content(path: str) -> str:
    """Download one file (repo-relative *path*) and return it as text."""
    # Percent-encode the path: names containing spaces, '#', '?' etc. would
    # otherwise produce an invalid URL or address the wrong resource.
    quoted = urllib.parse.quote(path)
    blob = github_api(f"repos/{GITHUB_REPO}/contents/{quoted}?ref={GITHUB_BRANCH}")
    if blob.get("encoding") == "base64":
        # errors="replace" so a single non-UTF-8 file cannot abort the fetch
        # of every other document.
        return base64.b64decode(blob["content"]).decode("utf-8", errors="replace")
    return blob.get("content") or ""


def fetch_docs_from_github() -> list[dict]:
    """Fetch all documents from the GitHub repo's docs/ folder.

    Returns list of {filename, content} dicts, where ``filename`` is the path
    relative to the docs/ folder.
    """
    prefix_len = len(DOCS_PATH) + 1  # strip "<DOCS_PATH>/"
    return [
        {"filename": path[prefix_len:], "content": _fetch_file_content(path)}
        for path in _list_doc_paths()
    ]


def chunk_text(text: str, filename: str) -> list[dict]:
    """Split text into overlapping chunks for better retrieval.

    Args:
        text: Full document text.
        filename: Source name, used in chunk ids and metadata.

    Returns:
        A list of ``{"id", "text", "metadata"}`` dicts. Whitespace-only
        chunks are skipped and do not consume a chunk index.
    """
    # Guard against a misconfigured overlap >= size, which would never advance.
    step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
    chunks = []
    chunk_index = 0
    start = 0
    while start < len(text):
        end = start + CHUNK_SIZE
        chunk = text[start:end]
        if chunk.strip():
            chunks.append({
                "id": f"{filename}::chunk{chunk_index}",
                "text": chunk,
                "metadata": {"source": filename, "chunk_index": chunk_index},
            })
            chunk_index += 1
        if end >= len(text):
            # This chunk already reaches the end of the text; stepping again
            # would only emit a tail fully contained in it (a duplicate hit).
            break
        start += step
    return chunks


def content_hash(content: str) -> str:
    """Get hash of content string for change detection."""
    # MD5 is used only as a change fingerprint, not for security; the flag
    # keeps it usable on FIPS-restricted Python builds. Digest is unchanged.
    return hashlib.md5(content.encode("utf-8"), usedforsecurity=False).hexdigest()


def index_documents(collection: chromadb.Collection) -> str:
    """Fetch docs from GitHub and index into ChromaDB.

    New files are added, files whose content hash changed are re-chunked, and
    files no longer present in the repo are removed from the collection.

    Returns status message.
    """
    try:
        doc_files = fetch_docs_from_github()
    except Exception as e:
        return f"Error fetching from GitHub: {e}"

    # Group what is already stored by source file. The stored hash is read
    # from this single bulk fetch (first chunk seen per source) instead of
    # issuing one extra collection.get() per file.
    existing = collection.get(include=["metadatas"])
    ids_by_source: dict[str, list[str]] = {}
    hash_by_source: dict[str, str | None] = {}
    for chunk_id, meta in zip(existing["ids"], existing["metadatas"] or []):
        meta = meta or {}
        source = meta.get("source", "")
        ids_by_source.setdefault(source, []).append(chunk_id)
        hash_by_source.setdefault(source, meta.get("hash"))

    current_sources = set()
    added = 0
    updated = 0

    for doc in doc_files:
        filename = doc["filename"]
        content = doc["content"]
        current_sources.add(filename)
        c_hash = content_hash(content)

        old_ids = ids_by_source.get(filename)
        if old_ids:
            if hash_by_source.get(filename) == c_hash:
                continue  # File unchanged, skip
            # File changed — remove old chunks
            collection.delete(ids=old_ids)
            updated += 1
        else:
            added += 1

        # Chunk and add document
        chunks = chunk_text(content, filename)
        if chunks:
            collection.add(
                ids=[c["id"] for c in chunks],
                documents=[c["text"] for c in chunks],
                metadatas=[{**c["metadata"], "hash": c_hash} for c in chunks],
            )

    # Remove deleted files
    removed = 0
    for source in ids_by_source.keys() - current_sources:
        collection.delete(ids=ids_by_source[source])
        removed += 1

    return f"Indexed: {added} added, {updated} updated, {removed} removed. Total files: {len(doc_files)}."


# --- Initialize ChromaDB and embedding function ---
embedding_fn = OpenAIEmbeddingFunction(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    model_name="text-embedding-3-small",
)
client = chromadb.PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(
    name="docs",
    embedding_function=embedding_fn,
)

# Index documents on startup
print(index_documents(collection), file=sys.stderr)

# --- MCP Server ---
mcp = FastMCP("generix-docs")


@mcp.tool()
def list_docs() -> str:
    """List all documents available in the GitHub documentation repo."""
    # Only the tree listing is needed for names; avoid downloading every
    # file's content (one rate-limited API call per file).
    try:
        paths = _list_doc_paths()
    except Exception as e:
        return f"Error fetching from GitHub: {e}"
    if not paths:
        return "No documents found."
    prefix_len = len(DOCS_PATH) + 1
    return "\n".join(path[prefix_len:] for path in paths)


@mcp.tool()
def read_doc(filename: str) -> str:
    """Read the full contents of a specific document from the GitHub repo.

    Args:
        filename: The document filename (e.g. 'Basile.txt' or 'subfolder/doc.md')
    """
    path = f"{DOCS_PATH}/{filename}"
    try:
        # Checking against the listing keeps the original "exact match under
        # docs/" semantics (so e.g. '../x' is never fetched) while fetching
        # only the one requested file.
        if path not in _list_doc_paths():
            return f"Error: Document '{filename}' not found."
        return _fetch_file_content(path)
    except Exception as e:
        return f"Error fetching from GitHub: {e}"


@mcp.tool()
def search_docs(query: str, n_results: int = 5) -> str:
    """Semantic search across all documents. Returns the most relevant passages.

    Args:
        query: The question or topic to search for
        n_results: Number of results to return (default 5)
    """
    total = collection.count()
    if total == 0:
        return "No documents indexed yet. Try running reindex_docs first."

    # Clamp to [1, total]: a zero/negative request would be rejected by the
    # query, and asking for more than is stored is pointless.
    n = max(1, min(n_results, total))
    results = collection.query(query_texts=[query], n_results=n)

    if not results["documents"] or not results["documents"][0]:
        return f"No results found for '{query}'."

    output = []
    for doc, meta, distance in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    ):
        source = (meta or {}).get("source", "unknown")
        # NOTE(review): "1 - distance" is a cosine similarity only if the
        # collection uses the cosine space; this collection is created with
        # default settings, so confirm which metric the distances are in.
        score = round(1 - distance, 3)  # Convert distance to similarity
        output.append(f"--- {source} (relevance: {score}) ---\n{doc.strip()}")

    return "\n\n".join(output)


@mcp.tool()
def reindex_docs() -> str:
    """Re-fetch and re-index all documents from GitHub. Run this after adding or removing files."""
    return index_documents(collection)


if __name__ == "__main__":
    mcp.run(transport="stdio")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/itsphily/mcp_generix'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

server.py•7.26 KiB