from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List
import json
import importlib
import numpy as np
from sentence_transformers import SentenceTransformer
from .config import get_settings
from .loaders import list_docs, yield_file_texts
from .chunking import chunk_text
META_JSON = "meta.json"
EMB_NPY = "embeddings.npy"
FAISS_IDX = "index.faiss"
@dataclass
class ChunkMeta:
    """Location of one indexed chunk: which file it came from and its position there."""
    path: str  # source document path, stored as str so it JSON-round-trips via __dict__
    chunk_index: int  # 0-based position within chunk_text(...) output for that file
def _embed_texts(
    model: SentenceTransformer, texts: List[str], *, quiet: bool = False
) -> np.ndarray:
    """Encode *texts* into unit-normalized float32 embedding vectors.

    When quiet=True the encoder's progress bar is suppressed so nothing is
    written to stdout (required when serving over MCP stdio).
    """
    vectors = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=not quiet,
    )
    return np.asarray(vectors, dtype=np.float32)
def build_index(*, quiet: bool = False) -> None:
    """Build/rebuild the vector index. When quiet=True, avoid stdout prints (safe for MCP stdio).

    Writes under settings.index_dir:
      - embeddings.npy : float32, unit-normalized chunk embeddings
      - meta.json      : per-chunk source path + chunk index
      - index.faiss    : optional inner-product FAISS index over the embeddings
    """
    s = get_settings()
    if not quiet:
        print(f"[info] indexing folder: {s.data_dir}")
    files = list_docs(s.data_dir)
    if not files:
        if not quiet:
            print("[warn] no supported files found under DATA_DIR")
        return
    model = SentenceTransformer(s.embed_model)
    all_chunks: List[str] = []
    metas: List[ChunkMeta] = []
    for p, text in yield_file_texts(files):
        if not text.strip():
            continue
        chunks = chunk_text(text, s.chunk_size, s.chunk_overlap)
        for i, ch in enumerate(chunks):
            all_chunks.append(ch)
            metas.append(ChunkMeta(path=str(p), chunk_index=i))
    if not all_chunks:
        if not quiet:
            print("[warn] no text extracted from files")
        return
    emb = _embed_texts(model, all_chunks, quiet=quiet)
    # Ensure the output directory exists before writing any artifact
    # (np.save / open("w") do not create intermediate directories).
    index_dir = Path(s.index_dir)
    index_dir.mkdir(parents=True, exist_ok=True)
    np.save(index_dir / EMB_NPY, emb)
    with (index_dir / META_JSON).open("w", encoding="utf-8") as f:
        json.dump([m.__dict__ for m in metas], f, ensure_ascii=False, indent=2)
    # Optional FAISS index (inner product over normalized embeddings == cosine)
    try:
        faiss: Any = importlib.import_module("faiss")
    except Exception:
        faiss = None
    faiss_path = index_dir / FAISS_IDX
    if s.use_faiss and faiss is not None:
        d = emb.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(emb)
        faiss.write_index(index, str(faiss_path))
        if not quiet:
            print(f"[info] faiss index written to {faiss_path}")
    else:
        # Remove any stale FAISS index from a previous build so _load_index
        # can never pair an old index.faiss with freshly written embeddings.
        faiss_path.unlink(missing_ok=True)
        if not quiet:
            print("[info] FAISS disabled or unavailable; will use NumPy similarity.")
    if not quiet:
        print(f"[ok] indexed {len(all_chunks)} chunks from {len(files)} files")
def _load_index() -> tuple[np.ndarray, List[ChunkMeta], bool]:
    """Load embeddings and chunk metadata from disk; report whether FAISS is usable."""
    settings = get_settings()
    index_dir = Path(settings.index_dir)
    emb = np.load(index_dir / EMB_NPY)
    raw_metas = json.loads((index_dir / META_JSON).read_text(encoding="utf-8"))
    metas = [ChunkMeta(**entry) for entry in raw_metas]
    faiss_available = False
    if settings.use_faiss and (index_dir / FAISS_IDX).exists():
        try:
            importlib.import_module("faiss")
        except Exception:
            faiss_available = False
        else:
            faiss_available = True
    return emb, metas, faiss_available
def search(query: str, k: int | None = None) -> List[tuple[ChunkMeta, float, str]]:
    """Return up to *k* (meta, cosine_score, chunk_text) hits for *query*, best first.

    Uses the on-disk FAISS index when available; otherwise falls back to a
    NumPy dot product over the stored (already normalized) embeddings.
    When k is None, settings.top_k is used; k is always capped at the number
    of indexed vectors.
    """
    s = get_settings()
    k = k or s.top_k
    emb, metas, faiss_ok = _load_index()
    n = emb.shape[0]
    if n == 0:
        return []
    # Never ask for more results than we have vectors
    k = min(k, n)
    model = SentenceTransformer(s.embed_model)
    q = model.encode([query], normalize_embeddings=True)
    qv = np.asarray(q, dtype=np.float32)
    if faiss_ok:
        faiss: Any = importlib.import_module("faiss")
        index = faiss.read_index(str(Path(s.index_dir, FAISS_IDX)))
        scores, idxs = index.search(
            qv, k
        )  # inner product == cosine for normalized embeddings
        idxs0 = idxs[0].tolist()
        scores0 = scores[0].tolist()
        # Filter FAISS padding: idx == -1 or sentinel distances
        pairs = [
            (int(i), float(sv))
            for i, sv in zip(idxs0, scores0)
            if int(i) != -1 and np.isfinite(sv) and sv > -1e38
        ]
    else:
        # Cosine with normalized embeddings via dot product.
        # ravel() (not squeeze()) so a single-chunk index still yields a 1-D
        # array; squeeze() collapses shape (1, 1) to 0-d and argsort raises.
        sims = (emb @ qv.T).ravel()  # shape (N,)
        order = np.argsort(sims)[::-1][:k]
        pairs = [(int(i), float(sims[int(i)])) for i in order]
    # Recompute chunk text for display (keeps index lightweight)
    results: List[tuple[ChunkMeta, float, str]] = []
    from .loaders import read_doc  # local import to avoid cycles at module import time
    from .chunking import chunk_text as _chunk
    # Cache chunking per file so a document hit by several results is only
    # read and re-chunked once.
    chunks_by_path: dict[str, List[str]] = {}
    for i, score in pairs:
        meta = metas[i]
        if meta.path not in chunks_by_path:
            txt = read_doc(Path(meta.path))
            chunks_by_path[meta.path] = _chunk(txt, s.chunk_size, s.chunk_overlap)
        chunks = chunks_by_path[meta.path]
        # Guard against the file having shrunk since indexing.
        chunk_text_value = (
            chunks[meta.chunk_index] if meta.chunk_index < len(chunks) else ""
        )
        results.append((meta, score, chunk_text_value))
    return results