from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List
import json
import importlib
import numpy as np
from sentence_transformers import SentenceTransformer
from .config import get_settings
from .loaders import list_docs, yield_file_texts
from .chunking import chunk_text
META_JSON = "meta.json"
EMB_NPY = "embeddings.npy"
FAISS_IDX = "index.faiss"
@dataclass
class ChunkMeta:
    """Location of one indexed chunk: which file it came from and its position there."""
    path: str  # source document path, stored as str so it JSON-round-trips via __dict__
    chunk_index: int  # 0-based position within chunk_text(...) output for that file
def _embed_texts(
    model: SentenceTransformer, texts: List[str], *, quiet: bool = False
) -> np.ndarray:
    """Encode *texts* into unit-normalized float32 embedding vectors.

    When quiet=True the encoder's progress bar is suppressed so nothing is
    written to stdout (required when serving over MCP stdio).
    """
    vectors = model.encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=not quiet,
    )
    return np.asarray(vectors, dtype=np.float32)
def build_index(*, quiet: bool = False) -> None:
    """Build/rebuild the vector index. When quiet=True, avoid stdout prints (safe for MCP stdio).

    Writes under settings.index_dir:
      - embeddings.npy : float32, unit-normalized chunk embeddings
      - meta.json      : per-chunk source path + chunk index
      - index.faiss    : optional inner-product FAISS index over the embeddings
    """
    s = get_settings()
    if not quiet:
        print(f"[info] indexing folder: {s.data_dir}")
    files = list_docs(s.data_dir)
    if not files:
        if not quiet:
            print("[warn] no supported files found under DATA_DIR")
        return
    model = SentenceTransformer(s.embed_model)
    all_chunks: List[str] = []
    metas: List[ChunkMeta] = []
    for p, text in yield_file_texts(files):
        if not text.strip():
            continue
        chunks = chunk_text(text, s.chunk_size, s.chunk_overlap)
        for i, ch in enumerate(chunks):
            all_chunks.append(ch)
            metas.append(ChunkMeta(path=str(p), chunk_index=i))
    if not all_chunks:
        if not quiet:
            print("[warn] no text extracted from files")
        return
    emb = _embed_texts(model, all_chunks, quiet=quiet)
    # Ensure the output directory exists before writing any artifact
    # (np.save / open("w") do not create intermediate directories).
    index_dir = Path(s.index_dir)
    index_dir.mkdir(parents=True, exist_ok=True)
    np.save(index_dir / EMB_NPY, emb)
    with (index_dir / META_JSON).open("w", encoding="utf-8") as f:
        json.dump([m.__dict__ for m in metas], f, ensure_ascii=False, indent=2)
    # Optional FAISS index (inner product over normalized embeddings == cosine)
    try:
        faiss: Any = importlib.import_module("faiss")
    except Exception:
        faiss = None
    faiss_path = index_dir / FAISS_IDX
    if s.use_faiss and faiss is not None:
        d = emb.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(emb)
        faiss.write_index(index, str(faiss_path))
        if not quiet:
            print(f"[info] faiss index written to {faiss_path}")
    else:
        # Remove any stale FAISS index from a previous build so _load_index
        # can never pair an old index.faiss with freshly written embeddings.
        faiss_path.unlink(missing_ok=True)
        if not quiet:
            print("[info] FAISS disabled or unavailable; will use NumPy similarity.")
    if not quiet:
        print(f"[ok] indexed {len(all_chunks)} chunks from {len(files)} files")
def _load_index() -> tuple[np.ndarray, List[ChunkMeta], bool]:
    """Load embeddings and chunk metadata from disk; report whether FAISS is usable."""
    settings = get_settings()
    index_dir = Path(settings.index_dir)
    emb = np.load(index_dir / EMB_NPY)
    raw_metas = json.loads((index_dir / META_JSON).read_text(encoding="utf-8"))
    metas = [ChunkMeta(**entry) for entry in raw_metas]
    faiss_available = False
    if settings.use_faiss and (index_dir / FAISS_IDX).exists():
        try:
            importlib.import_module("faiss")
        except Exception:
            faiss_available = False
        else:
            faiss_available = True
    return emb, metas, faiss_available
def search(query: str, k: int | None = None) -> List[tuple[ChunkMeta, float, str]]:
    """Return up to *k* (meta, cosine_score, chunk_text) hits for *query*, best first.

    Uses the on-disk FAISS index when available; otherwise falls back to a
    NumPy dot product over the stored (already normalized) embeddings.
    When k is None, settings.top_k is used; k is always capped at the number
    of indexed vectors.
    """
    s = get_settings()
    k = k or s.top_k
    emb, metas, faiss_ok = _load_index()
    n = emb.shape[0]
    if n == 0:
        return []
    # Never ask for more results than we have vectors
    k = min(k, n)
    model = SentenceTransformer(s.embed_model)
    q = model.encode([query], normalize_embeddings=True)
    qv = np.asarray(q, dtype=np.float32)
    if faiss_ok:
        faiss: Any = importlib.import_module("faiss")
        index = faiss.read_index(str(Path(s.index_dir, FAISS_IDX)))
        scores, idxs = index.search(
            qv, k
        )  # inner product == cosine for normalized embeddings
        idxs0 = idxs[0].tolist()
        scores0 = scores[0].tolist()
        # Filter FAISS padding: idx == -1 or sentinel distances
        pairs = [
            (int(i), float(sv))
            for i, sv in zip(idxs0, scores0)
            if int(i) != -1 and np.isfinite(sv) and sv > -1e38
        ]
    else:
        # Cosine with normalized embeddings via dot product.
        # ravel() (not squeeze()) so a single-chunk index still yields a 1-D
        # array; squeeze() collapses shape (1, 1) to 0-d and argsort raises.
        sims = (emb @ qv.T).ravel()  # shape (N,)
        order = np.argsort(sims)[::-1][:k]
        pairs = [(int(i), float(sims[int(i)])) for i in order]
    # Recompute chunk text for display (keeps index lightweight)
    results: List[tuple[ChunkMeta, float, str]] = []
    from .loaders import read_doc  # local import to avoid cycles at module import time
    from .chunking import chunk_text as _chunk
    # Cache chunking per file so a document hit by several results is only
    # read and re-chunked once.
    chunks_by_path: dict[str, List[str]] = {}
    for i, score in pairs:
        meta = metas[i]
        if meta.path not in chunks_by_path:
            txt = read_doc(Path(meta.path))
            chunks_by_path[meta.path] = _chunk(txt, s.chunk_size, s.chunk_overlap)
        chunks = chunks_by_path[meta.path]
        # Guard against the file having shrunk since indexing.
        chunk_text_value = (
            chunks[meta.chunk_index] if meta.chunk_index < len(chunks) else ""
        )
        results.append((meta, score, chunk_text_value))
    return results