gnosis-mcp

test_search_quality.py•11.7 KiB

"""Search quality eval harness — measures Precision@K, MRR, and Hit Rate. Uses query-answer pairs from cases.json to score retrieval quality. Runs against a real SQLite backend populated with test documents. """ from __future__ import annotations import asyncio import json import logging from dataclasses import dataclass, field from pathlib import Path import pytest from gnosis_mcp.config import GnosisMcpConfig from gnosis_mcp.ingest import chunk_by_headings log = logging.getLogger("gnosis_mcp.eval") CASES_PATH = Path(__file__).parent / "cases.json" K = 5 # Precision@K, Hit Rate@K @dataclass class EvalResult: """Result of evaluating a single query.""" query: str expected_paths: list[str] returned_paths: list[str] precision_at_k: float reciprocal_rank: float hit: bool description: str = "" @dataclass class EvalSummary: """Aggregate metrics across all eval cases.""" results: list[EvalResult] = field(default_factory=list) @property def mean_precision(self) -> float: if not self.results: return 0.0 return sum(r.precision_at_k for r in self.results) / len(self.results) @property def mrr(self) -> float: if not self.results: return 0.0 return sum(r.reciprocal_rank for r in self.results) / len(self.results) @property def hit_rate(self) -> float: if not self.results: return 0.0 return sum(1 for r in self.results if r.hit) / len(self.results) def _is_relevant(returned_path: str, expected_patterns: list[str]) -> bool: """Check if a returned path matches any expected pattern (substring match).""" for pattern in expected_patterns: if pattern.lower() in returned_path.lower(): return True return False def _score_query( query: str, expected_paths: list[str], returned_paths: list[str], description: str = "", ) -> EvalResult: """Score a single query result.""" relevant_count = 0 first_relevant_rank = 0 for i, path in enumerate(returned_paths[:K], 1): if _is_relevant(path, expected_paths): relevant_count += 1 if first_relevant_rank == 0: first_relevant_rank = i result_count = min(K, 
len(returned_paths)) if returned_paths else 1 precision = relevant_count / result_count rr = 1.0 / first_relevant_rank if first_relevant_rank > 0 else 0.0 hit = first_relevant_rank > 0 return EvalResult( query=query, expected_paths=expected_paths, returned_paths=returned_paths[:K], precision_at_k=precision, reciprocal_rank=rr, hit=hit, description=description, ) # --------------------------------------------------------------------------- # Test fixture: populate a SQLite DB with sample docs # --------------------------------------------------------------------------- SAMPLE_DOCS = [ { "path": "guides/quickstart.md", "content": "# Quick Start Guide\n\nHow to install and set up gnosis-mcp.\n\n## Installation\n\npip install gnosis-mcp\n\n## Configuration\n\nSet environment variables for your backend.", "title": "Quick Start Guide", "category": "guides", }, { "path": "guides/configuration.md", "content": "# Configuration\n\nAll settings via environment variables.\n\n## Database Config\n\nGNOSIS_MCP_DATABASE_URL for PostgreSQL.\n\n## Webhook Config\n\nGNOSIS_MCP_WEBHOOK_URL to receive notifications on doc changes.", "title": "Configuration", "category": "guides", }, { "path": "architecture/backend.md", "content": "# Backend Architecture\n\nTwo backends: SQLite (default) and PostgreSQL.\n\n## PostgreSQL Backend\n\nUses asyncpg with tsvector for full-text search.\n\n## SQLite Backend\n\nUses aiosqlite with FTS5 and porter stemmer.", "title": "Backend Architecture", "category": "architecture", }, { "path": "guides/search.md", "content": "# Search Documentation\n\nSearch your docs with keyword or hybrid search.\n\n## Keyword Search\n\nFTS5 on SQLite, tsvector on PostgreSQL.\n\n## Hybrid Search\n\nCombine keyword + semantic with RRF fusion.", "title": "Search Documentation", "category": "guides", }, { "path": "guides/webhooks.md", "content": "# Webhook Notifications\n\nReceive HTTP POST notifications when documents change.\n\n## Setup\n\nSet GNOSIS_MCP_WEBHOOK_URL. 
Fire-and-forget POST on upsert/delete.\n\n## Payload\n\nJSON with action, path, timestamp fields.", "title": "Webhook Notifications", "category": "guides", }, ] SAMPLE_GIT_HISTORY_DOCS = [ { "path": "git-history/src/auth.py", "content": ( "# Git History: src/auth.py\n\n" "## 2026-02-18\n\n" "Author: Alice <alice@example.com>\n\n" "Refactored authentication module to use JWT tokens\n\n" "Files: src/auth.py\n\n" "## 2026-02-15\n\n" "Author: Bob <bob@example.com>\n\n" "Initial authentication with session cookies\n\n" "Files: src/auth.py" ), "title": "Git history for src/auth.py", "category": "git-history", }, { "path": "git-history/src/db.py", "content": ( "# Git History: src/db.py\n\n" "## 2026-02-20\n\n" "Author: Alice <alice@example.com>\n\n" "Fixed database connection pool exhaustion under high load\n\n" "Files: src/db.py\n\n" "## 2026-02-10\n\n" "Author: Charlie <charlie@example.com>\n\n" "Added connection pooling with asyncpg\n\n" "Files: src/db.py" ), "title": "Git history for src/db.py", "category": "git-history", }, { "path": "git-history/tests/test_search.py", "content": ( "# Git History: tests/test_search.py\n\n" "## 2026-02-19\n\n" "Author: Bob <bob@example.com>\n\n" "Added unit tests for search ranking and hybrid mode\n\n" "Files: tests/test_search.py" ), "title": "Git history for tests/test_search.py", "category": "git-history", }, { "path": "git-history/pyproject.toml", "content": ( "# Git History: pyproject.toml\n\n" "## 2026-02-21\n\n" "Author: Alice <alice@example.com>\n\n" "Bumped version to 2.0 for major release\n\n" "Files: pyproject.toml\n\n" "## 2026-02-14\n\n" "Author: Alice <alice@example.com>\n\n" "Initial project setup with hatchling build system\n\n" "Files: pyproject.toml" ), "title": "Git history for pyproject.toml", "category": "git-history", }, ] async def _build_eval_db(tmp_path: Path) -> "SqliteBackend": """Create a populated SQLite backend for eval testing.""" from gnosis_mcp.sqlite_backend import SqliteBackend db_path = tmp_path / 
"eval.db" cfg = GnosisMcpConfig(database_url=str(db_path)) backend = SqliteBackend(cfg) await backend.startup() await backend.init_schema() for doc in SAMPLE_DOCS + SAMPLE_GIT_HISTORY_DOCS: chunks = chunk_by_headings(doc["content"], doc["path"]) await backend.ingest_file( doc["path"], chunks, title=doc["title"], category=doc["category"], audience="all", tags=None, content_hash=None, has_tags_col=True, has_hash_col=True, ) return backend def _load_cases() -> list[dict]: """Load eval cases from JSON file.""" return json.loads(CASES_PATH.read_text()) # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- class TestEvalHarness: """Search quality evaluation tests.""" @pytest.fixture(autouse=True) def setup_backend(self, tmp_path): self.backend = asyncio.run(_build_eval_db(tmp_path)) yield asyncio.run(self.backend.shutdown()) def _search(self, query: str, category: str | None = None) -> list[str]: results = asyncio.run( self.backend.search(query, category=category, limit=K) ) return [r["file_path"] for r in results] def test_eval_cases(self): """Run all eval cases and report aggregate metrics.""" cases = _load_cases() summary = EvalSummary() for case in cases: returned = self._search(case["query"], case.get("category")) result = _score_query( query=case["query"], expected_paths=case["expected_paths"], returned_paths=returned, description=case.get("description", ""), ) summary.results.append(result) # Report print(f"\n{'='*60}") print(f"Search Quality Eval — {len(summary.results)} cases") print(f"{'='*60}") for r in summary.results: status = "PASS" if r.hit else "MISS" print( f" [{status}] {r.query!r:40s} " f"P@{K}={r.precision_at_k:.2f} RR={r.reciprocal_rank:.2f} " f"got={r.returned_paths}" ) print(f"{'='*60}") print(f" Mean Precision@{K}: {summary.mean_precision:.3f}") print(f" MRR: {summary.mrr:.3f}") print(f" Hit Rate@{K}: {summary.hit_rate:.3f}") 
print(f"{'='*60}") # Thresholds — adjust as search improves assert summary.hit_rate >= 0.6, ( f"Hit rate {summary.hit_rate:.2f} below threshold 0.60" ) def test_individual_search_docs(self): """Each sample doc should be findable by a distinctive keyword.""" # Use distinctive terms rather than generic titles (FTS5 needs matching tokens) queries = { "guides/quickstart.md": "quickstart install", "guides/configuration.md": "GNOSIS_MCP_DATABASE_URL", "architecture/backend.md": "backend architecture", "guides/search.md": "keyword hybrid search", "guides/webhooks.md": "webhook notification", } for doc in SAMPLE_DOCS: query = queries[doc["path"]] results = self._search(query) assert doc["path"] in results, ( f"Expected {doc['path']} in results for query {query!r}, " f"got {results}" ) def test_category_filter(self): """Category filter should only return docs from that category.""" results = asyncio.run( self.backend.search("search", category="architecture", limit=K) ) for r in results: assert r["category"] == "architecture" def test_precision_helper(self): """Test the _score_query helper directly.""" result = _score_query( query="test", expected_paths=["guides/"], returned_paths=[ "guides/quickstart.md", "architecture/backend.md", "guides/search.md", ], ) assert result.precision_at_k == pytest.approx(2 / 3) assert result.reciprocal_rank == pytest.approx(1.0) assert result.hit is True def test_miss_scoring(self): """Test scoring when nothing matches.""" result = _score_query( query="test", expected_paths=["nonexistent/"], returned_paths=["guides/quickstart.md"], ) assert result.precision_at_k == 0.0 assert result.reciprocal_rank == 0.0 assert result.hit is False

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nicholasglazer/gnosis-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_search_quality.py•11.7 KiB