"""Tests for BM25 indexer."""
import pytest
from mcp_server_builder.utils.indexer import (
Doc,
IndexSearch,
_enhanced_tokenize,
_generate_ngrams,
)
class TestTokenization:
    """Tests for the ``_enhanced_tokenize`` helper.

    Each test materializes the tokenizer output with ``list()`` once, so that
    repeated membership checks behave the same whether the helper returns a
    list, a set, or a one-shot iterator.
    """

    def test_basic_tokenization(self) -> None:
        """Plain lowercase words appear in the token output."""
        tokens = list(_enhanced_tokenize("hello world"))
        assert "hello" in tokens
        assert "world" in tokens

    def test_stopword_removal(self) -> None:
        """Stop words are dropped while content words are kept."""
        tokens = list(_enhanced_tokenize("the quick brown fox"))
        assert "the" not in tokens
        # Content words may be stemmed, so match them as substrings of the
        # emitted tokens instead of requiring an exact token.
        for word in ("quick", "brown", "fox"):
            assert any(word in token for token in tokens), (
                f"{word!r} missing from {tokens!r}"
            )

    def test_camelcase_splitting(self) -> None:
        """CamelCase input yields tokens for its component words."""
        # FastMCP is expected to come out as the single token "fastmcp"
        # (a preserved term) rather than being split.
        assert "fastmcp" in list(_enhanced_tokenize("FastMCP"))

        # A non-preserved CamelCase term: accepted whether it is split into
        # "agent"/"core" or kept whole and stemmed to "agentcor" -- both
        # outcomes produce a token containing "agent".
        tokens = list(_enhanced_tokenize("AgentCore"))
        assert any("agent" in token for token in tokens)

    def test_preserve_terms(self) -> None:
        """Domain-specific terms are emitted verbatim (unstemmed)."""
        tokens = list(_enhanced_tokenize("mcp json rpc stdio"))
        for term in ("mcp", "json", "rpc", "stdio"):
            assert term in tokens, f"{term!r} missing from {tokens!r}"

    def test_stemming(self) -> None:
        """Inflected words are present, stemmed or unchanged."""
        tokens = list(_enhanced_tokenize("running tools transports"))
        # Expected stems: "running" -> "run", "tools" -> "tool",
        # "transports" -> "transport". The unstemmed form is accepted too so
        # the test does not pin a particular stemmer implementation.
        assert "run" in tokens or "running" in tokens
        assert "tool" in tokens or "tools" in tokens
        assert "transport" in tokens or "transports" in tokens
class TestNgrams:
    """Exercise ``_generate_ngrams`` for bigrams, trigrams and short input."""

    def test_bigrams(self) -> None:
        """Three tokens yield exactly two underscore-joined bigrams."""
        result = _generate_ngrams(["hello", "world", "test"], 2)
        for expected in ("hello_world", "world_test"):
            assert expected in result
        assert len(result) == 2

    def test_trigrams(self) -> None:
        """Four tokens yield exactly two underscore-joined trigrams."""
        result = _generate_ngrams(["a", "b", "c", "d"], 3)
        for expected in ("a_b_c", "b_c_d"):
            assert expected in result
        assert len(result) == 2

    def test_empty_ngrams(self) -> None:
        """Fewer tokens than ``n`` produce an empty list."""
        assert _generate_ngrams(["single"], 2) == []
class TestIndexSearch:
    """Tests for the ``IndexSearch`` class."""

    @pytest.fixture
    def sample_index(self) -> IndexSearch:
        """Return an index holding three documents: tools, resources, prompts.

        Each document uses the same string for ``display_title`` and
        ``index_title``.
        """
        pages = (
            (
                "https://example.com/tools",
                "MCP Tools Guide",
                "Tools are the primary way for MCP servers to expose functionality.",
            ),
            (
                "https://example.com/resources",
                "Resources Documentation",
                "Resources provide read-only data access to clients.",
            ),
            (
                "https://example.com/prompts",
                "Prompts Reference",
                "Prompts are user-controlled templates for common operations.",
            ),
        )
        index = IndexSearch()
        for uri, title, content in pages:
            index.add(
                Doc(
                    uri=uri,
                    display_title=title,
                    content=content,
                    index_title=title,
                )
            )
        return index

    def test_add_document(self) -> None:
        """``add`` stores the document and returns the index itself."""
        index = IndexSearch()
        doc = Doc(
            uri="https://test.com",
            display_title="Test",
            content="Test content",
            index_title="Test",
        )
        result = index.add(doc)
        assert result is index  # Returns self for chaining
        assert len(index.docs) == 1
        assert index.docs[0] == doc

    def test_search_returns_results(self, sample_index: IndexSearch) -> None:
        """A query about tools ranks the tools document first."""
        results = sample_index.search("tools functionality")
        assert len(results) > 0
        # Each result is a (score, doc) pair; only the document matters here.
        _score, doc = results[0]
        assert "tools" in doc.uri.lower() or "tool" in doc.display_title.lower()

    def test_search_empty_query(self, sample_index: IndexSearch) -> None:
        """An empty query is handled without raising and returns a list."""
        results = sample_index.search("")
        # Should return empty or handle gracefully
        assert isinstance(results, list)

    def test_search_no_matches(self, sample_index: IndexSearch) -> None:
        """A query with no matching terms returns an empty list."""
        results = sample_index.search("xyznonexistent123")
        assert results == []

    def test_search_respects_k(self, sample_index: IndexSearch) -> None:
        """``k`` caps the number of returned results."""
        results = sample_index.search("documentation", k=1)
        assert len(results) <= 1

        # "documentation" occurs in only one fixture document, so the check
        # above passes even if ``k`` is ignored. Use a query with one term
        # per fixture document so that truncation is actually exercised.
        query = "tools resources prompts"
        unrestricted = sample_index.search(query, k=len(sample_index.docs))
        assert len(unrestricted) > 1  # precondition: several documents match
        limited = sample_index.search(query, k=1)
        assert len(limited) == 1

    def test_empty_index_search(self) -> None:
        """Searching an index with no documents returns an empty list."""
        index = IndexSearch()
        results = index.search("anything")
        assert results == []

    def test_doc_lengths_tracked(self, sample_index: IndexSearch) -> None:
        """One length is recorded per document and the average is positive."""
        assert len(sample_index.doc_lengths) == 3
        assert sample_index.avg_doc_length > 0