"""Tests for hybrid search and RRF merge."""
import pytest
from jons_mcp_imessage.search import rrf_merge, hybrid_search, SearchMode
class TestRRFMerge:
    """Tests for the Reciprocal Rank Fusion (RRF) merge algorithm.

    The expected scores in these tests follow the formula the assertions
    encode: a result at 1-based rank ``r`` in a list contributes
    ``1 / (k + r)``, and the contributions from the keyword (FTS) list and
    the vector list are summed.
    """

    # Absolute tolerance used when comparing floating-point RRF scores.
    _SCORE_TOLERANCE = 0.0001

    def test_rrf_merge_basic(self):
        """Test basic RRF merging with overlapping results."""
        k = 60
        fts_results = [
            {"rowid": 1, "text": "hello world", "rank": -2.5},
            {"rowid": 2, "text": "world news", "rank": -1.5},
            {"rowid": 3, "text": "hello there", "rank": -1.0},
        ]
        vec_results = [
            {"rowid": 2, "text": "world news", "similarity": 0.95},
            {"rowid": 4, "text": "greetings", "similarity": 0.85},
            {"rowid": 1, "text": "hello world", "similarity": 0.75},
        ]

        merged = rrf_merge(fts_results, vec_results, k=k)

        # The two lists cover rowids {1, 2, 3} and {1, 2, 4}: 4 unique results.
        assert len(merged) == 4

        # Each rowid must appear exactly once.
        rowids = [r["rowid"] for r in merged]
        assert len(rowids) == len(set(rowids))

        # Results must be sorted by RRF score, descending.
        scores = [r["rrf_score"] for r in merged]
        assert scores == sorted(scores, reverse=True)

        # rowid=2 is in both lists at high ranks, so it must come first.
        top = merged[0]
        assert top["rowid"] == 2
        assert top["keyword_rank"] == 2
        assert top["semantic_rank"] == 1

        # Keyword rank 2 and semantic rank 1: 1/(60+2) + 1/(60+1) ~= 0.0325.
        # Computed from the formula rather than a hand-rounded literal.
        expected_score = 1.0 / (k + 2) + 1.0 / (k + 1)
        assert abs(top["rrf_score"] - expected_score) < self._SCORE_TOLERANCE

    def test_rrf_merge_preserves_metadata(self):
        """Test that RRF merge preserves metadata from original results."""
        fts_results = [
            {
                "rowid": 1,
                "text": "hello world",
                "rank": -2.5,
                "snippet": "hello <b>world</b>",
            },
        ]
        vec_results = [
            {"rowid": 1, "text": "hello world", "similarity": 0.95},
        ]

        merged = rrf_merge(fts_results, vec_results)

        assert len(merged) == 1
        result = merged[0]

        # The snippet only exists on the FTS side and must survive the merge.
        assert result["snippet"] == "hello <b>world</b>"

        # The raw score from each source must be carried over.
        assert result["keyword_score"] == -2.5
        assert result["semantic_score"] == 0.95

        # Per-source rank metadata must be attached.
        assert result["keyword_rank"] == 1
        assert result["semantic_rank"] == 1

    def test_rrf_merge_empty_fts(self):
        """Test RRF merge with empty FTS results."""
        vec_results = [
            {"rowid": 1, "text": "hello", "similarity": 0.95},
            {"rowid": 2, "text": "world", "similarity": 0.85},
        ]

        merged = rrf_merge([], vec_results)

        assert len(merged) == 2
        assert merged[0]["rowid"] == 1  # Higher similarity first
        assert merged[0]["semantic_rank"] == 1
        assert merged[0]["keyword_rank"] is None

    def test_rrf_merge_empty_vector(self):
        """Test RRF merge with empty vector results."""
        fts_results = [
            {"rowid": 1, "text": "hello", "rank": -2.0},
            {"rowid": 2, "text": "world", "rank": -1.0},
        ]

        merged = rrf_merge(fts_results, [])

        assert len(merged) == 2
        assert merged[0]["rowid"] == 1  # First in FTS results
        assert merged[0]["keyword_rank"] == 1
        assert merged[0]["semantic_rank"] is None

    def test_rrf_merge_both_empty(self):
        """Test RRF merge with both result sets empty."""
        merged = rrf_merge([], [])
        assert len(merged) == 0

    def test_rrf_merge_deterministic_ordering(self):
        """Test that results with same RRF score are ordered by rowid."""
        # Within a single list every position has a distinct rank, so a tie
        # needs mirrored positions across the two lists:
        #   rowid=5: keyword rank 1, semantic rank 2 -> 1/(k+1) + 1/(k+2)
        #   rowid=2: keyword rank 2, semantic rank 1 -> 1/(k+2) + 1/(k+1)
        # The two sums are equal for any k.
        fts_results = [
            # Listed most-negative rank first, matching the other keyword
            # fixtures in this class; no assertion here reads these values.
            {"rowid": 5, "text": "a", "rank": -1.5},
            {"rowid": 2, "text": "b", "rank": -1.0},
        ]
        vec_results = [
            {"rowid": 2, "text": "b", "similarity": 0.95},
            {"rowid": 5, "text": "a", "similarity": 0.85},
        ]

        merged = rrf_merge(fts_results, vec_results)

        # Equal scores must fall back to rowid ascending.
        assert merged[0]["rowid"] == 2
        assert merged[1]["rowid"] == 5

        # Confirm the fixture really produced a tie.
        score_gap = abs(merged[0]["rrf_score"] - merged[1]["rrf_score"])
        assert score_gap < self._SCORE_TOLERANCE

    def test_rrf_merge_uses_1_based_ranks(self):
        """Test that RRF uses 1-based ranks (not 0-based)."""
        fts_results = [
            {"rowid": 1, "text": "first", "rank": -1.0},
        ]
        vec_results = []

        merged = rrf_merge(fts_results, vec_results, k=60)

        # With 1-based ranking: 1/(60+1) = 1/61 ~= 0.0164
        # With 0-based ranking: 1/(60+0) = 1/60 ~= 0.0167 (wrong!)
        expected_score = 1.0 / 61.0
        assert abs(merged[0]["rrf_score"] - expected_score) < self._SCORE_TOLERANCE

    def test_rrf_merge_custom_k(self):
        """Test RRF merge with custom k parameter."""
        fts_results = [{"rowid": 1, "text": "test", "rank": -1.0}]
        vec_results = []

        # A single keyword hit at rank 1 with k=10 scores 1/(10+1).
        merged = rrf_merge(fts_results, vec_results, k=10)

        expected_score = 1.0 / 11.0
        assert abs(merged[0]["rrf_score"] - expected_score) < self._SCORE_TOLERANCE
class TestHybridSearch:
    """Tests for hybrid search orchestration.

    The searchers handed to ``hybrid_search`` are small stubs that accept
    ``(query, limit)`` and return canned rows, so each test only checks how
    ``hybrid_search`` selects, merges and paginates those rows.
    """

    @staticmethod
    def _keyword_stub(query, limit):
        """Stub FTS searcher: one keyword hit with rowid 1."""
        return [
            {"rowid": 1, "text": f"result for {query}", "rank": -1.0},
        ]

    @staticmethod
    def _semantic_stub(query, limit):
        """Stub vector searcher: one semantic hit with rowid 2."""
        return [
            {"rowid": 2, "text": f"vec result for {query}", "similarity": 0.9},
        ]

    @staticmethod
    def _twenty_keyword_hits(query, limit):
        """Stub FTS searcher: twenty hits with rowids 1 through 20."""
        return [{"rowid": i, "text": f"result {i}", "rank": -1.0} for i in range(1, 21)]

    def test_hybrid_search_keyword_mode(self):
        """Test hybrid search in keyword-only mode."""
        outcome = hybrid_search(
            "test query",
            mode=SearchMode.KEYWORD,
            limit=10,
            offset=0,
            fts5_searcher=self._keyword_stub,
            vector_searcher=self._semantic_stub,
        )

        assert outcome["mode"] == "keyword"
        hits = outcome["results"]
        # Only the keyword stub's row (rowid 1) should come back.
        assert len(hits) == 1
        assert hits[0]["rowid"] == 1

    def test_hybrid_search_semantic_mode(self):
        """Test hybrid search in semantic-only mode."""
        outcome = hybrid_search(
            "test query",
            mode=SearchMode.SEMANTIC,
            limit=10,
            offset=0,
            fts5_searcher=self._keyword_stub,
            vector_searcher=self._semantic_stub,
        )

        assert outcome["mode"] == "semantic"
        hits = outcome["results"]
        # Only the vector stub's row (rowid 2) should come back.
        assert len(hits) == 1
        assert hits[0]["rowid"] == 2

    def test_hybrid_search_hybrid_mode(self):
        """Test hybrid search in hybrid mode (RRF merge)."""

        def keyword_side(query, limit):
            return [
                {"rowid": 1, "text": "keyword result", "rank": -1.0},
            ]

        def semantic_side(query, limit):
            return [
                {"rowid": 1, "text": "keyword result", "similarity": 0.9},
                {"rowid": 2, "text": "semantic result", "similarity": 0.8},
            ]

        outcome = hybrid_search(
            "test query",
            mode=SearchMode.HYBRID,
            limit=10,
            offset=0,
            fts5_searcher=keyword_side,
            vector_searcher=semantic_side,
        )

        assert outcome["mode"] == "hybrid"
        hits = outcome["results"]
        assert len(hits) == 2

        # rowid=1 is returned by both stubs, so it must rank first.
        best = hits[0]
        assert best["rowid"] == 1
        assert "rrf_score" in best

    def test_hybrid_search_pagination(self):
        """Test hybrid search pagination."""
        outcome = hybrid_search(
            "test",
            mode=SearchMode.KEYWORD,
            limit=5,
            offset=0,
            fts5_searcher=self._twenty_keyword_hits,
        )

        assert len(outcome["results"]) == 5

        # First page of 20 rows: more remain, and the next page starts at 5.
        page = outcome["pagination"]
        assert page["total"] == 20
        assert page["limit"] == 5
        assert page["offset"] == 0
        assert page["has_more"] is True
        assert page["next_offset"] == 5

    def test_hybrid_search_pagination_offset(self):
        """Test hybrid search with offset."""
        outcome = hybrid_search(
            "test",
            mode=SearchMode.KEYWORD,
            limit=5,
            offset=15,
            fts5_searcher=self._twenty_keyword_hits,
        )

        assert len(outcome["results"]) == 5

        # Offset 15 with limit 5 consumes the last of the 20 rows.
        page = outcome["pagination"]
        assert page["offset"] == 15
        assert page["has_more"] is False
        assert page["next_offset"] is None

    def test_hybrid_search_no_searchers(self):
        """Test hybrid search with no searchers provided."""
        outcome = hybrid_search(
            "test",
            mode=SearchMode.HYBRID,
            limit=10,
            offset=0,
        )

        assert len(outcome["results"]) == 0
        assert outcome["pagination"]["total"] == 0