"""Hybrid search implementation using Reciprocal Rank Fusion (RRF).
This module implements hybrid search by combining results from:
- FTS5 full-text search (keyword matching)
- Vector similarity search (semantic matching)
The RRF algorithm merges rankings from different search methods into a unified score.
"""
from collections.abc import Callable
from enum import Enum
from typing import Any
class SearchMode(str, Enum):
"""Search mode selection."""
HYBRID = "hybrid"
KEYWORD = "keyword"
SEMANTIC = "semantic"
def rrf_merge(
fts5_results: list[dict[str, Any]],
vector_results: list[dict[str, Any]],
k: int = 60,
) -> list[dict[str, Any]]:
"""Merge FTS5 and vector search results using Reciprocal Rank Fusion.
RRF formula: RRF_score = sum(1 / (k + rank))
where rank is 1-based (first result has rank=1).
The merged results are sorted by (RRF_score DESC, rowid ASC) for
deterministic ordering, which is critical for pagination consistency.
Args:
fts5_results: Results from FTS5 full-text search, ordered by relevance.
Each result must have a 'rowid' field.
vector_results: Results from vector similarity search, ordered by similarity.
Each result must have a 'rowid' field.
k: RRF constant (default: 60, which is standard in literature).
Returns:
Merged results sorted by RRF score (descending), then rowid (ascending).
Each result includes:
- All original fields from the source result
- rrf_score: The computed RRF score
- keyword_rank: 1-based rank in keyword results (None if not found)
- semantic_rank: 1-based rank in vector results (None if not found)
- keyword_score: Original keyword relevance score if available
- semantic_score: Original similarity score if available
Example:
>>> fts_results = [
... {"rowid": 1, "text": "hello", "rank": -2.5},
... {"rowid": 2, "text": "world", "rank": -1.5},
... ]
>>> vec_results = [
... {"rowid": 2, "text": "world", "similarity": 0.95},
... {"rowid": 3, "text": "foo", "similarity": 0.85},
... ]
>>> merged = rrf_merge(fts_results, vec_results, k=60)
>>> # rowid 2 appears in both, gets highest RRF score
>>> # Results sorted by RRF score DESC, rowid ASC
"""
# Build rank maps: rowid -> 1-based rank
# CRITICAL: Use 1-based ranking (not 0-based) for correct RRF scoring
keyword_ranks = {
result["rowid"]: rank + 1 # rank + 1 converts 0-based index to 1-based rank
for rank, result in enumerate(fts5_results)
}
semantic_ranks = {
result["rowid"]: rank + 1 # rank + 1 converts 0-based index to 1-based rank
for rank, result in enumerate(vector_results)
}
# Build maps for original results to preserve metadata
keyword_results_map = {result["rowid"]: result for result in fts5_results}
semantic_results_map = {result["rowid"]: result for result in vector_results}
# Get all unique rowids from both result sets
all_rowids = set(keyword_ranks.keys()) | set(semantic_ranks.keys())
# Compute RRF scores for all results
merged_results = []
for rowid in all_rowids:
# Calculate RRF score as sum of reciprocal ranks
rrf_score = 0.0
keyword_rank = keyword_ranks.get(rowid)
if keyword_rank is not None:
rrf_score += 1.0 / (k + keyword_rank)
semantic_rank = semantic_ranks.get(rowid)
if semantic_rank is not None:
rrf_score += 1.0 / (k + semantic_rank)
# Get the original result data (prefer keyword, fallback to semantic)
original_result = keyword_results_map.get(
rowid, semantic_results_map.get(rowid, {})
)
# Create merged result with all original fields plus RRF metadata
merged_result = {
**original_result, # Preserve all original fields
"rowid": rowid, # Ensure rowid is present
"rrf_score": rrf_score,
"keyword_rank": keyword_rank,
"semantic_rank": semantic_rank,
}
# Add original scores if available (preserving metadata)
if rowid in keyword_results_map:
# FTS5 uses 'rank' field (negative values, more negative = less relevant)
keyword_result = keyword_results_map[rowid]
if "rank" in keyword_result:
merged_result["keyword_score"] = keyword_result["rank"]
if "snippet" in keyword_result:
merged_result["snippet"] = keyword_result["snippet"]
if rowid in semantic_results_map:
# Vector search uses 'similarity' field (0-1, higher = more similar)
semantic_result = semantic_results_map[rowid]
if "similarity" in semantic_result:
merged_result["semantic_score"] = semantic_result["similarity"]
elif "distance" in semantic_result:
merged_result["semantic_distance"] = semantic_result["distance"]
merged_results.append(merged_result)
# Sort by (RRF score DESC, rowid ASC) for deterministic pagination
# This ensures that results with the same score are always ordered the same way
merged_results.sort(key=lambda x: (-x["rrf_score"], x["rowid"]))
return merged_results
def hybrid_search(
query: str,
mode: SearchMode = SearchMode.HYBRID,
limit: int = 50,
offset: int = 0,
k: int = 60,
fts5_searcher: Callable[[str, int], list[dict[str, Any]]] | None = None,
vector_searcher: Callable[[str, int], list[dict[str, Any]]] | None = None,
) -> dict[str, Any]:
"""Perform hybrid search combining keyword and semantic search.
This function orchestrates the search process based on the selected mode:
- HYBRID: Runs both FTS5 and vector search, merges with RRF
- KEYWORD: Runs only FTS5 full-text search
- SEMANTIC: Runs only vector similarity search
Args:
query: Search query string.
mode: Search mode (hybrid, keyword, or semantic).
limit: Maximum number of results to return.
offset: Number of results to skip (for pagination).
k: RRF constant for hybrid mode (default: 60).
fts5_searcher: Optional callable that performs FTS5 search.
Should take (query, limit) and return list of results.
vector_searcher: Optional callable that performs vector search.
Should take (query, limit) and return list of results.
Returns:
Dictionary with:
- results: List of search results (paginated)
- pagination: Pagination metadata (total, offset, limit, has_more, next_offset)
- mode: Search mode used
- query: Original query string
Example:
>>> def my_fts_search(q, lim):
... # Your FTS5 implementation
... return [...]
>>>
>>> def my_vec_search(q, lim):
... # Your vector search implementation
... return [...]
>>>
>>> results = hybrid_search(
... "hello world",
... mode=SearchMode.HYBRID,
... limit=10,
... offset=0,
... fts5_searcher=my_fts_search,
... vector_searcher=my_vec_search,
... )
"""
# Determine which search methods to run
run_fts5 = mode in (SearchMode.HYBRID, SearchMode.KEYWORD)
run_vector = mode in (SearchMode.HYBRID, SearchMode.SEMANTIC)
# For hybrid mode, fetch more results before merging to ensure good coverage
# This accounts for the fact that different methods may rank differently
fetch_limit = limit * 3 if mode == SearchMode.HYBRID else limit + offset + 1
# Execute searches based on mode
fts5_results: list[dict[str, Any]] = []
if run_fts5 and fts5_searcher is not None:
fts5_results = fts5_searcher(query, fetch_limit)
vector_results: list[dict[str, Any]] = []
if run_vector and vector_searcher is not None:
vector_results = vector_searcher(query, fetch_limit)
# Merge results based on mode
if mode == SearchMode.HYBRID:
# Use RRF to merge both result sets
all_results = rrf_merge(fts5_results, vector_results, k=k)
elif mode == SearchMode.KEYWORD:
# Return only FTS5 results
all_results = fts5_results
elif mode == SearchMode.SEMANTIC:
# Return only vector results
all_results = vector_results
else:
# Should never happen, but handle gracefully
all_results = []
# Compute total count before applying offset
total = len(all_results)
# Apply pagination: offset and limit
paginated_results = all_results[offset : offset + limit]
# Determine if there are more results
has_more = offset + limit < total
return {
"results": paginated_results,
"pagination": {
"total": total,
"offset": offset,
"limit": limit,
"has_more": has_more,
"next_offset": offset + limit if has_more else None,
},
"mode": mode.value,
"query": query,
}