"""FTS5 full-text search with BM25 ranking for iMessage search.
This module provides keyword-based search using SQLite's FTS5 engine.
Features:
- BM25 ranking (built-in to FTS5)
- Metadata filtering (sender, chat, date, service)
- Snippet generation with highlighting
- FTS5 query syntax support (phrases, prefix*, NEAR, etc.)
"""
from dataclasses import dataclass
from typing import Any, Optional, List, Tuple
from ..db.search_index import get_search_index_connection
@dataclass
class FTSFilters:
"""Filters to apply to FTS5 search results.
All filters are optional and applied as AND conditions.
Filters are applied BEFORE the FTS5 MATCH for efficiency.
"""
sender: Optional[str] = None
chat_id: Optional[int] = None
after_date: Optional[int] = None # CoreData timestamp (inclusive)
before_date: Optional[int] = None # CoreData timestamp (inclusive)
service: Optional[str] = None # 'iMessage' or 'SMS'
@dataclass
class FTSResult:
"""Result from FTS5 search.
Attributes:
rowid: Message ROWID from chat.db
score: BM25 score (lower is better match)
snippet: Text snippet with <mark> tags around matches
"""
rowid: int
score: float
snippet: str
def fts5_search(
query: str,
filters: Optional[FTSFilters] = None,
limit: int = 100,
offset: int = 0
) -> Tuple[List[FTSResult], int]:
"""Search messages using FTS5 with BM25 ranking.
This function performs full-text search using SQLite's FTS5 engine,
which provides BM25 ranking out of the box. Results are filtered by
metadata (sender, chat, date, service) before the FTS5 MATCH for
optimal performance.
FTS5 Query Syntax:
- Phrase search: "exact phrase"
- Prefix search: word*
- Proximity: word1 NEAR word2
- Boolean: word1 AND word2, word1 OR word2, NOT word3
- Column-specific: not supported (we only index text column)
Args:
query: FTS5 query string (supports FTS5 syntax)
filters: Optional metadata filters to apply before FTS5 MATCH
limit: Maximum number of results to return
offset: Number of results to skip (for pagination)
Returns:
Tuple of (results, total_count) where:
- results: List of FTSResult objects (limited and offset applied)
- total_count: Total number of matching results (before limit/offset)
Raises:
sqlite3.OperationalError: If FTS5 query syntax is invalid
Example:
>>> # Simple keyword search
>>> results, total = fts5_search("hello world")
>>> # Phrase search with filters
>>> filters = FTSFilters(sender="+1234567890", after_date=726000000000000)
>>> results, total = fts5_search('"hello world"', filters=filters)
>>> # Prefix search with pagination
>>> results, total = fts5_search("meet*", limit=20, offset=40)
"""
if not query or not query.strip():
return [], 0
# Build WHERE clause components
where_clauses: List[str] = []
params: List[Any] = []
# Apply metadata filters BEFORE FTS5 MATCH
if filters:
if filters.sender is not None:
where_clauses.append("idx.sender = ?")
params.append(filters.sender)
if filters.chat_id is not None:
where_clauses.append("idx.chat_id = ?")
params.append(filters.chat_id)
if filters.after_date is not None:
where_clauses.append("idx.date_coredata >= ?")
params.append(filters.after_date)
if filters.before_date is not None:
where_clauses.append("idx.date_coredata <= ?")
params.append(filters.before_date)
if filters.service is not None:
where_clauses.append("idx.service = ?")
params.append(filters.service)
# Add FTS5 MATCH clause - must use table name not alias for FTS5 functions
where_clauses.append("message_fts MATCH ?")
params.append(query)
# Combine all WHERE conditions
where_clause = " AND ".join(where_clauses)
# Build SQL query
# - FTS5 auxiliary functions (bm25, highlight) require the FTS table to be
# the first table in FROM and to be referenced by actual table name
# - JOIN with message_index (idx) for metadata filtering
# - Use bm25(message_fts) for BM25 ranking (lower score = better match)
# - Use highlight() for snippet generation with <mark> tags
# - NOTE: Cannot use COUNT(*) OVER() with bm25() in same query due to SQLite limitation
# We run a separate count query instead
sql = f"""
SELECT
message_fts.rowid,
bm25(message_fts) as score,
highlight(message_fts, 0, '<mark>', '</mark>') as snippet
FROM message_fts
INNER JOIN message_index idx ON message_fts.rowid = idx.rowid
WHERE {where_clause}
ORDER BY score ASC
LIMIT ? OFFSET ?
"""
# Count query - separate from main query due to SQLite limitation
count_sql = f"""
SELECT COUNT(*)
FROM message_fts
INNER JOIN message_index idx ON message_fts.rowid = idx.rowid
WHERE {where_clause}
"""
# Add limit and offset to params (for main query only)
main_params = params + [limit, offset]
count_params = params # Count query doesn't need limit/offset
# Execute queries
with get_search_index_connection() as conn:
# Get total count first
count_cursor = conn.execute(count_sql, count_params)
total_count = count_cursor.fetchone()[0]
# Get paginated results
cursor = conn.execute(sql, main_params)
rows = cursor.fetchall()
# Parse results
results = []
for row in rows:
results.append(FTSResult(
rowid=row[0],
score=row[1],
snippet=row[2]
))
return results, total_count