# calibre_full_text_search.py
"""Calibre Full-Text Search Module
Provides full-text search functionality for Calibre library content.
Connects to the Calibre full-text search database to search within book content.
"""
import logging
import os
import re
import sqlite3
from contextlib import closing
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)


class CalibreFullTextSearch:
    """Substring search over the ``books_text`` table of a Calibre FTS database.

    Every method opens a short-lived SQLite connection to ``fts_db_path``,
    runs its queries and closes the connection again, so an instance holds no
    open database handle between calls.

    The public methods (``search_content``, ``search_specific_book``,
    ``get_search_statistics``, ``ping``) report failures as
    ``{"status": "error", "message": ...}`` dictionaries instead of raising;
    only the constructor raises.
    """

    # Columns that must be present in ``books_text`` for searching to work.
    _REQUIRED_COLUMNS = ("book", "searchable_text", "format")

    def __init__(self, fts_db_path: str):
        """Store the database path and validate the database structure.

        Args:
            fts_db_path: Filesystem path of the SQLite full-text search database.

        Raises:
            FileNotFoundError: If ``fts_db_path`` does not exist.
            ValueError: If the ``books_text`` table or one of its required
                columns is missing, or SQLite reports an error while checking.
        """
        self.fts_db_path = fts_db_path
        self._validate_database()
        logger.info("Full-text search initialized with database: %s", fts_db_path)

    def _connect(self):
        """Return a context manager that yields a connection and closes it on exit."""
        # ``with sqlite3.Connection`` only commits or rolls back; it does not
        # close the handle. ``closing`` guarantees the close on every exit path.
        return closing(sqlite3.connect(self.fts_db_path))

    def _validate_database(self) -> None:
        """Check that the database file exists and has the expected table layout.

        Raises:
            FileNotFoundError: If the database file does not exist.
            ValueError: If ``books_text`` or a required column is missing, or
                if SQLite raises an error during the checks.
        """
        # Checked up front because sqlite3.connect() would otherwise create an
        # empty database file at a non-existent path.
        if not os.path.exists(self.fts_db_path):
            raise FileNotFoundError(f"FTS database not found: {self.fts_db_path}")

        try:
            with self._connect() as conn:
                cursor = conn.cursor()

                # Check for required table
                cursor.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' AND name='books_text'"
                )
                if cursor.fetchone() is None:
                    raise ValueError("books_text table not found in FTS database")

                # Check table structure (column name is field 1 of each row)
                cursor.execute("PRAGMA table_info(books_text)")
                columns = {col[1] for col in cursor.fetchall()}
                for col in self._REQUIRED_COLUMNS:
                    if col not in columns:
                        raise ValueError(
                            f"Required column '{col}' not found in books_text table"
                        )
        except sqlite3.Error as e:
            raise ValueError(f"Error validating FTS database: {e}") from e

    @staticmethod
    def _match_clause(query: str, case_sensitive: bool) -> Tuple[str, str]:
        """Build the SQL predicate and bound pattern for a literal substring match.

        Wildcard characters in ``query`` are escaped so that SQL selects the
        same rows in which the ``re.escape``-based matcher will find matches.

        Args:
            query: Text to look for, taken literally.
            case_sensitive: Use ``GLOB`` when true, ``LIKE`` otherwise.

        Returns:
            ``(predicate, pattern)`` where ``predicate`` is a constant SQL
            fragment containing one ``?`` placeholder and ``pattern`` is the
            value to bind to it.
        """
        if case_sensitive:
            # GLOB has no escape character; a one-character set such as [*]
            # is the way to match a metacharacter literally.
            literal = re.sub(r"([*?\[])", r"[\1]", query)
            return "searchable_text GLOB ?", f"*{literal}*"

        # Escape the escape character first, then the LIKE wildcards.
        literal = (
            query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        return "searchable_text LIKE ? ESCAPE '\\'", f"%{literal}%"

    def search_content(self, query: str, case_sensitive: bool = False,
                       max_results: int = 100, context_chars: int = 200) -> Dict[str, Any]:
        """Search for text content across all books.

        Args:
            query: Search query string, matched literally as a substring.
            case_sensitive: Whether to perform case-sensitive search.
            max_results: Maximum number of ``books_text`` rows fetched (SQL LIMIT).
            context_chars: Number of characters to include around matches for context.

        Returns:
            On success a dictionary with ``status``, ``query``,
            ``case_sensitive``, ``total_books_searched``,
            ``books_with_matches`` (number of entries in ``results``, one per
            matching book/format row), ``total_matches`` and ``results``.
            On failure ``{"status": "error", "message": ...}``.
        """
        if not query or not query.strip():
            return {
                "status": "error",
                "message": "Search query cannot be empty"
            }

        try:
            # ``predicate`` is one of two constant strings, never user input.
            predicate, pattern = self._match_clause(query, case_sensitive)
            search_sql = (
                "SELECT book, format, searchable_text FROM books_text WHERE "
                + predicate + " LIMIT ?"
            )
            with self._connect() as conn:
                rows = conn.execute(search_sql, (pattern, max_results)).fetchall()

            # Process results to extract context
            processed_results = []
            for book_id, format_type, content in rows:
                matches = self._extract_matches_with_context(
                    content, query, context_chars, case_sensitive
                )
                if matches:  # Only report rows that really contain the query
                    processed_results.append({
                        "book_id": book_id,
                        "format": format_type,
                        "match_count": len(matches),
                        "matches": matches
                    })

            return {
                "status": "success",
                "query": query,
                "case_sensitive": case_sensitive,
                "total_books_searched": self._get_total_indexed_books(),
                "books_with_matches": len(processed_results),
                "total_matches": sum(result["match_count"] for result in processed_results),
                "results": processed_results
            }
        except sqlite3.Error as e:
            logger.error("Database error during search: %s", e)
            return {
                "status": "error",
                "message": f"Database error: {e}"
            }
        except Exception as e:
            logger.exception("Unexpected error during search: %s", e)
            return {
                "status": "error",
                "message": f"Search error: {e}"
            }

    def search_specific_book(self, book_id: int, query: str, case_sensitive: bool = False,
                             context_chars: int = 200) -> Dict[str, Any]:
        """Search for text content within a specific book.

        Args:
            book_id: ID of the book to search within (``book`` column).
            query: Search query string, matched literally as a substring.
            case_sensitive: Whether to perform case-sensitive search.
            context_chars: Number of characters to include around matches for context.

        Returns:
            On success a dictionary with ``status``, ``book_id``, ``query``,
            ``case_sensitive``, ``formats_with_matches``, ``total_matches``
            and ``results`` (one entry per format containing a match; empty
            when nothing matched). On failure
            ``{"status": "error", "message": ...}``.
        """
        if not query or not query.strip():
            return {
                "status": "error",
                "message": "Search query cannot be empty"
            }

        try:
            # ``predicate`` is one of two constant strings, never user input.
            predicate, pattern = self._match_clause(query, case_sensitive)
            search_sql = (
                "SELECT format, searchable_text FROM books_text WHERE book = ? AND "
                + predicate
            )
            with self._connect() as conn:
                rows = conn.execute(search_sql, (book_id, pattern)).fetchall()

            # Process results; an empty row set simply yields an empty list.
            processed_results = []
            for format_type, content in rows:
                matches = self._extract_matches_with_context(
                    content, query, context_chars, case_sensitive
                )
                if matches:  # Only include formats that have matches
                    processed_results.append({
                        "format": format_type,
                        "match_count": len(matches),
                        "matches": matches
                    })

            return {
                "status": "success",
                "book_id": book_id,
                "query": query,
                "case_sensitive": case_sensitive,
                "formats_with_matches": len(processed_results),
                "total_matches": sum(result["match_count"] for result in processed_results),
                "results": processed_results
            }
        except sqlite3.Error as e:
            logger.error("Database error during book search: %s", e)
            return {
                "status": "error",
                "message": f"Database error: {e}"
            }
        except Exception as e:
            logger.exception("Unexpected error during book search: %s", e)
            return {
                "status": "error",
                "message": f"Search error: {e}"
            }

    def get_search_statistics(self) -> Dict[str, Any]:
        """Get statistics about the full-text search database.

        Besides the columns checked at construction time, this reads the
        ``text_size`` and ``err_msg`` columns of ``books_text``; if they are
        missing, the SQLite error is reported as an error result.

        Returns:
            On success a dictionary with ``status``, ``database_path``,
            ``total_indexed_books``, ``total_text_entries``,
            ``total_text_size_bytes``, ``total_text_size_mb``,
            ``books_with_extraction_errors`` and ``format_distribution``.
            On failure ``{"status": "error", "message": ...}``.
        """
        try:
            with self._connect() as conn:
                cursor = conn.cursor()

                # Get total indexed books
                cursor.execute("SELECT COUNT(DISTINCT book) FROM books_text")
                total_books = cursor.fetchone()[0]

                # Get total text entries (book+format combinations)
                cursor.execute("SELECT COUNT(*) FROM books_text")
                total_entries = cursor.fetchone()[0]

                # Get format distribution
                cursor.execute(
                    "SELECT format, COUNT(*) FROM books_text GROUP BY format ORDER BY COUNT(*) DESC"
                )
                format_stats = cursor.fetchall()

                # Get total text size; SUM() yields NULL when no row qualifies
                cursor.execute(
                    "SELECT SUM(text_size) FROM books_text WHERE text_size IS NOT NULL"
                )
                total_text_size = cursor.fetchone()[0] or 0

                # Get books with errors
                cursor.execute(
                    "SELECT COUNT(*) FROM books_text WHERE err_msg IS NOT NULL AND err_msg != ''"
                )
                books_with_errors = cursor.fetchone()[0]

            return {
                "status": "success",
                "database_path": self.fts_db_path,
                "total_indexed_books": total_books,
                "total_text_entries": total_entries,
                "total_text_size_bytes": total_text_size,
                "total_text_size_mb": round(total_text_size / (1024 * 1024), 2),
                "books_with_extraction_errors": books_with_errors,
                "format_distribution": [
                    {"format": fmt, "count": count} for fmt, count in format_stats
                ]
            }
        except sqlite3.Error as e:
            logger.error("Database error getting statistics: %s", e)
            return {
                "status": "error",
                "message": f"Database error: {e}"
            }
        except Exception as e:
            logger.exception("Unexpected error getting statistics: %s", e)
            return {
                "status": "error",
                "message": f"Statistics error: {e}"
            }

    def _extract_matches_with_context(self, content: str, query: str, context_chars: int,
                                      case_sensitive: bool) -> List[Dict[str, Any]]:
        """Extract matches with surrounding context from content.

        Args:
            content: Text to scan; empty or ``None`` yields no matches.
            query: Text to find; it is regex-escaped and therefore literal.
            context_chars: Characters of context on each side of a match.
                Negative values are treated as 0.
            case_sensitive: Match case exactly when true.

        Returns:
            One dictionary per non-overlapping match with ``match_text``,
            ``position`` (offset in ``content``), ``context``,
            ``context_start``, ``context_end`` and the match offsets relative
            to the context (``match_start_in_context``,
            ``match_end_in_context``).
        """
        if not content:
            return []

        # A negative width would push the window inside the match and yield
        # negative in-context offsets.
        context_chars = max(0, context_chars)

        # Prepare regex pattern
        flags = 0 if case_sensitive else re.IGNORECASE
        pattern = re.compile(re.escape(query), flags)
        content_length = len(content)

        matches = []
        for match in pattern.finditer(content):
            start_pos, end_pos = match.span()

            # Calculate context boundaries, clipped to the content
            context_start = max(0, start_pos - context_chars)
            context_end = min(content_length, end_pos + context_chars)

            matches.append({
                "match_text": match.group(),
                "position": start_pos,
                "context": content[context_start:context_end],
                "context_start": context_start,
                "context_end": context_end,
                "match_start_in_context": start_pos - context_start,
                "match_end_in_context": end_pos - context_start
            })
        return matches

    def _get_total_indexed_books(self) -> int:
        """Get total number of indexed books, or 0 if SQLite reports an error."""
        try:
            with self._connect() as conn:
                row = conn.execute("SELECT COUNT(DISTINCT book) FROM books_text").fetchone()
                return row[0]
        except sqlite3.Error:
            return 0

    def ping(self) -> Dict[str, Any]:
        """Test connection to FTS database.

        Returns:
            A dictionary with ``status`` (``"success"`` or ``"error"``),
            ``message`` and ``database_path``.
        """
        try:
            with self._connect() as conn:
                # Reading a single row proves the table is reachable without
                # counting every row.
                conn.execute("SELECT 1 FROM books_text LIMIT 1").fetchone()
            return {
                "status": "success",
                "message": "FTS database connection successful",
                "database_path": self.fts_db_path
            }
        except sqlite3.Error as e:
            return {
                "status": "error",
                "message": f"FTS database connection failed: {e}",
                "database_path": self.fts_db_path
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Unexpected error: {e}",
                "database_path": self.fts_db_path
            }