# query_engine.py
"""High-Performance Query Engine for Calibre Library
Handles query execution, optimization, and result assembly
for large-scale library operations (107k+ books).
"""
import logging
import sqlite3
import time
import uuid
from typing import List, Dict, Any, Optional, Set, Tuple
from pathlib import Path
from query_builder import LibraryQuery, QueryType, SortOrder, TextSearch, MetadataFilters
from response_models import (
LibraryResponse, BookSummary, BookDetails, ContentMatch, SeriesInfo,
PaginationInfo, QueryMetadata, ResponseStatus, ResponseBuilder
)
from fuzzy_matcher import SimpleFuzzyMatcher
logger = logging.getLogger(__name__)
class QueryOptimizer:
    """Optimizes queries for performance on large datasets.

    All methods are stateless helpers used by LibraryQueryEngine to plan
    and generate SQL for searches against a Calibre metadata database.
    """

    @staticmethod
    def should_use_index(query: "LibraryQuery") -> Dict[str, bool]:
        """Determine which indexes to prefer for this query.

        Returns:
            Mapping of index name -> True when the query's filters or sort
            order would benefit from that index.
        """
        return {
            "author_index": bool(query.metadata_filters.authors),
            "title_index": bool(query.metadata_filters.titles),
            "series_index": bool(query.metadata_filters.series),
            "date_index": any([
                query.metadata_filters.published_after,
                query.metadata_filters.published_before,
                query.sort_order in [SortOrder.DATE_ASC, SortOrder.DATE_DESC],
            ]),
            "format_index": bool(query.metadata_filters.formats),
        }

    @staticmethod
    def estimate_result_size(query: "LibraryQuery", total_books: int) -> int:
        """Estimate result set size for query planning.

        Uses rough selectivity heuristics; an explicit book-id list is exact.
        Always returns at least 1.
        """
        if query.book_ids:
            return len(query.book_ids)
        estimated_size = total_books
        # Text search typically reduces by 90-95%
        if not query.text_search.is_empty():
            estimated_size *= 0.1
        # Author filter reduces by ~99% (assuming ~100 books per author)
        if query.metadata_filters.authors:
            estimated_size = min(estimated_size, len(query.metadata_filters.authors) * 100)
        # Series filter is very selective (~20 books per series)
        if query.metadata_filters.series:
            estimated_size = min(estimated_size, len(query.metadata_filters.series) * 20)
        # Format filter moderately selective
        if query.metadata_filters.formats:
            estimated_size *= 0.7
        return max(1, int(estimated_size))

    @staticmethod
    def build_optimal_sql(query: "LibraryQuery", total_books: int) -> Tuple[str, List, str, List]:
        """Build optimized SQL queries for count and data retrieval.

        Returns:
            Tuple of (count_query, count_params, data_query, data_params).
            Count and data queries share identical WHERE conditions; the
            data query additionally carries LIMIT/OFFSET parameters.
        """
        estimated_size = QueryOptimizer.estimate_result_size(query, total_books)
        use_indexes = QueryOptimizer.should_use_index(query)
        logger.debug(f"Query plan: ~{estimated_size} rows estimated, index hints: {use_indexes}")

        base_from = "books b"
        filters = query.metadata_filters

        # JOINs required by the WHERE clause only (used by the count query).
        filter_joins = []
        if filters.authors or not query.text_search.is_empty():
            filter_joins.append("LEFT JOIN books_authors_link bal ON b.id = bal.book")
            filter_joins.append("LEFT JOIN authors a ON bal.author = a.id")
        if filters.series:
            filter_joins.append("LEFT JOIN books_series_link bsl ON b.id = bsl.book")
            filter_joins.append("LEFT JOIN series s ON bsl.series = s.id")
        if filters.tags:
            filter_joins.append("LEFT JOIN books_tags_link btl ON b.id = btl.book")
            filter_joins.append("LEFT JOIN tags t ON btl.tag = t.id")
        if filters.formats:
            filter_joins.append("LEFT JOIN data d ON b.id = d.book")

        # BUGFIX: the data query's SELECT/GROUP BY/ORDER BY always reference
        # a.name, s.name and d.format, so those joins must always be present
        # there.  Previously they were only added when a matching filter or
        # sort was set, yielding "no such column" errors for other queries.
        data_joins = [
            "LEFT JOIN books_authors_link bal ON b.id = bal.book",
            "LEFT JOIN authors a ON bal.author = a.id",
            "LEFT JOIN books_series_link bsl ON b.id = bsl.book",
            "LEFT JOIN series s ON bsl.series = s.id",
            "LEFT JOIN data d ON b.id = d.book",
        ]
        if filters.tags:
            data_joins.append("LEFT JOIN books_tags_link btl ON b.id = btl.book")
            data_joins.append("LEFT JOIN tags t ON btl.tag = t.id")

        # WHERE conditions and parameters (shared by count and data queries).
        where_conditions = []
        params = []

        if query.book_ids:
            placeholders = ','.join(['?' for _ in query.book_ids])
            where_conditions.append(f"b.id IN ({placeholders})")
            params.extend(query.book_ids)

        # Text search over title and author name (OR across terms).
        if not query.text_search.is_empty():
            search_terms = query.text_search.get_search_terms()
            if query.text_search.fuzzy:
                # Add fuzzy variations of every term, de-duplicated.
                expanded_terms = []
                for term in search_terms:
                    expanded_terms.extend(SimpleFuzzyMatcher.expand_search_terms(term))
                search_terms = list(set(expanded_terms))
            text_conditions = []
            for term in search_terms[:5]:  # Limit to 5 terms for performance
                if query.text_search.case_sensitive:
                    text_conditions.append("(b.title LIKE ? OR a.name LIKE ?)")
                else:
                    text_conditions.append("(b.title LIKE ? COLLATE NOCASE OR a.name LIKE ? COLLATE NOCASE)")
                params.extend([f"%{term}%", f"%{term}%"])
            if text_conditions:
                where_conditions.append(f"({' OR '.join(text_conditions)})")

        def _add_like_filter(values, column):
            """OR together a case-insensitive substring match per value."""
            conditions = [f"{column} LIKE ? COLLATE NOCASE" for _ in values]
            params.extend(f"%{value}%" for value in values)
            where_conditions.append(f"({' OR '.join(conditions)})")

        if filters.authors:
            _add_like_filter(filters.authors, "a.name")
        if filters.titles:
            _add_like_filter(filters.titles, "b.title")
        if filters.series:
            _add_like_filter(filters.series, "s.name")
        # BUGFIX: tag filters previously added the tags JOIN but never a
        # WHERE condition, so they were silently ignored.
        if filters.tags:
            _add_like_filter(filters.tags, "t.name")
        if filters.formats:
            format_conditions = ["d.format = ?" for _ in filters.formats]
            params.extend(fmt.upper() for fmt in filters.formats)
            where_conditions.append(f"({' OR '.join(format_conditions)})")

        # Date range conditions (inclusive bounds).
        if filters.published_after:
            where_conditions.append("b.pubdate >= ?")
            params.append(filters.published_after)
        if filters.published_before:
            where_conditions.append("b.pubdate <= ?")
            params.append(filters.published_before)

        where_clause = " WHERE " + " AND ".join(where_conditions) if where_conditions else ""
        # Snapshot shared params before LIMIT/OFFSET are appended below.
        count_params = list(params)

        count_query = f"""
        SELECT COUNT(DISTINCT b.id)
        FROM {base_from}
        {' '.join(filter_joins)}
        {where_clause}
        """

        # BUGFIX: SQLite rejects GROUP_CONCAT(DISTINCT x, sep) — DISTINCT
        # aggregates must have exactly one argument — so concatenate with
        # the default ',' separator and widen it to ', ' via REPLACE so the
        # Python-side split(', ') keeps working.  NOTE(review): assumes the
        # concatenated names contain no bare ',' — verify against the data.
        select_fields = """
        b.id, b.title, b.pubdate, b.timestamp,
        REPLACE(GROUP_CONCAT(DISTINCT a.name), ',', ', ') as authors,
        s.name as series_name, b.series_index,
        REPLACE(GROUP_CONCAT(DISTINCT d.format), ',', ', ') as formats
        """
        group_by = "GROUP BY b.id, b.title, b.pubdate, b.timestamp, s.name, b.series_index"

        # ORDER BY clause; secondary title sort keeps pagination stable.
        order_mapping = {
            SortOrder.TITLE_ASC: "b.title ASC",
            SortOrder.TITLE_DESC: "b.title DESC",
            SortOrder.AUTHOR_ASC: "MIN(a.name) ASC, b.title ASC",
            SortOrder.AUTHOR_DESC: "MIN(a.name) DESC, b.title ASC",
            SortOrder.DATE_ASC: "b.pubdate ASC, b.title ASC",
            SortOrder.DATE_DESC: "b.pubdate DESC, b.title ASC",
            SortOrder.SERIES_ORDER: "s.name ASC, b.series_index ASC, b.title ASC",
            SortOrder.ADDED_DESC: "b.timestamp DESC",
            SortOrder.RANDOM: "RANDOM()"
        }
        order_by = f"ORDER BY {order_mapping.get(query.sort_order, 'b.title ASC')}"

        params.extend([query.pagination.limit, query.pagination.offset])
        data_query = f"""
        SELECT {select_fields}
        FROM {base_from}
        {' '.join(data_joins)}
        {where_clause}
        {group_by}
        {order_by}
        LIMIT ? OFFSET ?
        """

        logger.debug(f"Generated count query: {count_query}")
        logger.debug(f"Generated data query: {data_query}")
        logger.debug(f"Query parameters: {len(params)} params")
        return count_query, count_params, data_query, params
class LibraryQueryEngine:
    """High-performance query engine for the Calibre library.

    Wraps a Calibre metadata SQLite database and turns LibraryQuery objects
    into standardized LibraryResponse results.
    """

    def __init__(self, db_path: Path):
        """Initialize the query engine.

        Args:
            db_path: Path to the Calibre metadata SQLite database.
        """
        self.db_path = db_path
        # Cached once so the optimizer can estimate selectivity cheaply.
        self.total_books = self._get_total_books()
        logger.info(f"Query engine initialized for {self.total_books} books")

    def _get_total_books(self) -> int:
        """Get total book count for optimization.

        Returns a conservative default instead of raising, so engine
        construction never fails on a broken database.
        """
        try:
            with sqlite3.connect(str(self.db_path)) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM books")
                return cursor.fetchone()[0]
        except Exception as e:
            logger.error(f"Failed to get total book count: {e}")
            return 100000  # Safe default for optimization

    def execute_query(self, query: "LibraryQuery") -> "LibraryResponse":
        """Execute a library query and return standardized response.

        Simple book-id lookups take a fast path; any exception is converted
        into an error response rather than propagated to the caller.
        """
        start_time = time.time()
        query_id = str(uuid.uuid4())[:8]
        logger.debug(f"Executing query {query_id}: {query.estimate_complexity()} complexity")
        try:
            # Handle simple book ID lookups efficiently
            if query.is_simple_lookup():
                return self._execute_book_lookup(query, query_id, start_time)
            # Execute general search query
            return self._execute_search_query(query, query_id, start_time)
        except Exception as e:
            execution_time = (time.time() - start_time) * 1000
            logger.error(f"Query {query_id} failed after {execution_time:.2f}ms: {e}")
            return LibraryResponse.error(f"Query execution failed: {str(e)}")

    def _execute_book_lookup(self, query: "LibraryQuery", query_id: str, start_time: float) -> "LibraryResponse":
        """Execute optimized book ID lookup.

        Chooses between a full-metadata query (description, tags, ISBN, …)
        and a lightweight summary query based on the response options.
        """
        with sqlite3.connect(str(self.db_path)) as conn:
            cursor = conn.cursor()
            placeholders = ','.join(['?' for _ in query.book_ids])
            # BUGFIX: SQLite rejects GROUP_CONCAT(DISTINCT x, sep) — DISTINCT
            # aggregates must have exactly one argument — so concatenate with
            # the default ',' separator and widen it to ', ' via REPLACE so
            # the split(', ') parsing below keeps working.  NOTE(review):
            # assumes names contain no bare ',' — verify against the data.
            if query.response_options.include_full_metadata:
                sql = f"""
                    SELECT b.id, b.title, b.pubdate, b.timestamp, b.isbn, b.publisher,
                           REPLACE(GROUP_CONCAT(DISTINCT a.name), ',', ', ') as authors,
                           s.name as series_name, b.series_index,
                           c.text as description,
                           REPLACE(GROUP_CONCAT(DISTINCT d.format), ',', ', ') as formats,
                           REPLACE(GROUP_CONCAT(DISTINCT t.name), ',', ', ') as tags
                    FROM books b
                    LEFT JOIN books_authors_link bal ON b.id = bal.book
                    LEFT JOIN authors a ON bal.author = a.id
                    LEFT JOIN books_series_link bsl ON b.id = bsl.book
                    LEFT JOIN series s ON bsl.series = s.id
                    LEFT JOIN comments c ON b.id = c.book
                    LEFT JOIN data d ON b.id = d.book
                    LEFT JOIN books_tags_link btl ON b.id = btl.book
                    LEFT JOIN tags t ON btl.tag = t.id
                    WHERE b.id IN ({placeholders})
                    GROUP BY b.id
                    ORDER BY b.title
                """
            else:
                sql = f"""
                    SELECT b.id, b.title, b.pubdate,
                           REPLACE(GROUP_CONCAT(DISTINCT a.name), ',', ', ') as authors,
                           s.name as series_name, b.series_index,
                           REPLACE(GROUP_CONCAT(DISTINCT d.format), ',', ', ') as formats
                    FROM books b
                    LEFT JOIN books_authors_link bal ON b.id = bal.book
                    LEFT JOIN authors a ON bal.author = a.id
                    LEFT JOIN books_series_link bsl ON b.id = bsl.book
                    LEFT JOIN series s ON bsl.series = s.id
                    LEFT JOIN data d ON b.id = d.book
                    WHERE b.id IN ({placeholders})
                    GROUP BY b.id
                    ORDER BY b.title
                """
            cursor.execute(sql, query.book_ids)
            rows = cursor.fetchall()
            execution_time = (time.time() - start_time) * 1000
            # Convert DB rows to response objects.
            if query.response_options.include_full_metadata:
                books = []
                for row in rows:
                    (book_id, title, pubdate, timestamp, isbn, publisher, authors,
                     series_name, series_index, description, formats, tags) = row
                    series = None
                    if series_name:
                        series = SeriesInfo(name=series_name, index=series_index)
                    book = BookDetails(
                        book_id=book_id,
                        title=title,
                        authors=authors.split(', ') if authors else [],
                        series=series,
                        published_date=pubdate,
                        publisher=publisher,
                        isbn=isbn,
                        description=description,
                        formats={fmt: "" for fmt in (formats or "").split(', ') if fmt},
                        tags=(tags or "").split(', ')[:20] if tags else []  # cap tags at 20
                    )
                    books.append(book)
                query_metadata = QueryMetadata(
                    query_id=query_id,
                    execution_time_ms=execution_time,
                    complexity="simple",
                    database_queries=1,
                    rows_examined=len(rows)
                )
                return ResponseBuilder.book_details_response(books, query_metadata)
            else:
                books = []
                for row in rows:
                    book_id, title, pubdate, authors, series_name, series_index, formats = row
                    series = None
                    if series_name:
                        series = SeriesInfo(name=series_name, index=series_index)
                    book = BookSummary(
                        book_id=book_id,
                        title=title,
                        authors=authors.split(', ') if authors else [],
                        series=series,
                        published_date=pubdate,
                        formats=(formats or "").split(', ') if formats else []
                    )
                    books.append(book)
                # Lookups are not paginated: everything requested is returned.
                pagination = PaginationInfo(
                    offset=0,
                    limit=len(books),
                    total_results=len(books),
                    has_more=False
                )
                query_metadata = QueryMetadata(
                    query_id=query_id,
                    execution_time_ms=execution_time,
                    complexity="simple",
                    database_queries=1,
                    rows_examined=len(rows)
                )
                return ResponseBuilder.book_search_response(books, pagination, query_metadata)

    def _execute_search_query(self, query: "LibraryQuery", query_id: str, start_time: float) -> "LibraryResponse":
        """Execute general search query with optimization.

        Runs a count query first and skips the data query entirely when
        nothing matches.
        """
        with sqlite3.connect(str(self.db_path)) as conn:
            cursor = conn.cursor()
            # Build optimized SQL for both count and data retrieval.
            count_query, count_params, data_query, data_params = QueryOptimizer.build_optimal_sql(
                query, self.total_books
            )
            database_queries = 0
            # Get total count
            cursor.execute(count_query, count_params)
            total_results = cursor.fetchone()[0]
            database_queries += 1
            # Get data only if there are results
            books = []
            if total_results > 0:
                cursor.execute(data_query, data_params)
                rows = cursor.fetchall()
                database_queries += 1
                # Convert rows to BookSummary objects
                for row in rows:
                    book_id, title, pubdate, timestamp, authors, series_name, series_index, formats = row
                    series = None
                    if series_name:
                        series = SeriesInfo(name=series_name, index=series_index)
                    book = BookSummary(
                        book_id=book_id,
                        title=title,
                        authors=authors.split(', ') if authors else [],
                        series=series,
                        published_date=pubdate,
                        formats=(formats or "").split(', ') if formats else []
                    )
                    books.append(book)
            execution_time = (time.time() - start_time) * 1000
            # Build response
            pagination = PaginationInfo(
                offset=query.pagination.offset,
                limit=query.pagination.limit,
                total_results=total_results,
                has_more=(query.pagination.offset + query.pagination.limit) < total_results
            )
            optimizations = []
            if query.estimate_complexity() == "complex" and execution_time < 100:
                optimizations.append("query_optimization_applied")
            query_metadata = QueryMetadata(
                query_id=query_id,
                execution_time_ms=execution_time,
                complexity=query.estimate_complexity(),
                database_queries=database_queries,
                rows_examined=total_results,
                optimizations_applied=optimizations
            )
            return ResponseBuilder.book_search_response(books, pagination, query_metadata)