AI Book Agent MCP Server

index_books.py•5.54 KiB

#!/usr/bin/env python3
"""Script to index EPUB books into the vector store.

For every ``*.epub`` file in the configured data directory the script parses
the book, saves the processed form, splits each chapter into chunks, embeds
the chunks and adds them to the vector store. A summary is logged at the end.
"""

import sys
import logging
from pathlib import Path
import json
from typing import Any, Dict, List, Tuple

# Put the directory two levels above this file on sys.path so that the
# ``src.*`` imports below resolve when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.utils.config import config
from src.parsers.epub_parser import EPUBParser
from src.embeddings.embeddings import EmbeddingGenerator
from src.search.vector_store import VectorStore

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)


def _build_chunks(
    metadata: Any,
    chapters: List[Any],
    embedding_generator: EmbeddingGenerator,
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """Split every chapter into chunks and build one metadata dict per chunk.

    Args:
        metadata: Book metadata as returned by ``EPUBParser.parse_epub``;
            its ``id``, ``title`` and ``author`` attributes are read.
        chapters: Parsed chapters; ``content``, ``chapter_id`` and ``title``
            are read from each one.
        embedding_generator: Provides ``chunk_text`` for splitting.

    Returns:
        Two parallel lists: the chunk texts and their metadata dicts.
    """
    chunk_size = config.vector_store['chunk_size']
    chunk_overlap = config.vector_store['chunk_overlap']

    all_chunks: List[str] = []
    all_metadata: List[Dict[str, Any]] = []

    for chapter in chapters:
        chunks = embedding_generator.chunk_text(
            chapter.content,
            chunk_size,
            chunk_overlap,
        )
        # chunk_index restarts at 0 for every chapter.
        for chunk_idx, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadata.append({
                "book_id": metadata.id,
                "book_title": metadata.title,
                "author": metadata.author,
                "chapter_id": chapter.chapter_id,
                "chapter_title": chapter.title,
                "chunk_index": chunk_idx,
                "chunk_word_count": len(chunk.split()),
            })

    return all_chunks, all_metadata


def index_epub_file(epub_path: Path,
                    parser: EPUBParser,
                    embedding_generator: EmbeddingGenerator,
                    vector_store: VectorStore) -> bool:
    """
    Index a single EPUB file.

    Args:
        epub_path: Path to EPUB file
        parser: EPUB parser instance
        embedding_generator: Embedding generator instance
        vector_store: Vector store instance

    Returns:
        True if successful, False otherwise. Any exception raised while
        processing is logged with its traceback and reported as False, so
        one bad book does not abort the whole run.
    """
    try:
        logger.info(f"Processing: {epub_path.name}")

        # Parse EPUB
        metadata, chapters = parser.parse_epub(str(epub_path))

        if not chapters:
            logger.warning(f"No chapters found in {epub_path.name}")
            return False

        # Save processed book data
        processed_file = parser.save_processed_book(
            metadata,
            chapters,
            config.books['processed_dir'],
        )
        logger.info(f"Saved processed data to: {processed_file}")

        # Prepare chunks and metadata for embedding
        all_chunks, all_metadata = _build_chunks(
            metadata, chapters, embedding_generator
        )

        # Chapters can exist yet produce no text (e.g. empty content);
        # do not hand empty batches to the embedder / vector store.
        if not all_chunks:
            logger.warning(f"No text chunks produced for {epub_path.name}")
            return False

        logger.info(
            f"Created {len(all_chunks)} chunks from {len(chapters)} chapters"
        )

        # Generate embeddings
        logger.info("Generating embeddings...")
        embeddings = embedding_generator.embed_texts(
            all_chunks, show_progress=True
        )

        # Add to vector store
        logger.info("Adding to vector store...")
        vector_store.add_book_chunks(
            book_id=metadata.id,
            chunks=all_chunks,
            embeddings=embeddings,
            metadata_list=all_metadata,
        )

        logger.info(f"Successfully indexed {epub_path.name}")
        logger.info(f" - Book ID: {metadata.id}")
        logger.info(f" - Chapters: {len(chapters)}")
        logger.info(f" - Chunks: {len(all_chunks)}")
        logger.info(
            f" - Total words: {sum(ch.word_count for ch in chapters)}"
        )

        return True

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error indexing {epub_path.name}: {e}")
        return False


def main():
    """Main indexing function.

    Builds the embedding generator, vector store and parser from ``config``,
    indexes every ``*.epub`` file found in ``config.books['data_dir']`` and
    logs a summary plus the vector store statistics.

    Exits with status 1 if a component fails to initialize or the data
    directory does not exist, and with status 0 if no EPUB files are found.
    """
    # Initialize components
    logger.info("Initializing components...")

    try:
        # Initialize embedding generator
        embedding_generator = EmbeddingGenerator(
            model_name=config.embeddings['model'],
            device=config.embeddings['device'],
            cache_dir=config.embeddings['cache_dir'],
        )

        # Initialize vector store
        vector_store = VectorStore(
            persist_directory=config.books['index_dir'],
            collection_name=config.vector_store['collection_name'],
        )

        # Initialize parser
        parser = EPUBParser()

    except Exception as e:
        logger.exception(f"Failed to initialize components: {e}")
        sys.exit(1)

    # Get list of EPUB files
    epub_dir = Path(config.books['data_dir'])
    if not epub_dir.exists():
        logger.error(f"EPUB directory not found: {epub_dir}")
        sys.exit(1)

    # glob() yields entries in arbitrary order; sort for reproducible runs.
    epub_files = sorted(epub_dir.glob("*.epub"))
    if not epub_files:
        logger.warning(f"No EPUB files found in {epub_dir}")
        sys.exit(0)

    logger.info(f"Found {len(epub_files)} EPUB files to process")

    # Process each file
    successful = 0
    failed = 0

    for epub_file in epub_files:
        logger.info(f"\n{'='*50}")
        if index_epub_file(epub_file, parser, embedding_generator, vector_store):
            successful += 1
        else:
            failed += 1

    # Print summary
    logger.info(f"\n{'='*50}")
    logger.info("INDEXING COMPLETE")
    logger.info(f"Successfully processed: {successful} books")
    logger.info(f"Failed to process: {failed} books")

    # Print vector store stats
    stats = vector_store.get_stats()
    logger.info("\nVector Store Statistics:")
    logger.info(f" - Total books: {stats['total_books']}")
    logger.info(f" - Total chunks: {stats['total_chunks']}")
    logger.info(f" - Available books: {', '.join(stats['books'])}")


if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/trakru/mcp-library-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

index_books.py•5.54 KiB