Python Dependency Manager Companion

by KemingHe
build_index.py • 7.06 kB
#!/usr/bin/env python3
"""
Build search index for Python dependency manager documentation.

Creates a Tantivy full-text search index from all markdown files in the
assets directory, using metadata from the auto-update workflow to tag
documents by their source package manager.
"""

import json
import logging
import shutil
from pathlib import Path
from typing import Any, Dict

import tantivy

# Configure logging for visibility into indexing process
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Project paths
ASSETS_DIR = Path("src/assets")
INDEX_DIR = Path("src/index")


def load_metadata(metadata_path: Path) -> Dict[str, Any]:
    """Load package metadata from _metadata.json file."""
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            return json.load(f) or {}
    except Exception as e:
        logger.warning(f"Failed to load metadata from {metadata_path}: {e}")
        return {}


def create_schema() -> tantivy.Schema:
    """
    Create Tantivy schema for documentation search.

    Fields:
    - content: Full-text searchable markdown content
    - path: File path for result retrieval
    - package: Source package manager (pip, conda, poetry, uv, pixi, pdm)
      for filtering
    - title: Document title extracted from file path
    - source_repo: Original repository from metadata
    - docs_path: Documentation directory path from metadata (for GitHub links)
    """
    schema_builder = tantivy.SchemaBuilder()

    # Main searchable content - full-text with indexing and storage
    schema_builder.add_text_field("content", stored=True)

    # File path for retrieval - stored but not necessarily indexed for search
    schema_builder.add_text_field("path", stored=True)

    # Package name for filtering - indexed and stored
    schema_builder.add_text_field("package", stored=True)

    # Human-readable title - indexed and stored
    schema_builder.add_text_field("title", stored=True)

    # Source repository - indexed and stored
    schema_builder.add_text_field("source_repo", stored=True)

    # Documentation path - stored for GitHub link reconstruction
    schema_builder.add_text_field("docs_path", stored=True)

    return schema_builder.build()


def extract_title_from_path(file_path: Path) -> str:
    """Extract path-notation title from file path."""
    # Convert to path notation preserving directory structure
    # e.g., "src/assets/pip/cli/pip_install.rst" -> "pip/cli/pip_install.rst"
    parts = file_path.parts[2:]  # Skip "src/assets"
    filtered_parts = [part for part in parts if part != "index.md"]
    return "/".join(filtered_parts)


def find_markdown_files(assets_dir: Path) -> list[tuple[Path, str, Dict[str, Any]]]:
    """
    Find all markdown and reStructuredText files in the assets directory.

    Returns:
        List of tuples (file_path, package_name, metadata)
    """
    markdown_files = []

    for package_dir in assets_dir.iterdir():
        if not package_dir.is_dir():
            continue

        package_name = package_dir.name
        metadata_path = package_dir / "_metadata.json"
        metadata = load_metadata(metadata_path)

        logger.info(f"Scanning {package_name} documentation...")

        # Find all markdown and reStructuredText files
        for pattern in ["**/*.md", "**/*.rst"]:
            for file_path in package_dir.glob(pattern):
                if file_path.is_file():
                    markdown_files.append((file_path, package_name, metadata))

    return markdown_files


def read_file_content(file_path: Path) -> str:
    """
    Read file content with multiple encoding fallbacks.

    Note: This function is called only with paths from find_markdown_files(),
    which uses glob() to find files within the controlled ASSETS_DIR structure.
    The paths are generated by filesystem iteration, not user input.
    """
    # Ensure the file exists and is a regular file (not a symlink to outside assets)
    try:
        if not file_path.exists() or not file_path.is_file():
            logger.warning(f"File does not exist or is not a regular file: {file_path}")
            return ""
        # Simple safety check: ensure resolved path is still within assets
        if not file_path.resolve().is_relative_to(ASSETS_DIR.resolve()):
            logger.error(f"File path outside assets directory: {file_path}")
            return ""
    except (OSError, ValueError) as e:
        logger.warning(f"Path validation failed for {file_path}: {e}")
        return ""

    encodings = ["utf-8", "utf-8-sig", "latin1", "cp1252"]

    for encoding in encodings:
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
        except Exception as e:
            logger.warning(f"Error reading {file_path}: {e}")
            break

    logger.error(f"Failed to read {file_path} with any encoding")
    return ""


def build_index():
    """Build the Tantivy search index from all documentation files."""
    logger.info("Starting index build process...")

    # Remove existing index to prevent schema conflicts
    if INDEX_DIR.exists():
        logger.info(f"Removing existing index at {INDEX_DIR}")
        shutil.rmtree(INDEX_DIR)

    # Create index directory
    INDEX_DIR.mkdir(exist_ok=True)

    # Create schema and index
    schema = create_schema()
    index = tantivy.Index(schema, path=str(INDEX_DIR))
    writer = index.writer(heap_size=100_000_000)  # 100MB heap for better performance

    # Find all documentation files
    markdown_files = find_markdown_files(ASSETS_DIR)
    logger.info(f"Found {len(markdown_files)} documentation files")

    # Index each file
    indexed_count = 0
    for file_path, package_name, metadata in markdown_files:
        try:
            content = read_file_content(file_path)
            if not content.strip():
                continue

            title = extract_title_from_path(file_path)
            source_repo = metadata.get("source_repo", "unknown")
            docs_path = metadata.get("docs_path", "docs")

            # Create document
            doc = tantivy.Document(
                content=content,
                path=str(file_path.relative_to(ASSETS_DIR)),
                package=package_name,
                title=title,
                source_repo=source_repo,
                docs_path=docs_path,
            )

            writer.add_document(doc)
            indexed_count += 1

            if indexed_count % 100 == 0:
                logger.info(f"Indexed {indexed_count} documents...")

        except Exception as e:
            logger.error(f"Failed to index {file_path}: {e}")

    # Commit the index to disk
    logger.info("Committing index to disk...")
    writer.commit()

    logger.info(f"Index build complete! Indexed {indexed_count} documents")
    logger.info(f"Index stored in: {INDEX_DIR.absolute()}")


if __name__ == "__main__":
    build_index()
