#!/usr/bin/env python3
"""
Documentation Query Tool
Utility script to query and analyze scraped documentation databases.
Provides various ways to search and extract information from the documentation.
Usage examples:
python query_docs.py --search "tutorial"
python query_docs.py --section "getting-started"
python query_docs.py --stats
python query_docs.py --export-section "api" --format markdown
"""
import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import List, Dict, Optional
# Import configuration
try:
from config import SCRAPER_CONFIG, get_database_path, get_docs_display_name
except ImportError as e:
    print(f"❌ Could not import required configuration from 'config.py': {e}")
    print("Please ensure config.py exists and defines SCRAPER_CONFIG, get_database_path, and get_docs_display_name.")
sys.exit(1)
class DocumentationQuery:
"""Query interface for scraped documentation databases"""
    def __init__(self, db_path: Optional[str] = None):
if db_path is None:
# Use config helper to get database path
db_path = get_database_path()
self.db_path = Path(db_path)
if not self.db_path.exists():
raise FileNotFoundError(f"Database not found: {db_path}")
# Get base URL for display purposes
self.base_url = SCRAPER_CONFIG.get("base_url", "")
self.domain_name = get_docs_display_name()
def _extract_domain_name(self) -> str:
"""Extract domain name from base URL for display (deprecated - use get_docs_display_name())"""
return get_docs_display_name()
def search_content(self, query: str, limit: int = 10) -> List[Dict]:
"""Search for content containing the query string"""
with sqlite3.connect(self.db_path) as conn:
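            # sqlite3.Row enables access by column name and conversion via dict(row)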
conn.row_factory = sqlite3.Row
# Try FTS search first
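            # (pages_fts is assumed to be an FTS5 virtual table created by the scraper;
            #  snippet() returns a highlighted excerpt from column 1, assumed to be the markdown text)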
try:
cursor = conn.execute("""
SELECT p.url, p.title, p.section, p.subsection, p.word_count,
snippet(pages_fts, 1, '<mark>', '</mark>', '...', 32) as snippet
FROM pages_fts
JOIN pages p ON pages_fts.rowid = p.id
WHERE pages_fts MATCH ?
ORDER BY rank
LIMIT ?
""", (query, limit))
results = [dict(row) for row in cursor.fetchall()]
if results:
return results
except sqlite3.OperationalError:
# FTS not available, fall back to LIKE search
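                # (a malformed MATCH expression also raises OperationalError,
                #  so queries containing FTS special characters still return results)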
pass
# Fallback LIKE search
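            # (substring match on title/markdown: slower and unranked, but schema-independent)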
cursor = conn.execute("""
SELECT url, title, section, subsection, word_count,
substr(markdown, 1, 200) as snippet
FROM pages
WHERE title LIKE ? OR markdown LIKE ?
ORDER BY word_count DESC
LIMIT ?
""", (f"%{query}%", f"%{query}%", limit))
return [dict(row) for row in cursor.fetchall()]
def get_by_section(self, section: str) -> List[Dict]:
"""Get all pages from a specific section"""
with sqlite3.connect(self.db_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("""
SELECT url, title, section, subsection, word_count, markdown
FROM pages
WHERE section = ?
ORDER BY title
""", (section,))
return [dict(row) for row in cursor.fetchall()]
def get_stats(self) -> Dict:
"""Get summary statistics about the documentation"""
with sqlite3.connect(self.db_path) as conn:
            # Total pages and words (SUM() returns NULL on an empty table)
            cursor = conn.execute("SELECT COUNT(*), SUM(word_count) FROM pages")
            total_pages, total_words = cursor.fetchone()
            total_words = total_words or 0
# Pages by section
cursor = conn.execute("""
SELECT section, COUNT(*) as page_count, SUM(word_count) as word_count
FROM pages
WHERE section IS NOT NULL AND section != ''
GROUP BY section
ORDER BY page_count DESC
""")
sections = [{"section": row[0], "pages": row[1], "words": row[2]}
for row in cursor.fetchall()]
# Top pages by word count
cursor = conn.execute("""
SELECT title, url, word_count
FROM pages
ORDER BY word_count DESC
LIMIT 10
""")
top_pages = [{"title": row[0], "url": row[1], "words": row[2]}
for row in cursor.fetchall()]
return {
"total_pages": total_pages,
"total_words": total_words,
"sections": sections,
"top_pages": top_pages,
"domain_name": self.domain_name,
"base_url": self.base_url
}
def get_all_sections(self) -> List[str]:
"""Get list of all documentation sections"""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.execute("""
SELECT DISTINCT section
FROM pages
WHERE section IS NOT NULL AND section != ''
ORDER BY section
""")
return [row[0] for row in cursor.fetchall()]
def export_section(self, section: str, format: str = "markdown") -> str:
"""Export a section in the specified format"""
pages = self.get_by_section(section)
if format.lower() == "json":
return json.dumps(pages, indent=2, ensure_ascii=False)
elif format.lower() == "markdown":
output = [f"# {self.domain_name} Documentation: {section.title()}\n"]
output.append(f"Total pages: {len(pages)}\n")
if self.base_url:
output.append(f"Source: {self.base_url}\n")
output.append("---\n")
for page in pages:
output.append(f"## {page['title']}\n")
output.append(f"**URL:** {page['url']}\n")
if page['subsection']:
output.append(f"**Subsection:** {page['subsection']}\n")
output.append(f"**Word Count:** {page['word_count']}\n\n")
output.append(page['markdown'] + "\n\n")
output.append("---\n\n")
return "\n".join(output)
else:
raise ValueError(f"Unsupported format: {format}")
def get_page_by_url(self, url: str) -> Optional[Dict]:
"""Get a specific page by URL"""
with sqlite3.connect(self.db_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("""
SELECT * FROM pages WHERE url = ?
""", (url,))
row = cursor.fetchone()
return dict(row) if row else None
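
# Example usage (a minimal sketch; assumes the scraper has already populated
# the database at the configured path):
#
#     dq = DocumentationQuery()
#     for hit in dq.search_content("install", limit=5):
#         print(hit["title"], "->", hit["url"])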
def get_default_db_path() -> str:
"""Get the default database path from config"""
return get_database_path()
def main():
# Get domain info for help text
domain_name = get_docs_display_name()
base_url = SCRAPER_CONFIG.get("base_url", "documentation site")
parser = argparse.ArgumentParser(
description=f"Query and analyze scraped documentation database for {domain_name}",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""
Examples:
python query_docs.py --search "tutorial guide"
python query_docs.py --section "getting-started"
python query_docs.py --stats
python query_docs.py --export-section "api" --format markdown > api.md
python query_docs.py --list-sections
"""
)
default_db = get_default_db_path()
parser.add_argument("--db", default=default_db,
help=f"Path to the documentation database (default: {default_db})")
# Search operations
parser.add_argument("--search", metavar="QUERY",
help="Search for content containing the query")
parser.add_argument("--section", metavar="SECTION",
help="Get all pages from a specific section")
parser.add_argument("--url", metavar="URL",
help="Get a specific page by URL")
# Export operations
parser.add_argument("--export-section", metavar="SECTION",
help="Export a section to stdout")
parser.add_argument("--format", choices=["markdown", "json"], default="markdown",
help="Export format (default: markdown)")
# Information operations
parser.add_argument("--stats", action="store_true",
help="Show database statistics")
parser.add_argument("--list-sections", action="store_true",
help="List all available sections")
# Options
parser.add_argument("--limit", type=int, default=10,
help="Limit number of search results (default: 10)")
args = parser.parse_args()
try:
query_tool = DocumentationQuery(args.db)
if args.search:
print(f"🔍 Searching for: '{args.search}'\n")
results = query_tool.search_content(args.search, args.limit)
if not results:
print("No results found.")
return
for i, result in enumerate(results, 1):
print(f"{i}. **{result['title']}**")
print(f" Section: {result['section']}")
print(f" URL: {result['url']}")
print(f" Words: {result['word_count']}")
if result.get('snippet'):
print(f" Preview: {result['snippet']}")
print()
elif args.section:
print(f"📚 Pages in section: '{args.section}'\n")
pages = query_tool.get_by_section(args.section)
if not pages:
print("No pages found in this section.")
return
for page in pages:
print(f"• {page['title']}")
print(f" URL: {page['url']}")
print(f" Words: {page['word_count']}")
if page['subsection']:
print(f" Subsection: {page['subsection']}")
print()
elif args.url:
print(f"📄 Page details for: {args.url}\n")
page = query_tool.get_page_by_url(args.url)
if not page:
print("Page not found.")
return
print(f"**Title:** {page['title']}")
print(f"**Section:** {page['section']}")
if page['subsection']:
print(f"**Subsection:** {page['subsection']}")
print(f"**Word Count:** {page['word_count']}")
print(f"**Scraped:** {page['scraped_at']}")
print("\n**Content:**")
            content = page['markdown']
            print(content[:500] + "..." if len(content) > 500 else content)
elif args.export_section:
content = query_tool.export_section(args.export_section, args.format)
print(content)
elif args.stats:
print(f"📊 {query_tool.domain_name} Documentation Statistics\n")
stats = query_tool.get_stats()
print(f"**Source:** {stats['base_url'] or 'Local database'}")
print(f"**Database:** {query_tool.db_path}")
print(f"**Total Pages:** {stats['total_pages']:,}")
print(f"**Total Words:** {stats['total_words']:,}")
print()
if stats['sections']:
print("**Pages by Section:**")
for section in stats['sections']:
section_name = section['section'] or 'General'
print(f" {section_name}: {section['pages']} pages ({section['words']:,} words)")
print()
if stats['top_pages']:
print("**Largest Pages:**")
for page in stats['top_pages']:
print(f" {page['title']}: {page['words']:,} words")
print(f" {page['url']}")
print()
elif args.list_sections:
print(f"📋 Available Sections in {query_tool.domain_name}\n")
sections = query_tool.get_all_sections()
if sections:
for section in sections:
print(f"• {section}")
else:
print("No sections found in the database.")
else:
parser.print_help()
except FileNotFoundError as e:
print(f"❌ Error: {e}")
print("Make sure you've run the scraper first: python docs_scraper.py")
sys.exit(1)
except Exception as e:
print(f"❌ Error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()