"""
LLMs.txt Generator core functionality for RivalSearchMCP.
Handles website documentation generation following the llmstxt.org specification.
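
The generated llms.txt is expected to look roughly like this abridged sketch
(names and URLs below are placeholders, not real project data):

    # Project Name

    > One-sentence project description.

    ## Guides & Tutorials

    - [Getting Started](https://example.com/docs/start): short page description...

    ## Optional

    - [Changelog](https://example.com/changelog): short page description...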
"""
import json
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup, Tag
from src.logging.logger import logger
class LLMsTxtGenerator:
"""
Generic LLMs.txt generator that can work with any documentation website.
Follows the llmstxt.org specification for documentation generation.
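
    A minimal usage sketch (the config values below are illustrative placeholders,
    not defaults shipped with the project):

        config = {
            "name": "Example Project",
            "description": "Documentation for an example project.",
            "urls": ["https://example.com/docs/"],
            "max_pages": 50,
            "output_dir": "output",
        }
        generator = LLMsTxtGenerator(config)
        generator.run()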
"""
def __init__(self, config: Dict[str, Any]):
"""
Initialize the generator with a configuration.
Args:
config: Configuration dictionary containing:
- name: Project name
- description: Project description
- urls: List of URLs to start crawling from
- content_selectors: CSS selectors for content extraction
- title_selectors: CSS selectors for title extraction
- category_rules: Rules for categorizing pages
- rate_limit: Delay between requests (seconds)
- max_pages: Maximum number of pages to process
- user_agent: User agent string
- traversal_mode: Traversal mode - "docs", "research", "map"
"""
self.config = config
self.visited_urls = set()
self.pages_data = []
self.session = requests.Session()
self.session.headers.update(
{"User-Agent": config.get("user_agent", "LLMs.txt Generator/1.0")}
)
# Initialize advanced components
self.content_processor = ContentProcessor()
def discover_pages(self) -> List[Dict[str, str]]:
"""Discover pages using simple link discovery."""
discovered_pages = []
base_urls = self.config.get("urls", [])
max_pages = self.config.get("max_pages", 100)
logger.info("Using simple page discovery")
logger.info(f"Base URLs: {base_urls}")
for base_url in base_urls:
logger.info(f"Processing base URL: {base_url}")
# Use simple link discovery to find pages
discovered_urls = self._simple_link_discovery(base_url, max_pages)
# Add the base URL itself if not already found
if base_url not in discovered_urls:
discovered_urls.insert(0, base_url)
for url in discovered_urls:
if len(discovered_pages) >= max_pages:
break
if url not in [page["url"] for page in discovered_pages]:
discovered_pages.append({"url": url, "source": "link_discovery"})
logger.info(f"Discovered {len(discovered_pages)} pages using simple discovery")
return discovered_pages or []
def _simple_link_discovery(self, base_url: str, max_pages: int) -> List[str]:
"""Simple link discovery as fallback."""
discovered_urls = []
try:
html_content = self.get_page_content(base_url)
if html_content:
soup = BeautifulSoup(html_content, "html.parser")
# Find all links
                for link in soup.find_all("a", href=True):
                    if isinstance(link, Tag):
                        href_attr = link["href"]
                        if href_attr and isinstance(href_attr, str):
                            full_url = self._resolve_url(base_url, href_attr)
                            if full_url and full_url not in discovered_urls:
                                discovered_urls.append(full_url)
if len(discovered_urls) >= max_pages:
break
except Exception as e:
logger.warning(f"Link discovery failed for {base_url}: {e}")
return discovered_urls
    def get_page_content(self, url: str) -> Optional[str]:
        """Fetch page content, honoring the configured rate limit if one is set."""
        try:
            # Respect the optional delay between requests ("rate_limit" config key)
            delay = self.config.get("rate_limit", 0)
            if delay:
                time.sleep(delay)
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.warning(f"Failed to get content from {url}: {e}")
            return None
    def _resolve_url(self, base_url: str, href: str) -> Optional[str]:
        """Resolve a link to an absolute same-domain URL, or None if it should be skipped."""
        try:
            # Skip non-navigational schemes and in-page anchors
            if href.startswith(("javascript:", "mailto:", "tel:", "#")):
                return None
            # Resolve relative URLs against the base
            full_url = urljoin(base_url, href)
            # Only include same-domain URLs; external links are dropped here
            base_domain = urlparse(base_url).netloc
            full_domain = urlparse(full_url).netloc
            if base_domain == full_domain:
                return full_url
            return None
        except Exception:
            return None
def process_pages(self, discovered_pages: List[Dict[str, str]]):
"""Process discovered pages to extract content."""
logger.info("Starting page processing...")
for i, page_info in enumerate(discovered_pages, 1):
url = page_info["url"]
if url in self.visited_urls:
continue
logger.info(f"Processing page {i}/{len(discovered_pages)}: {url}")
try:
content = self.get_page_content(url)
if content:
soup = BeautifulSoup(content, "html.parser")
# Extract title
title_tag = soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else "Untitled"
# Extract main content
main_content = self._extract_main_content(soup)
# Clean content
clean_content = self._clean_content(main_content)
# Categorize page
category = self._categorize_page(title, clean_content, url)
# Store page data
self.pages_data.append(
{
"url": url,
"title": title,
"content": clean_content,
"category": category,
"description": (
clean_content[:200] + "..."
if len(clean_content) > 200
else clean_content
),
}
)
self.visited_urls.add(url)
except Exception as e:
logger.warning(f"Failed to process {url}: {e}")
logger.info(f"Processed {len(self.pages_data)} pages")
def _extract_main_content(self, soup) -> str:
"""Extract main content from HTML using unified extractor."""
try:
# Use unified content extractor if available
from src.core.content.utils import extract_main_content
html_str = str(soup)
return extract_main_content(html_str)
except ImportError:
# Fallback to original method
# Try to find main content areas
main_selectors = [
"main",
'[role="main"]',
".main-content",
".content",
".post-content",
".article-content",
"#content",
"#main",
]
for selector in main_selectors:
main_element = soup.select_one(selector)
if main_element:
return str(main_element)
# Fallback: remove navigation and get body content
self._remove_unwanted_elements(soup)
body = soup.find("body")
if body:
return str(body)
return str(soup)
def _remove_unwanted_elements(self, soup):
"""Remove unwanted HTML elements."""
# Remove script and style elements
for element in soup(
["script", "style", "noscript", "iframe", "embed", "object"]
):
element.decompose()
# Remove navigation, footer, header elements
for element in soup(["nav", "footer", "header", "aside", "menu"]):
element.decompose()
# Remove common ad and tracking elements
for element in soup.find_all(
class_=re.compile(
r"(ad|ads|advertisement|banner|tracking|analytics|cookie|popup|modal|overlay)",
re.I,
)
):
element.decompose()
def _clean_content(self, content: str) -> str:
"""Clean and format content using unified cleaner."""
try:
# Use unified content cleaner if available
from src.core.content.utils import clean_documentation
return clean_documentation(content)
except ImportError:
# Fallback to original method
# Remove extra whitespace
content = re.sub(r"\s+", " ", content)
# Remove HTML tags
content = re.sub(r"<[^>]+>", "", content)
# Clean up text
content = content.strip()
return content
def _categorize_page(self, title: str, content: str, url: str) -> str:
"""Categorize page based on content and URL."""
        title_lower = title.lower()
        url_lower = url.lower()
# Documentation categories
if any(
word in title_lower
for word in ["api", "reference", "docs", "documentation"]
):
return "API Reference"
elif any(
word in title_lower
for word in ["guide", "tutorial", "how-to", "getting started"]
):
return "Guides & Tutorials"
elif any(word in title_lower for word in ["example", "sample", "demo"]):
return "Examples & Demos"
elif any(word in title_lower for word in ["install", "setup", "configuration"]):
return "Installation & Setup"
elif any(
word in title_lower
for word in ["faq", "help", "support", "troubleshooting"]
):
return "Help & Support"
elif any(word in url_lower for word in ["blog", "news", "announcement"]):
return "Blog & News"
else:
return "Other"
    def generate_llms_txt(self, output_file: Path):
        """Generate llms.txt file following the llmstxt.org specification."""
        logger.info(f"Generating {output_file}...")
        # Group pages by category
        categories = {}
        for page in self.pages_data:
            category = page["category"]
            if category not in categories:
                categories[category] = []
            categories[category].append(page)
        with open(output_file, "w", encoding="utf-8") as f:
            # Write H1 title (required)
            f.write(f"# {self.config['name']}\n\n")
            # Write blockquote summary (required)
            f.write(f"> {self.config['description']}\n\n")
            # Write sections as H2 headers with link lists (no inline page content,
            # per the llmstxt.org format; full content goes into llms-full.txt)
            for category in sorted(categories.keys()):
                if category == "Other":
                    # Use "Optional" for the "Other" category as per spec
                    f.write("## Optional\n\n")
                else:
                    f.write(f"## {category}\n\n")
                for page in sorted(categories[category], key=lambda x: x["title"]):
                    # Write the link with an optional one-line description
                    f.write(f"- [{page['title']}]({page['url']})")
                    if page["description"]:
                        f.write(f": {page['description']}")
                    f.write("\n")
                f.write("\n")
        logger.info(
            f"Generated {output_file} with links to {len(self.pages_data)} pages"
        )
def generate_llms_full_txt(self, output_file: Path):
"""Generate llms-full.txt file with expanded content."""
logger.info(f"Generating {output_file}...")
# Group pages by category
categories = {}
for page in self.pages_data:
category = page["category"]
if category not in categories:
categories[category] = []
categories[category].append(page)
with open(output_file, "w", encoding="utf-8") as f:
# Write H1 title (required)
f.write(f"# {self.config['name']}\n\n")
# Write blockquote summary (required)
f.write(f"> {self.config['description']}\n\n")
# Write sections with H2 headers and full content
for category in sorted(categories.keys()):
if category == "Other":
# Use "Optional" for the "Other" category as per spec
f.write("## Optional\n\n")
else:
f.write(f"## {category}\n\n")
for page in sorted(categories[category], key=lambda x: x["title"]):
# Write the link first (following llmstxt.org format)
description = page["description"] if page["description"] else ""
f.write(f"- [{page['title']}]({page['url']})")
if description:
f.write(f": {description}")
f.write("\n\n")
# Write the full content
f.write(page["content"])
f.write("\n\n---\n\n")
logger.info(
f"Generated {output_file} with full content from {len(self.pages_data)} pages"
)
def generate_llms_ctx_files(self):
"""Generate llms-ctx.txt and llms-ctx-full.txt files following the spec."""
logger.info("\nGenerating llms-ctx.txt and llms-ctx-full.txt...")
# Generate llms-ctx.txt (without Optional section)
self._generate_ctx_file("llms-ctx.txt", include_optional=False)
# Generate llms-ctx-full.txt (with Optional section)
self._generate_ctx_file("llms-ctx-full.txt", include_optional=True)
logger.info("Generated llms-ctx.txt and llms-ctx-full.txt files")
def _generate_ctx_file(self, output_file: str, include_optional: bool = False):
"""Generate a context file with expanded content."""
# Group pages by category
categories = {}
for page in self.pages_data:
category = page["category"]
if category not in categories:
categories[category] = []
categories[category].append(page)
with open(output_file, "w", encoding="utf-8") as f:
# Write XML-style header
f.write(
f'<project title="{self.config["name"]}" summary="{self.config["description"]}">\n'
)
# Write sections
for category in sorted(categories.keys()):
if category == "Other" and not include_optional:
continue
f.write(f'<section name="{category}">\n')
for page in sorted(categories[category], key=lambda x: x["title"]):
f.write(f'<page title="{page["title"]}" url="{page["url"]}">\n')
f.write(f'<content>{page["content"]}</content>\n')
f.write("</page>\n")
f.write("</section>\n")
f.write("</project>")
logger.info(f"Generated {output_file}")
def save_data(self, output_file: str = "documentation_data.json"):
"""Save raw data for debugging."""
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.pages_data, f, indent=2, ensure_ascii=False)
logger.info(f"Saved raw data to {output_file}")
def run(self):
"""Main execution method."""
logger.info(f"LLMs.txt Generator for {self.config['name']}")
logger.info("=" * 50)
# Discover pages
logger.info("Starting page discovery...")
discovered_pages = self.discover_pages()
logger.info(f"\nDiscovered {len(discovered_pages)} pages")
# Process pages
logger.info("Starting page processing...")
self.process_pages(discovered_pages)
# Generate output files following llmstxt.org specification
logger.info("Starting file generation...")
output_dir = self.config.get("output_dir", ".")
# Generate files in the specified output directory
self.generate_llms_txt(Path(output_dir) / "llms.txt")
self.generate_llms_full_txt(Path(output_dir) / "llms-full.txt")
self.save_data(str(Path(output_dir) / "documentation_data.json"))
logger.info("\nGeneration complete!")
logger.info(f"Processed {len(self.pages_data)} pages")
logger.info("Generated files:")
logger.info("- llms.txt (standard llmstxt.org format)")
logger.info("- llms-full.txt (full content with expanded links)")
logger.info("- documentation_data.json (raw data)")
class ContentProcessor:
"""Advanced content processing for LLMs.txt generation."""
def __init__(self):
"""Initialize the content processor."""
def extract_metadata(self, soup) -> Dict[str, Any]:
"""Extract metadata from HTML content."""
metadata = {}
# Extract meta tags
meta_tags = soup.find_all("meta")
for meta in meta_tags:
name = meta.get("name", meta.get("property", ""))
content = meta.get("content", "")
if name and content:
metadata[name] = content
# Extract Open Graph tags
og_tags = soup.find_all("meta", property=re.compile(r"^og:"))
for og in og_tags:
property_name = og.get("property", "")
content = og.get("content", "")
if property_name and content:
metadata[property_name] = content
return metadata
def extract_links(self, soup, base_url: str) -> List[Dict[str, str]]:
"""Extract and categorize links from HTML content."""
links = []
for link in soup.find_all("a", href=True):
href = link["href"]
text = link.get_text(strip=True)
if href and text:
full_url = urljoin(base_url, href)
link_type = self._categorize_link(href, text)
links.append({"url": full_url, "text": text, "type": link_type})
return links
def _categorize_link(self, href: str, text: str) -> str:
"""Categorize link based on URL and text."""
href_lower = href.lower()
text_lower = text.lower()
if any(word in href_lower for word in ["api", "docs", "reference"]):
return "documentation"
elif any(word in href_lower for word in ["guide", "tutorial", "how-to"]):
return "guide"
elif any(word in href_lower for word in ["example", "demo", "sample"]):
return "example"
elif any(word in href_lower for word in ["install", "setup", "get-started"]):
return "setup"
elif any(word in text_lower for word in ["download", "install", "setup"]):
return "download"
else:
return "general"
def clean_text_content(self, text: str) -> str:
"""Clean and normalize text content."""
        # Remove extra whitespace
        text = re.sub(r"\s+", " ", text)
        # Normalize curly quotes to straight quotes before stripping special characters
        text = text.replace("\u201c", '"').replace("\u201d", '"')
        text = text.replace("\u2018", "'").replace("\u2019", "'")
        # Remove special characters that might interfere with markdown
        text = re.sub(r'[^\w\s\-.,!?;:()[\]{}"\']', "", text)
        return text.strip()
def extract_headings(self, soup) -> List[Dict[str, Any]]:
"""Extract heading structure from HTML content."""
headings = []
for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
level = int(heading.name[1])
text = heading.get_text(strip=True)
if text:
headings.append(
{"level": level, "text": text, "id": heading.get("id", "")}
)
return headings
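

# Minimal manual smoke test (values are illustrative placeholders; not intended
# as the project's CLI entry point):
if __name__ == "__main__":
    example_config = {
        "name": "Example Project",
        "description": "Documentation for an example project.",
        "urls": ["https://example.com/docs/"],
        "max_pages": 10,
        "output_dir": ".",
    }
    LLMsTxtGenerator(example_config).run()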