RivalSearchMCP

parsers.py•6.37 KiB

#!/usr/bin/env python3
"""
Unified HTML parsers for RivalSearchMCP.

Provides a shared parser interface and three concrete parsers: a general
page-structure parser, a Google search results parser, and a documentation
page parser.
"""

import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup, Tag

from src.logging.logger import logger


class BaseHTMLParser(ABC):
    """Abstract interface implemented by every HTML parser in this module."""

    def __init__(self):
        """Create the parser; the base class holds no state."""
        pass

    @abstractmethod
    def parse(self, html_content: str, **kwargs) -> Any:
        """Convert ``html_content`` into a parser-specific result."""
        pass


class UnifiedHTMLParser(BaseHTMLParser):
    """Parser that summarises the overall structure of an HTML page."""

    def parse(self, html_content: str, **kwargs) -> Dict[str, Any]:
        """Summarise the structure of an HTML document.

        Args:
            html_content: Raw HTML markup.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict with the keys ``title``, ``meta_description``,
            ``headings``, ``links``, ``images``, ``forms``, ``tables``,
            ``main_content`` (always an empty string here) and ``metadata``.
            An empty dict is returned when ``html_content`` is falsy or when
            any step of the extraction raises.
        """
        if not html_content:
            return {}

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Page title; stays empty when there is no <title> element.
            title_node = soup.find("title")
            page_title = ""
            if isinstance(title_node, Tag):
                page_title = title_node.get_text().strip()

            # <meta name="description" content="...">
            description_node = soup.find("meta", attrs={"name": "description"})
            description = ""
            if isinstance(description_node, Tag):
                description = description_node.get("content", "")

            # Headings are grouped by level (all h1 first, then h2, ...),
            # not listed in document order.
            heading_entries: List[Dict[str, Any]] = []
            for level in range(1, 7):
                for node in soup.find_all(f"h{level}"):
                    if not isinstance(node, Tag):
                        continue
                    heading_entries.append({
                        "level": level,
                        "text": node.get_text().strip(),
                        "id": node.get("id", ""),
                    })

            # Only anchors that actually carry an href attribute.
            link_entries = [
                {
                    "text": anchor.get_text().strip(),
                    "href": anchor.get("href", ""),
                    "title": anchor.get("title", ""),
                }
                for anchor in soup.find_all("a", href=True)
                if isinstance(anchor, Tag)
            ]

            image_entries = [
                {
                    "alt": image.get("alt", ""),
                    "src": image.get("src", ""),
                    "title": image.get("title", ""),
                }
                for image in soup.find_all("img")
                if isinstance(image, Tag)
            ]

            # For each form, record where it submits and how many <input>
            # elements it contains.
            form_entries = [
                {
                    "action": form.get("action", ""),
                    "method": form.get("method", ""),
                    "inputs": len(form.find_all("input")),
                }
                for form in soup.find_all("form")
                if isinstance(form, Tag)
            ]

            # For each table, count its rows and the td/th cells in them.
            table_entries: List[Dict[str, int]] = []
            for table in soup.find_all("table"):
                if not isinstance(table, Tag):
                    continue
                table_rows = table.find_all("tr")
                total_cells = sum(
                    len(row.find_all(["td", "th"]))
                    for row in table_rows
                    if isinstance(row, Tag)
                )
                table_entries.append({
                    "rows": len(table_rows),
                    "cells": total_cells,
                })

            return {
                "title": page_title,
                "meta_description": description,
                "headings": heading_entries,
                "links": link_entries,
                "images": image_entries,
                "forms": form_entries,
                "tables": table_entries,
                "main_content": "",
                "metadata": self._extract_metadata(soup),
            }

        except Exception as exc:
            logger.error(f"HTML structure extraction failed: {exc}")
            return {}

    def _extract_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Collect ``<meta>`` tag values into a flat dict.

        Each meta tag is keyed by its ``name`` attribute, or by ``property``
        when ``name`` is absent. Tags with an empty key or empty ``content``
        are skipped. A second pass then stores every Open Graph tag
        (``property`` starting with ``og:``) under its property name.

        Args:
            soup: Parsed document to read meta tags from.

        Returns:
            Mapping of meta name/property to its ``content`` value.
        """
        collected: Dict[str, Any] = {}

        # Pass 1: every <meta> element, keyed by name (or property).
        for tag in soup.find_all("meta"):
            if not isinstance(tag, Tag):
                continue
            fallback_key = tag.get("property", "")
            key = tag.get("name", fallback_key)
            value = tag.get("content", "")
            if key and value:
                collected[key] = value

        # Pass 2: Open Graph tags, always keyed by their property attribute.
        og_pattern = re.compile(r"^og:")
        for tag in soup.find_all("meta", property=og_pattern):
            if not isinstance(tag, Tag):
                continue
            og_key = tag.get("property", "")
            og_value = tag.get("content", "")
            if og_key and og_value:
                collected[og_key] = og_value

        return collected


class GoogleSearchParser(BaseHTMLParser):
    """Parser for Google search result pages."""

    def parse(self, html_content: str, **kwargs) -> List[Dict[str, str]]:
        """Extract search results by delegating to ``GoogleSpecificExtractor``.

        Args:
            html_content: Raw HTML of the results page.
            **kwargs: Forwarded unchanged to the extractor.

        Returns:
            Whatever ``GoogleSpecificExtractor.extract`` returns.
        """
        # Imported at call time rather than at module load.
        from .extractors import GoogleSpecificExtractor

        return GoogleSpecificExtractor().extract(html_content, **kwargs)


class DocumentationParser(BaseHTMLParser):
    """Parser for documentation website pages."""

    def parse(self, html_content: str, **kwargs) -> Dict[str, Any]:
        """Parse documentation HTML for LLMs.txt generation.

        Runs ``GenericContentExtractor`` to obtain the main content, then
        ``UnifiedHTMLParser`` for the page structure, and stores the extracted
        content under the structure's ``main_content`` key.

        Args:
            html_content: Raw HTML of the documentation page.
            **kwargs: Forwarded to both the extractor and the structure parser.

        Returns:
            The structure dict from ``UnifiedHTMLParser.parse`` with
            ``main_content`` set to the extractor's result.
        """
        # Imported at call time rather than at module load.
        from .extractors import GenericContentExtractor

        # Content extraction runs first, then structural parsing.
        extracted_content = GenericContentExtractor().extract(html_content, **kwargs)
        page_structure = UnifiedHTMLParser().parse(html_content, **kwargs)

        page_structure["main_content"] = extracted_content
        return page_structure

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DamionR/RivalSearchMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

parsers.py•6.37 KiB