#!/usr/bin/env python3
"""
Unified content extractors for RivalSearchMCP.
Consolidates the best extraction methods from all modules.
"""
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from bs4 import BeautifulSoup, Tag
from src.logging.logger import logger
# Performance optimization imports
try:
from selectolax.parser import HTMLParser
SELECTOLAX_AVAILABLE = True
except ImportError:
SELECTOLAX_AVAILABLE = False
try:
import lxml
LXML_AVAILABLE = True
except ImportError:
LXML_AVAILABLE = False
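# Parser preference, fastest first: selectolax, then BeautifulSoup backed by
# lxml, then the stdlib html.parser. The flags above let the extractors
# degrade gracefully when the optional dependencies are missing.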
class BaseContentExtractor(ABC):
"""Base class for content extractors."""
def __init__(self):
"""Initialize the extractor."""
pass
@abstractmethod
def extract(self, content: str, **kwargs) -> Any:
"""Extract content using the extractor's method."""
pass
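    # The return type is Any because subclasses differ: the unified and generic
    # extractors return str, while GoogleSpecificExtractor returns a list of
    # result dicts.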
class UnifiedContentExtractor(BaseContentExtractor):
"""Unified content extractor with multiple fallback methods."""
def extract(self, html_content: str, **kwargs) -> str:
"""Extract main content from HTML using multiple optimized methods as fallbacks."""
if not html_content:
return ""
try:
# Method 1: Use selectolax for ultra-fast parsing (if available)
if SELECTOLAX_AVAILABLE:
try:
parser = HTMLParser(html_content)
# Try multiple content selectors with selectolax
content_selectors = [
"main",
'[role="main"]',
".main-content",
".content",
".post-content",
".article-content",
"#content",
"#main",
".entry-content",
".post-body",
".article-body"
]
for selector in content_selectors:
try:
element = parser.css_first(selector)
if element:
content = element.text(separator=" ", strip=True)
if len(content) > 100:
logger.debug(f"Method 1 (selectolax {selector}) succeeded")
return content
except Exception:
continue
# Try body extraction with selectolax
try:
body = parser.css_first("body")
if body:
content = body.text(separator=" ", strip=True)
if len(content) > 100:
logger.debug("Method 1 (selectolax body) succeeded")
return content
except Exception:
pass
except Exception as e:
logger.debug(f"selectolax method failed: {e}")
# Method 2: Use BeautifulSoup with lxml parser (faster than html.parser)
parser_name = 'lxml' if LXML_AVAILABLE else 'html.parser'
soup = BeautifulSoup(html_content, parser_name)
# Try multiple content selectors
content_selectors = [
"main",
'[role="main"]',
".main-content",
".content",
".post-content",
".article-content",
"#content",
"#main",
".entry-content",
".post-body",
".article-body"
]
for selector in content_selectors:
try:
element = soup.select_one(selector)
if element and hasattr(element, 'get_text'):
content = element.get_text(separator=" ", strip=True)
if len(content) > 100:
logger.debug(f"Method 2 (BeautifulSoup {selector}) succeeded")
return content
except Exception:
continue
# Method 3: Extract from body with cleanup
try:
# Remove unwanted elements
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "menu"]):
if isinstance(tag, Tag) and hasattr(tag, 'decompose'):
tag.decompose()
body = soup.find("body")
if isinstance(body, Tag) and hasattr(body, 'get_text'):
content = body.get_text(separator=" ", strip=True)
if len(content) > 100:
logger.debug("Method 3 (body cleanup) succeeded")
return content
except Exception:
pass
            # Method 4: Fallback to regex-based extraction
            try:
                # Strip script/style blocks first so their contents don't leak
                # into the extracted text
                text_content = re.sub(r"(?is)<(script|style)[^>]*>.*?</\1>", " ", html_content)
                # Replace remaining tags with spaces so words separated only by
                # markup don't run together, then collapse whitespace
                text_content = re.sub(r"<[^>]+>", " ", text_content)
                text_content = re.sub(r"\s+", " ", text_content).strip()
                if len(text_content) > 100:
                    logger.debug("Method 4 (regex) succeeded")
                    return text_content
            except Exception:
                pass
# Method 5: Last resort - extract from title and any text
try:
title = soup.find("title")
if isinstance(title, Tag) and hasattr(title, 'get_text'):
title_text = title.get_text()
if len(title_text) > 10:
logger.debug("Method 5 (title) succeeded")
return title_text
except Exception:
pass
logger.warning("All content extraction methods failed")
return ""
except Exception as e:
logger.error(f"Content extraction failed: {e}")
return ""
class GoogleSpecificExtractor(BaseContentExtractor):
"""Google-specific content extractor with specialized selectors."""
def extract(self, html_content: str, **kwargs) -> List[Dict[str, str]]:
"""Extract Google search results using specialized selectors."""
max_results = kwargs.get('max_results', 10)
soup = BeautifulSoup(html_content, "html.parser")
results = []
seen_urls = set()
# Google-specific selector sets (from extract.py)
selector_sets = [
{"container": "#search div[data-hveid]", "title": "h3", "snippet": ".VwiC3b"},
{"container": "#rso div[data-hveid]", "title": "h3", "snippet": '[data-sncf="1"]'},
{"container": ".g", "title": "h3", "snippet": 'div[style*="webkit-line-clamp"]'},
{"container": "div[jscontroller][data-hveid]", "title": "h3", "snippet": 'div[role="text"]'},
]
alt_snippet_selectors = [
".VwiC3b",
'[data-sncf="1"]',
'div[style*="webkit-line-clamp"]',
'div[role="text"]',
]
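        # Caveat: class names such as ".VwiC3b" are machine-generated and tend
        # to rotate as Google ships new result-page layouts, so each selector
        # set targets one layout observed at the time of writing.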
for selectors in selector_sets:
if len(results) >= max_results:
break
containers = soup.select(selectors["container"])
for container in containers:
if len(results) >= max_results:
break
# Extract title
title_elem = container.select_one(selectors["title"])
if not title_elem:
continue
title = title_elem.text.strip()
link = ""
# Extract link with multiple fallback strategies
link_in_title = title_elem.find_parent("a")
if link_in_title and isinstance(link_in_title, Tag) and hasattr(link_in_title, "get"):
link = link_in_title.get("href", "")
else:
parent = title_elem.parent
while parent and parent.name != "a":
parent = parent.parent
if parent and parent.name == "a" and isinstance(parent, Tag) and hasattr(parent, "get"):
link = parent.get("href", "")
else:
container_link = container.find("a")
if container_link and isinstance(container_link, Tag) and hasattr(container_link, "get"):
link = container_link.get("href", "")
if (
not link
or not isinstance(link, str)
or not link.startswith("http")
or link in seen_urls
):
continue
# Extract snippet
snippet = ""
snippet_elem = container.select_one(selectors["snippet"])
if snippet_elem:
snippet = snippet_elem.text.strip()
else:
for alt in alt_snippet_selectors:
elem = container.select_one(alt)
if elem:
snippet = elem.text.strip()
break
if not snippet:
text_divs = []
for div in container.find_all("div"):
if isinstance(div, Tag) and not div.find("h3") and len(div.text.strip()) > 20:
text_divs.append(div)
if text_divs:
snippet = text_divs[0].text.strip()
if title and link:
results.append({"title": title, "link": link, "snippet": snippet})
seen_urls.add(link)
# Fallback: extract from any HTTP links
if len(results) < max_results:
anchors = soup.select("a[href^='http']")
for a in anchors:
if len(results) >= max_results:
break
link = a.get("href", "")
if (
not link
or not isinstance(link, str)
or not link.startswith("http")
or "google.com" in link
or link in seen_urls
):
continue
title = a.text.strip()
if not title:
continue
snippet = ""
parent = a.parent
for _ in range(3):
if parent:
text = parent.text.strip()
if len(text) > 20 and text != title:
snippet = text
break
parent = parent.parent
results.append({"title": title, "link": link, "snippet": snippet})
seen_urls.add(link)
return results[:max_results]
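# Usage sketch for GoogleSpecificExtractor (assumes serp_html holds a search
# results page fetched elsewhere; fetching is outside this module's scope):
#
#     results = GoogleSpecificExtractor().extract(serp_html, max_results=5)
#     # -> [{"title": "...", "link": "https://...", "snippet": "..."}, ...]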
class GenericContentExtractor(BaseContentExtractor):
"""Generic content extractor for documentation and general websites."""
def extract(self, html_content: str, **kwargs) -> str:
"""Extract main content using generic selectors and element removal."""
soup = BeautifulSoup(html_content, "html.parser")
# Try to find main content areas (from llms generator)
main_selectors = [
"main",
'[role="main"]',
".main-content",
".content",
".post-content",
".article-content",
"#content",
"#main",
]
for selector in main_selectors:
main_element = soup.select_one(selector)
if main_element:
return str(main_element)
# Fallback: remove unwanted elements and get body content
self._remove_unwanted_elements(soup)
body = soup.find("body")
if body:
return str(body)
return str(soup)
def _remove_unwanted_elements(self, soup):
"""Remove unwanted HTML elements."""
# Remove script and style elements
for element in soup(["script", "style", "noscript", "iframe", "embed", "object"]):
element.decompose()
# Remove navigation, footer, header elements
for element in soup(["nav", "footer", "header", "aside", "menu"]):
element.decompose()
        # Remove common ad and tracking elements; the word boundaries keep the
        # pattern from also matching classes like "load", "read", or "shadow"
        for element in soup.find_all(
            class_=re.compile(
                r"\b(ads?|advertisement|banner|tracking|analytics|cookie|popup|modal|overlay)\b",
                re.I,
            )
        ):
            element.decompose()
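

if __name__ == "__main__":
    # Minimal smoke test with synthetic HTML; a sketch only, not part of the
    # server runtime (the src.logging.logger import must still resolve).
    sample = (
        "<html><head><title>Sample</title></head><body>"
        "<nav>skip me</nav><main>" + "real content " * 20 + "</main>"
        "</body></html>"
    )
    text = UnifiedContentExtractor().extract(sample)
    print(f"unified: {len(text)} chars -> {text[:40]!r}")
    fragment = GenericContentExtractor().extract(sample)
    print(f"generic: HTML fragment of {len(fragment)} chars")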