"""
URL Discovery Layer
Handles discovering URLs from documentation sites using multiple strategies:
1. Sitemap.xml parsing (fastest)
2. Common documentation patterns
3. Crawl4AI recursive discovery (fallback)
"""
import asyncio
import logging
from dataclasses import dataclass
from typing import List, Optional, Literal
from urllib.parse import urldefrag, urljoin, urlparse
import xml.etree.ElementTree as ET
from fnmatch import fnmatchcase
import httpx
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
logger = logging.getLogger(__name__)
@dataclass
class DiscoveryResult:
"""Result of URL discovery operation"""
urls: List[str]
method: Literal["sitemap", "patterns", "crawl", "manual"]
total_found: int
filtered_count: int
error: Optional[str] = None
class URLDiscoverer:
"""Discovers URLs from documentation sites using multiple strategies"""
def __init__(
self,
timeout: int = 30,
max_crawl_depth: int = 3,
max_urls: int = 1000
):
self.timeout = timeout
self.max_crawl_depth = max_crawl_depth
self.max_urls = max_urls
async def discover(
self,
url_pattern: str,
method: Literal["auto", "sitemap", "crawl", "manual"] = "auto"
) -> DiscoveryResult:
"""
Discover URLs using the specified method
Args:
url_pattern: URL or pattern (e.g., https://docs.example.com/*)
method: Discovery method to use
Returns:
DiscoveryResult with discovered URLs
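        Example (illustrative sketch; the docs.example.com pattern and limits are
        placeholders, not a real site):
            discoverer = URLDiscoverer(max_urls=200)
            result = await discoverer.discover("https://docs.example.com/*", method="auto")
            if result.error:
                logger.warning(f"Discovery failed: {result.error}")
            else:
                logger.info(f"{result.method} discovery found {len(result.urls)} URLs")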
"""
logger.info(f"Starting URL discovery for {url_pattern} using method: {method}")
# Parse the pattern to get base URL
base_url = self._extract_base_url(url_pattern)
try:
if method == "manual":
# Single URL, no discovery needed
return DiscoveryResult(
urls=[url_pattern],
method="manual",
total_found=1,
filtered_count=1
)
if method == "auto" or method == "sitemap":
# Try sitemap first
result = await self._try_sitemap(base_url, url_pattern)
if result.urls:
logger.info(f"Sitemap discovery found {len(result.urls)} URLs")
return result
if method == "sitemap":
# User specifically requested sitemap, don't fallback
return DiscoveryResult(
urls=[],
method="sitemap",
total_found=0,
filtered_count=0,
error="No sitemap found"
)
if method == "auto" or method == "crawl":
                # Fall back to Crawl4AI recursive discovery
logger.info("Falling back to Crawl4AI discovery")
result = await self._crawl4ai_discover(url_pattern)
return result
# Shouldn't reach here
return DiscoveryResult(
urls=[],
method=method,
total_found=0,
filtered_count=0,
error=f"Unknown method: {method}"
)
except Exception as e:
logger.error(f"Discovery failed: {e}")
return DiscoveryResult(
urls=[],
method=method,
total_found=0,
filtered_count=0,
error=str(e)
)
def _extract_base_url(self, url_pattern: str) -> str:
"""Extract base URL from pattern"""
# Remove wildcard if present
url = url_pattern.replace('*', '')
parsed = urlparse(url)
return f"{parsed.scheme}://{parsed.netloc}"
async def _try_sitemap(
self,
base_url: str,
url_pattern: str
) -> DiscoveryResult:
"""Try to discover URLs from sitemap.xml"""
sitemap_locations = [
f"{base_url}/sitemap.xml",
f"{base_url}/sitemap_index.xml",
f"{base_url}/docs/sitemap.xml",
f"{base_url}/api/sitemap.xml",
]
        # Follow redirects; sitemap URLs frequently redirect (e.g. HTTP -> HTTPS)
        async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
for sitemap_url in sitemap_locations:
try:
logger.debug(f"Trying sitemap: {sitemap_url}")
response = await client.get(sitemap_url)
if response.status_code == 200:
urls = self._parse_sitemap(response.text, url_pattern)
if urls:
return DiscoveryResult(
urls=urls[:self.max_urls],
method="sitemap",
total_found=len(urls),
filtered_count=len(urls[:self.max_urls])
)
except Exception as e:
logger.debug(f"Sitemap {sitemap_url} failed: {e}")
continue
return DiscoveryResult(
urls=[],
method="sitemap",
total_found=0,
filtered_count=0
)
def _parse_sitemap(self, content: str, pattern: str) -> List[str]:
"""Parse sitemap XML and filter URLs by pattern"""
try:
root = ET.fromstring(content)
urls = []
# Handle both regular sitemaps and sitemap indexes
# Check for sitemap namespace
namespaces = {
'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'
}
# Try with namespace first
locs = root.findall('.//sm:loc', namespaces)
if not locs:
# Try without namespace
locs = root.findall('.//loc')
for loc in locs:
                # <loc> text may carry surrounding whitespace; strip before matching
                url = (loc.text or "").strip()
if url:
# Check if this is a sitemap index pointing to other sitemaps
if url.endswith('.xml'):
# This might be a sitemap index, skip for now
# TODO: Recursively fetch sub-sitemaps
continue
# Filter by pattern
if self._matches_pattern(url, pattern):
urls.append(url)
return urls
except ET.ParseError as e:
logger.error(f"Failed to parse sitemap XML: {e}")
return []
def _matches_pattern(self, url: str, pattern: str) -> bool:
"""Check if URL matches the given pattern"""
        # If the pattern has a wildcard, use fnmatchcase ('*' matches any characters);
        # the case-sensitive variant avoids fnmatch's platform-dependent case folding
        if '*' in pattern:
            return fnmatchcase(url, pattern)
# Otherwise, check if URL starts with pattern (prefix match)
pattern_clean = pattern.rstrip('/')
return url.startswith(pattern_clean)
async def _crawl4ai_discover(self, url_pattern: str) -> DiscoveryResult:
"""Use Crawl4AI to recursively discover URLs"""
discovered_urls = set()
base_url = self._extract_base_url(url_pattern)
start_url = url_pattern.replace('*', '')
# Queue for BFS crawling
to_visit = [(start_url, 0)] # (url, depth)
visited = set()
browser_config = BrowserConfig(
headless=True,
verbose=False
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
js_code=[
# Wait for any dynamic content
"await new Promise(r => setTimeout(r, 1000));"
]
)
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
while to_visit and len(discovered_urls) < self.max_urls:
current_url, depth = to_visit.pop(0)
if current_url in visited:
continue
if depth >= self.max_crawl_depth:
continue
visited.add(current_url)
try:
logger.debug(f"Crawling {current_url} (depth: {depth})")
result = await crawler.arun(
url=current_url,
config=crawler_config
)
if result.success:
# Add current URL if it matches pattern
if self._matches_pattern(current_url, url_pattern):
discovered_urls.add(current_url)
# Extract and queue internal links
if result.links:
for link_data in result.links.get('internal', []):
link = link_data.get('href', '')
if link:
                                        # Normalize the URL and drop any #fragment so
                                        # anchor links aren't queued as separate pages
                                        full_url, _ = urldefrag(urljoin(base_url, link))
# Only follow links within the same domain
if full_url.startswith(base_url):
if full_url not in visited:
to_visit.append((full_url, depth + 1))
# Be nice to the server
await asyncio.sleep(0.5)
except Exception as e:
logger.warning(f"Failed to crawl {current_url}: {e}")
continue
urls = list(discovered_urls)[:self.max_urls]
return DiscoveryResult(
urls=urls,
method="crawl",
total_found=len(discovered_urls),
filtered_count=len(urls)
)
except Exception as e:
logger.error(f"Crawl4AI discovery failed: {e}")
return DiscoveryResult(
urls=[],
method="crawl",
total_found=0,
filtered_count=0,
error=str(e)
)
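# Minimal manual smoke test; a sketch rather than part of the module's API. The URL below
# is an illustrative placeholder and the limits are arbitrary; point it at a real docs site
# to exercise the discovery strategies end to end.
if __name__ == "__main__":
    async def _demo() -> None:
        discoverer = URLDiscoverer(timeout=15, max_crawl_depth=2, max_urls=50)
        result = await discoverer.discover("https://docs.example.com/*", method="auto")
        print(f"method={result.method}, total_found={result.total_found}, kept={result.filtered_count}")
        if result.error:
            print(f"error: {result.error}")
        for url in result.urls[:10]:
            print(url)

    logging.basicConfig(level=logging.INFO)
    asyncio.run(_demo())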