"""URL helper utilities for the Crawl4AI MCP server."""
from urllib.parse import urldefrag, urlparse
from xml.etree import ElementTree as ET
import httpx
from src.core.exceptions import NetworkError
from src.core.logging import logger
def is_sitemap(url: str) -> bool:
"""
Check if a URL is a sitemap.
Args:
url: URL to check
Returns:
True if the URL ends with "sitemap.xml" or contains "sitemap" in its path, False otherwise
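Example (doctest-style; the URLs are illustrative):
    >>> is_sitemap("https://example.com/sitemap.xml")
    True
    >>> is_sitemap("https://example.com/sitemap_index.xml")
    True
    >>> is_sitemap("https://example.com/about")
    False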
"""
return url.endswith("sitemap.xml") or "sitemap" in urlparse(url).path
def is_txt(url: str) -> bool:
"""
Check if a URL is a text file.
Args:
url: URL to check
Returns:
True if the URL is a text file, False otherwise
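Example (doctest-style; the URLs are illustrative):
    >>> is_txt("https://example.com/llms.txt")
    True
    >>> is_txt("https://example.com/index.html")
    False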
"""
return url.endswith(".txt")
def parse_sitemap_content(xml_content: str) -> list[str]:
"""
Parse sitemap XML content and extract URLs.
This function parses XML content directly without making HTTP requests,
making it suitable for use in async contexts where the content has
already been fetched.
Args:
xml_content: XML content string
Returns:
List of URLs found in the sitemap
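Example (doctest-style; a minimal one-URL sitemap):
    >>> xml = (
    ...     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    ...     '<url><loc>https://example.com/page</loc></url>'
    ...     '</urlset>'
    ... )
    >>> parse_sitemap_content(xml)
    ['https://example.com/page']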
"""
try:
tree = ET.fromstring(xml_content)
urls = [loc.text for loc in tree.findall(".//{*}loc")]
return [url for url in urls if url] # Filter None values
except ET.ParseError as e:
logger.error(f"XML parse error in sitemap: {e}")
return []
except Exception as e:
logger.error(f"Unexpected error parsing sitemap XML: {e}")
return []
def parse_sitemap(sitemap_url: str) -> list[str]:
"""
Parse a sitemap from a URL and extract the URLs it lists (synchronous, blocking).
Note: This function blocks the event loop. For async code, fetch the
sitemap content with httpx.AsyncClient and use parse_sitemap_content().
Args:
sitemap_url: URL of the sitemap
Returns:
List of URLs found in the sitemap
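Example (illustrative sketch of the async alternative described above;
assumes the caller manages its own httpx.AsyncClient and omits error
handling for brevity):
    async with httpx.AsyncClient() as client:
        resp = await client.get(sitemap_url)
        urls = parse_sitemap_content(resp.text) if resp.status_code == 200 else []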
"""
try:
# Using httpx sync client (still blocks, but consistent with async version)
with httpx.Client() as client:
resp = client.get(sitemap_url)
if resp.status_code == 200:
return parse_sitemap_content(resp.text)
except httpx.HTTPError as e:
logger.error(f"HTTP error fetching sitemap: {e}")
except NetworkError as e:
logger.error(f"Network error fetching sitemap: {e}")
except Exception as e:
logger.error(f"Unexpected error fetching sitemap: {e}")
return []
def normalize_url(url: str) -> str:
"""
Normalize a URL by removing the fragment.
Args:
url: URL to normalize
Returns:
Normalized URL without fragment
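Example (doctest-style; the URL is illustrative):
    >>> normalize_url("https://example.com/page#section-2")
    'https://example.com/page'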
"""
return urldefrag(url)[0]
def sanitize_url_for_logging(url: str) -> str:
"""
Sanitize a URL for safe logging by removing sensitive information.
This function strips all query parameters and fragments (which may carry
authentication tokens or API keys) and masks paths that contain common
sensitive keywords before the URL is logged. Note that credentials embedded
in the netloc (user:password@host) are not removed.
Args:
url: URL to sanitize
Returns:
Sanitized URL safe for logging
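Example (doctest-style; the URLs and parameter names are illustrative):
    >>> sanitize_url_for_logging("https://example.com/data?api_key=abc123")
    'https://example.com/data?[PARAMS_REMOVED]'
    >>> sanitize_url_for_logging("https://example.com/auth/callback")
    'https://example.com/[SENSITIVE_PATH]'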
"""
if not url:
return ""
try:
parsed = urlparse(url)
# Build sanitized URL without query parameters and fragments
sanitized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
# If there were query parameters, indicate that they were removed
if parsed.query:
sanitized += "?[PARAMS_REMOVED]"
# If there was a fragment, indicate that it was removed
if parsed.fragment:
sanitized += "#[FRAGMENT_REMOVED]"
# Additional check for common auth patterns in the URL path
if any(
sensitive in parsed.path.lower()
for sensitive in ["token", "key", "auth", "secret", "password"]
):
# Replace the path with a generic message
sanitized = f"{parsed.scheme}://{parsed.netloc}/[SENSITIVE_PATH]"
return sanitized
except ValueError as e:
# URL parsing validation error
logger.debug(f"Invalid URL for sanitization: {e}")
return "[INVALID_URL]"
except Exception:
# If parsing fails for any other reason, return a generic placeholder
return "[INVALID_URL]"
def clean_url(url: str) -> str:
"""
Clean and normalize a URL for processing.
This function:
- Strips whitespace
- Removes quotes
- Validates that the URL starts with http:// or https:// (otherwise returns an empty string)
Args:
url: URL to clean
Returns:
Cleaned URL or empty string if invalid
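Example (doctest-style; the URLs are illustrative):
    >>> clean_url('  "https://example.com/page"  ')
    'https://example.com/page'
    >>> clean_url("ftp://example.com/file")
    ''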
"""
if not url:
return ""
# Strip whitespace and quotes
cleaned = url.strip().strip("\"'")
# Basic validation - must start with http:// or https://
if not cleaned.startswith(("http://", "https://")):
return ""
return cleaned
def extract_domain_from_url(url: str) -> str | None:
"""
Extract the domain from a URL, for use as a source identifier.
Examples:
- "https://example.com/path" -> "example.com"
- "https://www.example.com/path" -> "example.com"
- "https://subdomain.example.com/path" -> "subdomain.example.com"
- Invalid URL -> None
Args:
url: URL to extract domain from
Returns:
Domain string or None if extraction fails
"""
if not url:
return None
try:
parsed = urlparse(url)
if not parsed.netloc:
return None
domain = parsed.netloc.lower()
# Remove 'www.' prefix if present
if domain.startswith("www."):
domain = domain[4:]
return domain
except ValueError as e:
# URL parsing validation error
logger.debug(f"Invalid URL for domain extraction: {e}")
return None
except Exception:
return None