search_url

Retrieve plain-text content from any web page URL with robust error handling. An optional timeout (1–60 seconds, default 10) limits how long the fetch may take.

Instructions

Fetch and return plain-text content from a web page URL. Handles various content types and provides comprehensive error handling.
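
As a usage illustration, the sketch below calls the tool over MCP from Python. It assumes the server runs as a local stdio process; the launch command shown (`websurfer-mcp`) and the target URL are placeholders, not taken from this page.

    import asyncio

    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    async def main() -> None:
        # Placeholder launch command; point this at however the server is installed.
        params = StdioServerParameters(command="websurfer-mcp", args=[])

        async with stdio_client(params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                # Only the two schema parameters are passed: required url, optional timeout.
                result = await session.call_tool(
                    "search_url",
                    {"url": "https://example.com", "timeout": 15},
                )
                # The handler always returns TextContent items.
                for item in result.content:
                    print(item.text)

    asyncio.run(main())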

Input Schema

Name     Required  Description                                                      Default
timeout  No        Optional timeout in seconds                                      10
url      Yes       The URL to fetch content from. Must be a valid HTTP/HTTPS URL.   —

Implementation Reference

  • The main handler function for the 'search_url' tool. It validates the URL, extracts text content using TextExtractor, formats the response with metadata, and handles errors.
    @self.server.call_tool()
    async def handle_call_tool(name: str, arguments: dict) -> list[TextContent]:
        """Handle tool calls."""
        if name != "search_url":
            raise ValueError(f"Unknown tool: {name}")

        url = arguments.get("url")
        timeout = arguments.get("timeout", self.config.default_timeout)

        if not url:
            return [TextContent(
                type="text",
                text="Error: URL parameter is required"
            )]

        try:
            # Validate URL
            validation_result = self.url_validator.validate(url)
            if not validation_result.is_valid:
                return [TextContent(
                    type="text",
                    text=f"Error: Invalid URL - {validation_result.error_message}"
                )]

            # Get extractor and extract text content
            extractor = await self._get_extractor()
            result = await extractor.extract_text(
                url,
                timeout=timeout,
                user_agent=self.config.user_agent
            )

            if result.success:
                # Prepare response with metadata
                response_text = f"Content from: {url}\n"
                response_text += f"Content-Type: {result.content_type}\n"
                response_text += f"Status Code: {result.status_code}\n"
                if result.title:
                    response_text += f"Title: {result.title}\n"
                response_text += "\n--- Content ---\n"
                response_text += result.text_content

                return [TextContent(
                    type="text",
                    text=response_text
                )]
            else:
                return [TextContent(
                    type="text",
                    text=f"Error fetching content from {url}: {result.error_message}"
                )]

        except Exception as e:
            logger.exception(f"Unexpected error processing URL {url}")
            return [TextContent(
                type="text",
                text=f"Unexpected error: {str(e)}"
            )]
  • The input schema defining the parameters for the 'search_url' tool: required 'url' string and optional 'timeout' number.
    inputSchema={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "The URL to fetch content from. Must be a valid HTTP/HTTPS URL."
            },
            "timeout": {
                "type": "number",
                "description": "Optional timeout in seconds (default: 10)",
                "default": 10,
                "minimum": 1,
                "maximum": 60
            }
        },
        "required": ["url"]
    }
  • Registration of the 'search_url' tool in the list_tools handler, including name, description, and schema.
        Tool(
            name="search_url",
            description="Fetch and return plain-text content from a web page URL. "
                        "Handles various content types and provides comprehensive error handling.",
            inputSchema={
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL to fetch content from. Must be a valid HTTP/HTTPS URL."
                    },
                    "timeout": {
                        "type": "number",
                        "description": "Optional timeout in seconds (default: 10)",
                        "default": 10,
                        "minimum": 1,
                        "maximum": 60
                    }
                },
                "required": ["url"]
            }
        )
    ]
  • Core helper that performs the HTTP request with aiohttp, applies rate limiting, checks status codes, content type, and size, and extracts readable text using trafilatura or BeautifulSoup. (A hedged sketch of the rate-limiting helper it calls appears after this list.)
    async def extract_text(self, url: str, timeout: int = 10, user_agent: str = None) -> ExtractionResult:
        """
        Extract text content from a URL.

        Args:
            url: The URL to fetch content from
            timeout: Request timeout in seconds
            user_agent: User agent string for the request

        Returns:
            ExtractionResult with extracted content or error details
        """
        # Apply rate limiting
        if not self._check_rate_limit():
            return ExtractionResult(
                success=False,
                error_message="Rate limit exceeded. Please try again later.",
                url=url
            )

        # Record request time for rate limiting
        self.request_times.append(time.time())

        try:
            # Ensure session exists
            await self._ensure_session(timeout)

            headers = {
                'User-Agent': user_agent or 'MCP-URL-Search-Server/1.0.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.5',
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }

            async with self.session.get(url, headers=headers) as response:
                status_code = response.status
                content_type = response.headers.get('content-type', '').lower()

                # Check status code
                if status_code == 404:
                    return ExtractionResult(
                        success=False,
                        error_message="Page not found (404)",
                        status_code=status_code,
                        url=url
                    )
                elif status_code == 403:
                    return ExtractionResult(
                        success=False,
                        error_message="Access forbidden (403)",
                        status_code=status_code,
                        url=url
                    )
                elif status_code == 429:
                    return ExtractionResult(
                        success=False,
                        error_message="Too many requests (429). Please try again later.",
                        status_code=status_code,
                        url=url
                    )
                elif status_code >= 400:
                    return ExtractionResult(
                        success=False,
                        error_message=f"HTTP error {status_code}",
                        status_code=status_code,
                        url=url
                    )

                # Check content type
                if not self._is_supported_content_type(content_type):
                    return ExtractionResult(
                        success=False,
                        error_message=f"Unsupported content type: {content_type}",
                        content_type=content_type,
                        status_code=status_code,
                        url=url
                    )

                # Check content length
                content_length = response.headers.get('content-length')
                if content_length and int(content_length) > 10 * 1024 * 1024:  # 10MB
                    return ExtractionResult(
                        success=False,
                        error_message="Content too large (>10MB)",
                        content_type=content_type,
                        status_code=status_code,
                        url=url
                    )

                # Read content
                try:
                    content = await response.text()
                except UnicodeDecodeError as e:
                    return ExtractionResult(
                        success=False,
                        error_message=f"Failed to decode content: {str(e)}",
                        content_type=content_type,
                        status_code=status_code,
                        url=url
                    )

                # Extract text content
                extracted_text, title = self._extract_text_content(content, content_type)

                if not extracted_text:
                    return ExtractionResult(
                        success=False,
                        error_message="No readable text content found",
                        content_type=content_type,
                        status_code=status_code,
                        url=url
                    )

                return ExtractionResult(
                    success=True,
                    text_content=extracted_text,
                    title=title,
                    content_type=content_type,
                    status_code=status_code,
                    url=url
                )

        except asyncio.TimeoutError:
            return ExtractionResult(
                success=False,
                error_message=f"Request timed out after {timeout} seconds",
                url=url
            )
        except aiohttp.ClientError as e:
            return ExtractionResult(
                success=False,
                error_message=f"Network error: {str(e)}",
                url=url
            )
        except Exception as e:
            logger.exception(f"Unexpected error extracting text from {url}")
            return ExtractionResult(
                success=False,
                error_message=f"Unexpected error: {str(e)}",
                url=url
            )
  • Helper function for URL validation: checks format, scheme (HTTP/HTTPS only), blocked domains and private IP ranges, and overall length. (An illustrative set of blocked-IP patterns appears after this list.)
    def validate(self, url: str) -> ValidationResult:
        """
        Validate a URL for safety and accessibility.

        Args:
            url: The URL to validate

        Returns:
            ValidationResult with validation status and details
        """
        if not url:
            return ValidationResult(
                is_valid=False,
                error_message="URL cannot be empty"
            )

        # Basic format validation
        if not isinstance(url, str):
            return ValidationResult(
                is_valid=False,
                error_message="URL must be a string"
            )

        # Normalize URL (add protocol if missing)
        normalized_url = self._normalize_url(url)

        # Use validators library for basic validation
        if not validators.url(normalized_url):
            return ValidationResult(
                is_valid=False,
                error_message="Invalid URL format"
            )

        # Parse URL for detailed validation
        try:
            parsed = urlparse(normalized_url)
        except Exception as e:
            return ValidationResult(
                is_valid=False,
                error_message=f"Failed to parse URL: {str(e)}"
            )

        # Check scheme
        if parsed.scheme.lower() not in {'http', 'https'}:
            if parsed.scheme.lower() in self.blocked_schemes:
                return ValidationResult(
                    is_valid=False,
                    error_message=f"Blocked scheme: {parsed.scheme}"
                )
            else:
                return ValidationResult(
                    is_valid=False,
                    error_message=f"Unsupported scheme: {parsed.scheme}. Only HTTP and HTTPS are allowed."
                )

        # Check for blocked domains
        hostname = parsed.hostname
        if hostname:
            hostname_lower = hostname.lower()

            # Check blocked domains
            if hostname_lower in self.blocked_domains:
                return ValidationResult(
                    is_valid=False,
                    error_message=f"Access to {hostname} is not allowed"
                )

            # Check blocked IP ranges
            for pattern in self.blocked_ip_patterns:
                if re.match(pattern, hostname_lower):
                    return ValidationResult(
                        is_valid=False,
                        error_message="Access to private IP ranges is not allowed"
                    )

        # Check URL length
        if len(normalized_url) > 2048:
            return ValidationResult(
                is_valid=False,
                error_message="URL is too long (max 2048 characters)"
            )

        return ValidationResult(
            is_valid=True,
            normalized_url=normalized_url
        )
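
The extractor above appends to `self.request_times` and gates requests through `_check_rate_limit`, but that helper is not shown on this page. The following standalone sketch shows one plausible sliding-window implementation; the class name and per-minute budget are illustrative assumptions, not taken from the source.

    import time

    class SlidingWindowRateLimiter:
        """Illustrative stand-in for the extractor's rate limiting: allow at most
        max_per_minute requests in any rolling 60-second window."""

        def __init__(self, max_per_minute: int = 30) -> None:
            self.max_per_minute = max_per_minute  # assumed budget, not from the source
            self.request_times: list[float] = []

        def check(self) -> bool:
            # Discard timestamps older than 60 seconds, then compare the
            # remaining count against the per-minute budget.
            cutoff = time.time() - 60
            self.request_times = [t for t in self.request_times if t > cutoff]
            return len(self.request_times) < self.max_per_minute

        def record(self) -> None:
            # Mirrors self.request_times.append(time.time()) in extract_text.
            self.request_times.append(time.time())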
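Similarly, `validate` consults `self.blocked_ip_patterns` to reject private address ranges without listing the patterns here. The snippet below shows an illustrative pattern set and check in that spirit; the exact patterns used by the server are not given on this page.

    import re

    # Illustrative patterns (not from the source) covering common private ranges.
    BLOCKED_IP_PATTERNS = [
        r"^127\.",                       # loopback
        r"^10\.",                        # RFC 1918 10.0.0.0/8
        r"^192\.168\.",                  # RFC 1918 192.168.0.0/16
        r"^172\.(1[6-9]|2\d|3[01])\.",   # RFC 1918 172.16.0.0/12
        r"^169\.254\.",                  # link-local
    ]

    def is_blocked_host(hostname: str) -> bool:
        """Return True if the hostname matches any blocked pattern."""
        host = hostname.lower()
        return any(re.match(pattern, host) for pattern in BLOCKED_IP_PATTERNS)

    # Example: private addresses are rejected, public hostnames pass.
    assert is_blocked_host("192.168.1.10")
    assert not is_blocked_host("example.com")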
