Skip to main content
Glama
billallison

URL Text Fetcher MCP Server

by billallison

fetch_url_text

Extract visible text content from any web URL to access readable information for analysis or processing.

Instructions

Download all visible text from a URL.

Args: url: The URL to fetch text from

Input Schema

Table / JSON Schema

| Name | Required | Description                | Default |
| ---- | -------- | -------------------------- | ------- |
| url  | Yes      | The URL to fetch text from | —       |

Implementation Reference

  • Primary handler function for the 'fetch_url_text' tool. Decorated with @mcp.tool() for registration, sanitizes input URL and delegates to fetch_url_content helper.
    @mcp.tool()
    async def fetch_url_text(url: str) -> str:
        """Download all visible text from a URL.
        
        Args:
            url: The URL to fetch text from

        Returns:
            The extracted visible text prefixed with the source URL, or an
            "Error: ..." message string when the URL is invalid.
        """
        # Local import: only needed for the thread offload below.
        import asyncio

        # Sanitize URL input
        url = sanitize_url(url)
        if not url:
            return "Error: Invalid URL format"
            
        logger.info(f"Fetching URL text: {url}")
        # fetch_url_content uses the blocking `requests` library; run it in a
        # worker thread so the event loop is not stalled for the duration of
        # the HTTP request.
        content = await asyncio.to_thread(fetch_url_content, url)
        
        return f"Text content from {url}:\n\n{content}"
  • Core helper utility that performs the actual URL fetching, security validation (SSRF protection), content streaming with size limits, BeautifulSoup parsing to extract visible text, and truncation.
    def fetch_url_content(url: str) -> str:
        """Helper function to fetch text content from a URL with safety checks.

        Validates the URL against SSRF rules, streams the response while
        enforcing MAX_RESPONSE_SIZE, strips <script>/<style> tags with
        BeautifulSoup, and returns the visible text truncated to
        CONTENT_LENGTH_LIMIT. All failures are reported as "Error: ..."
        strings rather than raised exceptions.
        """
        # Validate URL safety first
        if not is_safe_url(url):
            logger.warning(f"SECURITY: Blocked unsafe URL: {url}")
            return "Error: URL not allowed for security reasons"
        
        try:
            # Log request for monitoring
            logger.info(f"REQUEST: Fetching content from {url}")
            
            # Make request with streaming to check size. Use a context manager
            # so the underlying connection is released even on the early
            # returns below — with stream=True, an unconsumed response would
            # otherwise keep its connection checked out of the pool.
            with requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True) as resp:
                resp.raise_for_status()
                
                # Log response details
                logger.info(f"RESPONSE: {resp.status_code} from {url}, Content-Type: {resp.headers.get('Content-Type', 'unknown')}")
                
                # Fast path: reject before reading when the server declares an
                # oversized body up front.
                content_length = resp.headers.get('Content-Length')
                if content_length and int(content_length) > MAX_RESPONSE_SIZE:
                    logger.warning(f"SECURITY: Content too large: {content_length} bytes for {url}")
                    return f"Error: Content too large ({content_length} bytes, max {MAX_RESPONSE_SIZE})"
        
                # Read content with size limit; the header may be absent or
                # lie, so the cap is enforced again while streaming.
                content_chunks = []
                total_size = 0
                
                try:
                    for chunk in resp.iter_content(chunk_size=8192, decode_unicode=True):
                        if chunk:  # filter out keep-alive new chunks
                            total_size += len(chunk)
                            if total_size > MAX_RESPONSE_SIZE:
                                logger.warning(f"SECURITY: Content exceeded size limit for {url}")
                                return f"Error: Content exceeded size limit ({MAX_RESPONSE_SIZE} bytes)"
                            content_chunks.append(chunk)
                except UnicodeDecodeError:
                    # If we can't decode as text, it's probably binary content
                    logger.warning(f"CONTENT: Unable to decode content as text from {url}")
                    return "Error: Unable to decode content as text"
            
            html_content = ''.join(content_chunks)
            
            # Parse with BeautifulSoup
            soup = BeautifulSoup(html_content, "html.parser")
            
            # Remove script and style elements so only visible text remains
            for script in soup(["script", "style"]):
                script.decompose()
                
            text_content = soup.get_text(separator="\n", strip=True)
            
            # Limit final content length
            if len(text_content) > CONTENT_LENGTH_LIMIT:
                logger.info(f"CONTENT: Truncating content from {url} ({len(text_content)} -> {CONTENT_LENGTH_LIMIT} chars)")
                text_content = text_content[:CONTENT_LENGTH_LIMIT] + "... [Content truncated]"
            
            logger.info(f"SUCCESS: Fetched {len(text_content)} characters from {url}")
            return text_content
            
        except requests.RequestException as e:
            logger.error(f"REQUEST_ERROR: Failed to fetch {url}: {e}")
            return "Error: Unable to fetch URL content"
        except Exception as e:
            logger.error(f"UNEXPECTED_ERROR: Processing {url}: {e}", exc_info=True)
            return "Error: An unexpected error occurred while processing the URL"
    
    def brave_search(query: str, count: int = 10) -> List[dict]:
        """Perform a Brave search and return results with thread-safe rate limiting."""
        if not BRAVE_API_KEY:
  • The tool is listed in the get_server_info tool's output as an available tool, confirming its registration.
        "• fetch_url_text - Download visible text from any URL",
        "• fetch_page_links - Extract all links from a webpage", 
        "• brave_search_and_fetch - Search web and fetch content from top results",
        "• test_brave_search - Test Brave Search API connectivity",
        "• get_server_info - Display this server information",
        "",
        "Security Features:",
        "• SSRF protection against internal network access",
        "• Input sanitization for URLs and search queries",
        "• Content size limiting and memory protection",
        "• Thread-safe rate limiting for API requests",
        "",
        f"Brave API Key: {'✓ Configured' if BRAVE_API_KEY else '✗ Missing'}"
    ]
    
    return "\n".join(info)
  • Alternative implementation with Pydantic Field defining the input schema for the URL parameter.
    def fetch_url_text(url: str = Field(description="The URL to fetch text from")) -> str:
  • Alternative synchronous handler in server_fastmcp.py using Pydantic Field for input validation.
    @mcp.tool()
    def fetch_url_text(url: str = Field(description="The URL to fetch text from")) -> str:
        """Download all visible text from a URL"""
        # Reject anything that does not survive URL sanitization.
        sanitized = sanitize_url(url)
        if not sanitized:
            return "Error: Invalid URL format"
        url = sanitized

        logger.info(f"Fetching URL text: {url}")
        # Delegate the actual fetch/parse work to the shared helper.
        return f"Text content from {url}:\n\n{fetch_url_content(url)}"

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/billallison/brsearch-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.