URL Text Fetcher MCP Server

by billallison

fetch_page_links

Extract all links from a web page by providing its URL. This tool helps identify and collect hyperlinks for web scraping, content analysis, or navigation purposes.

Instructions

Return a list of all links on the page.

Args:
    url: The URL to fetch links from

Input Schema

Name | Required | Description                 | Default
url  | Yes      | The URL to fetch links from | -
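
A client supplies the url argument through a normal MCP tool call. The sketch below shows one way to invoke the tool using the official MCP Python SDK over stdio; the launch command, package name, and example URL are assumptions for illustration, not taken from this listing.

    import asyncio

    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    async def main() -> None:
        # Hypothetical launch command; adjust to however this server is installed.
        params = StdioServerParameters(command="uvx", args=["url-text-fetcher-mcp"])
        async with stdio_client(params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                result = await session.call_tool(
                    "fetch_page_links", {"url": "https://example.com"}
                )
                # result.content holds the formatted link list returned by the handler
                print(result.content)

    asyncio.run(main())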

Implementation Reference

  • Primary implementation of the fetch_page_links tool handler. It downloads the page HTML with a streaming, size-capped request, parses it with BeautifulSoup, collects the href attribute of every anchor tag, keeps only absolute http/https URLs and root-relative paths, and returns a formatted list capped at the first 100 links.
    @mcp.tool()
    async def fetch_page_links(url: str) -> str:
        """Return a list of all links on the page.
        
        Args:
            url: The URL to fetch links from
        """
        # Sanitize URL input
        url = sanitize_url(url)
        if not url:
            return "Error: Invalid URL format"
        
        # Validate URL safety
        if not is_safe_url(url):
            logger.warning(f"Blocked unsafe URL for link fetching: {url}")
            return "Error: URL not allowed for security reasons"
            
        try:
            logger.info(f"Fetching page links: {url}")
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, stream=True)
            resp.raise_for_status()
            
            # Check content length
            content_length = resp.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_RESPONSE_SIZE:
                return f"Error: Page too large ({content_length} bytes)"
            
            # Read content with size limit
            content_chunks = []
            total_size = 0
            
            for chunk in resp.iter_content(chunk_size=8192, decode_unicode=True):
                if chunk:
                    total_size += len(chunk)
                    if total_size > MAX_RESPONSE_SIZE:
                        return "Error: Page content too large"
                    content_chunks.append(chunk)
            
            html_content = ''.join(content_chunks)
            soup = BeautifulSoup(html_content, "html.parser")
            links = [a.get('href') for a in soup.find_all('a', href=True) if a.get('href')]
            
            # Filter and clean links
            valid_links = []
            for link in links:
                if link.startswith(('http://', 'https://', '/')):
                    valid_links.append(link)
            
            shown = valid_links[:100]  # cap the output at the first 100 links
            links_text = "\n".join(f"- {link}" for link in shown)
            
            return f"Links found on {url} ({len(valid_links)} total, showing {len(shown)}):\n\n{links_text}"
            
        except requests.RequestException as e:
            logger.error(f"Request failed for {url}: {e}")
            return "Error: Unable to fetch page"
        except Exception as e:
            logger.error(f"Unexpected error fetching links from {url}: {e}", exc_info=True)
            return "Error: Unable to process page"
  • Alternative implementation of the fetch_page_links tool handler for FastMCP, using a Pydantic Field to describe the input schema. Its body is identical to the primary async handler above, so only the differing signature is shown here.
    @mcp.tool()
    def fetch_page_links(url: str = Field(description="The URL to fetch links from")) -> str:
        """Return a list of all links on the page"""
        # ... body identical to the primary async handler above:
        # sanitize_url / is_safe_url checks, streamed download with size caps,
        # BeautifulSoup extraction of href attributes, link filtering,
        # and the 100-link output limit.
  • fetch_page_links is also listed as an available tool in the output of get_server_info, which doubles as informal registration documentation.
    @mcp.tool()
    async def get_server_info() -> str:
        """Get information about this MCP server including version, implementation, and capabilities.
        
        Returns:
            Server information including version, implementation type, and available features
        """
        info = [
            f"URL Text Fetcher MCP Server",
            f"Version: {__version__}",
            f"Implementation: {__implementation__}",
            f"Brave Search Rate Limit: {BRAVE_RATE_LIMIT_RPS} requests/second",
            f"Request Timeout: {REQUEST_TIMEOUT} seconds",
            f"Content Limit: {CONTENT_LENGTH_LIMIT:,} characters",
            f"Max Response Size: {MAX_RESPONSE_SIZE:,} bytes",
            "",
            "Available Tools:",
            "• fetch_url_text - Download visible text from any URL",
            "• fetch_page_links - Extract all links from a webpage", 
            "• brave_search_and_fetch - Search web and fetch content from top results",
            "• test_brave_search - Test Brave Search API connectivity",
            "• get_server_info - Display this server information",
            "",
            "Security Features:",
            "• SSRF protection against internal network access",
            "• Input sanitization for URLs and search queries",
            "• Content size limiting and memory protection",
            "• Thread-safe rate limiting for API requests",
            "",
            f"Brave API Key: {'✓ Configured' if BRAVE_API_KEY else '✗ Missing'}"
        ]
        
        return "\n".join(info)
