Broken Link Checker MCP Server

by davinoishi
robots_parser.py (3.48 kB)
"""Parse and respect robots.txt files.""" import logging from urllib.parse import urljoin, urlparse from urllib.robotparser import RobotFileParser import httpx logger = logging.getLogger(__name__) class RobotsParser: """Parse and check robots.txt compliance.""" def __init__(self, user_agent: str = "BrokenLinkChecker/1.0"): """ Initialize the robots.txt parser. Args: user_agent: The user agent string to use for robots.txt checks """ self.user_agent = user_agent self._parsers: dict[str, RobotFileParser] = {} def _get_robots_url(self, url: str) -> str: """ Get the robots.txt URL for a given URL. Args: url: Any URL on the domain Returns: The robots.txt URL for that domain """ parsed = urlparse(url) robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt" return robots_url async def _fetch_and_parse(self, domain: str) -> RobotFileParser: """ Fetch and parse robots.txt for a domain. Args: domain: The domain URL Returns: Parsed RobotFileParser instance """ robots_url = self._get_robots_url(domain) parser = RobotFileParser() parser.set_url(robots_url) try: async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get(robots_url) if response.status_code == 200: # Parse the robots.txt content lines = response.text.splitlines() parser.parse(lines) logger.info(f"Successfully parsed robots.txt for {domain}") else: logger.info( f"No robots.txt found for {domain} (status {response.status_code}), allowing all" ) # If no robots.txt, allow everything parser.parse([]) except Exception as e: logger.warning(f"Failed to fetch robots.txt for {domain}: {e}, allowing all") # On error, allow everything parser.parse([]) return parser async def can_fetch(self, url: str) -> bool: """ Check if the URL can be fetched according to robots.txt. Args: url: The URL to check Returns: True if the URL can be fetched, False otherwise """ parsed = urlparse(url) domain = f"{parsed.scheme}://{parsed.netloc}" # Get or create parser for this domain if domain not in self._parsers: self._parsers[domain] = await self._fetch_and_parse(domain) parser = self._parsers[domain] return parser.can_fetch(self.user_agent, url) async def get_crawl_delay(self, url: str) -> float | None: """ Get the crawl delay specified in robots.txt for this domain. Args: url: Any URL on the domain Returns: Crawl delay in seconds, or None if not specified """ parsed = urlparse(url) domain = f"{parsed.scheme}://{parsed.netloc}" # Get or create parser for this domain if domain not in self._parsers: self._parsers[domain] = await self._fetch_and_parse(domain) parser = self._parsers[domain] delay = parser.crawl_delay(self.user_agent) return delay


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/davinoishi/BLC-ground'
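The same endpoint can be called from Python, for example with httpx (the HTTP client already used by this server). This is a sketch, assuming the endpoint returns JSON; the URL is the one from the curl example above.

```python
# Sketch: query the MCP directory API for this server with httpx.
# Assumes the endpoint returns a JSON document.
import httpx

response = httpx.get("https://glama.ai/api/mcp/v1/servers/davinoishi/BLC-ground")
response.raise_for_status()
print(response.json())
```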

If you have feedback or need assistance with the MCP directory API, please join our Discord server.