"""Parse and respect robots.txt files."""
import logging
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import httpx
logger = logging.getLogger(__name__)
class RobotsParser:
"""Parse and check robots.txt compliance."""
def __init__(self, user_agent: str = "BrokenLinkChecker/1.0"):
"""
Initialize the robots.txt parser.
Args:
user_agent: The user agent string to use for robots.txt checks
"""
self.user_agent = user_agent
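        # Cache of parsed robots.txt rules, keyed by base URL ("scheme://netloc")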
        self._parsers: dict[str, RobotFileParser] = {}

    def _get_robots_url(self, url: str) -> str:
"""
Get the robots.txt URL for a given URL.
Args:
url: Any URL on the domain
Returns:
The robots.txt URL for that domain
"""
parsed = urlparse(url)
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        return robots_url

    async def _fetch_and_parse(self, domain: str) -> RobotFileParser:
"""
Fetch and parse robots.txt for a domain.
Args:
domain: The domain URL
Returns:
Parsed RobotFileParser instance
"""
robots_url = self._get_robots_url(domain)
parser = RobotFileParser()
parser.set_url(robots_url)
try:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(robots_url)
if response.status_code == 200:
# Parse the robots.txt content
lines = response.text.splitlines()
parser.parse(lines)
logger.info(f"Successfully parsed robots.txt for {domain}")
else:
                    logger.info(
                        f"robots.txt not available for {domain} (status {response.status_code}), allowing all"
                    )
                    # Treat a missing or inaccessible robots.txt as allowing everything
parser.parse([])
except Exception as e:
logger.warning(f"Failed to fetch robots.txt for {domain}: {e}, allowing all")
# On error, allow everything
parser.parse([])
        return parser

    async def can_fetch(self, url: str) -> bool:
"""
Check if the URL can be fetched according to robots.txt.
Args:
url: The URL to check
Returns:
True if the URL can be fetched, False otherwise
"""
parsed = urlparse(url)
domain = f"{parsed.scheme}://{parsed.netloc}"
# Get or create parser for this domain
if domain not in self._parsers:
self._parsers[domain] = await self._fetch_and_parse(domain)
parser = self._parsers[domain]
        return parser.can_fetch(self.user_agent, url)

    async def get_crawl_delay(self, url: str) -> float | None:
"""
Get the crawl delay specified in robots.txt for this domain.
Args:
url: Any URL on the domain
Returns:
Crawl delay in seconds, or None if not specified
"""
parsed = urlparse(url)
domain = f"{parsed.scheme}://{parsed.netloc}"
# Get or create parser for this domain
if domain not in self._parsers:
self._parsers[domain] = await self._fetch_and_parse(domain)
parser = self._parsers[domain]
delay = parser.crawl_delay(self.user_agent)
return delay
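

# Illustrative usage sketch (not part of the checker itself): a caller would
# typically consult can_fetch() before requesting a page and respect any
# Crawl-delay. The URL below is a placeholder; substitute your own target.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        robots = RobotsParser(user_agent="BrokenLinkChecker/1.0")
        url = "https://www.python.org/downloads/"
        if await robots.can_fetch(url):
            delay = await robots.get_crawl_delay(url)
            print(f"Allowed to fetch {url} (crawl delay: {delay})")
        else:
            print(f"robots.txt disallows fetching {url}")

    asyncio.run(_demo())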