"""Robots.txt checking utility."""
import logging
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import requests
from ..constants import PROXY_URL
def check_robots_txt(url: str, user_agent: str, use_proxy: bool = True) -> bool:
    """Check whether robots.txt allows *user_agent* to fetch *url*.

    Args:
        url: The URL to check.
        user_agent: User agent string matched against robots.txt rules.
        use_proxy: Whether to route the robots.txt request through PROXY_URL.

    Returns:
        True if access is allowed (or robots.txt is unavailable/unreadable,
        i.e. the check fails open), False if robots.txt disallows the URL.
    """
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

        rp = RobotFileParser()
        rp.set_url(robots_url)

        # RobotFileParser.read() has no proxy support, so fetch manually.
        proxies = None
        if use_proxy:
            proxies = {
                "http": PROXY_URL,
                "https": PROXY_URL,
            }

        try:
            # Plain requests.get: no session state is reused, and it avoids
            # leaking an unclosed Session object. Short timeout on purpose.
            response = requests.get(robots_url, timeout=5, proxies=proxies)
            # Mirror RobotFileParser.read() status-code conventions instead of
            # parsing an HTML error page as if it were robots.txt rules:
            # 401/403 -> disallow all; other 4xx (e.g. 404) -> allow all.
            if response.status_code in (401, 403):
                rp.disallow_all = True
            elif 400 <= response.status_code < 500:
                rp.allow_all = True
            else:
                response.raise_for_status()  # surface 5xx as "unreadable"
                rp.parse(response.text.splitlines())
        except Exception:
            # If unable to read robots.txt, assume allowed
            logging.warning("Cannot read robots.txt: %s, assuming allowed", robots_url)
            return True

        can_fetch = rp.can_fetch(user_agent, url)
        logging.info("Robots.txt check - URL: %s, allowed: %s", url, can_fetch)
        return can_fetch
    except Exception as e:
        logging.error("Robots.txt check failed: %s", e)
        # Fail open: assume allowed on unexpected errors.
        return True