"""Rate limiting for polite web crawling."""
import asyncio
import time
from collections import defaultdict
from urllib.parse import urlparse


class RateLimiter:
    """Rate limiter to control request frequency per domain."""

    def __init__(self, default_delay: float = 1.0):
        """
        Initialize the rate limiter.

        Args:
            default_delay: Default delay between requests in seconds (default: 1.0)
        """
        self.default_delay = default_delay
        self._last_request_time: dict[str, float] = defaultdict(float)
        self._domain_delays: dict[str, float] = {}
        # Locks are created lazily, one per domain, on first access.
        self._locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
    def set_domain_delay(self, domain: str, delay: float) -> None:
        """
        Set a custom delay for a specific domain.

        A typical source for per-domain values is the Crawl-delay directive
        in robots.txt; a sketch of that wiring follows the class.

        Args:
            domain: The domain (e.g., "example.com")
            delay: Delay in seconds between requests to this domain
        """
        self._domain_delays[domain] = delay
    def _get_domain(self, url: str) -> str:
        """
        Extract domain from URL.

        Args:
            url: The URL to extract domain from

        Returns:
            The domain (netloc), lowercased so that "Example.com" and
            "example.com" share one rate bucket
        """
        parsed = urlparse(url)
        # Hostnames are case-insensitive; normalize so casing differences
        # do not create separate rate-limit buckets.
        return parsed.netloc.lower()
    def _get_delay(self, domain: str) -> float:
        """
        Get the delay for a specific domain.

        Args:
            domain: The domain

        Returns:
            Delay in seconds
        """
        return self._domain_delays.get(domain, self.default_delay)
    async def acquire(self, url: str) -> None:
        """
        Acquire permission to make a request, waiting if necessary.

        Args:
            url: The URL to request
        """
        domain = self._get_domain(url)
        delay = self._get_delay(domain)

        # Hold this domain's lock so concurrent tasks serialize their waits
        # instead of all sleeping and then firing at once.
        async with self._locks[domain]:
            # time.monotonic() is immune to wall-clock adjustments (NTP,
            # daylight saving), which matters when measuring intervals.
            last_time = self._last_request_time[domain]
            current_time = time.monotonic()
            time_since_last = current_time - last_time

            if time_since_last < delay:
                # Too soon since the last request; sleep off the remainder.
                wait_time = delay - time_since_last
                await asyncio.sleep(wait_time)

            # Record when this request was released.
            self._last_request_time[domain] = time.monotonic()
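

# --- Optional integration sketch (assumption, not part of the original API) --
# set_domain_delay() needs per-domain values from somewhere; for a polite
# crawler, one common source is the Crawl-delay directive in robots.txt. The
# helper below is a minimal sketch using the standard-library
# urllib.robotparser; the name apply_crawl_delay is hypothetical, and the
# blocking read() would be moved off the event loop in a real async crawler.
from urllib.robotparser import RobotFileParser


def apply_crawl_delay(limiter: RateLimiter, domain: str, user_agent: str = "*") -> None:
    """Register a domain's robots.txt Crawl-delay with the limiter, if present."""
    parser = RobotFileParser(f"https://{domain}/robots.txt")
    parser.read()  # synchronous HTTP fetch of robots.txt
    delay = parser.crawl_delay(user_agent)  # None when no directive applies
    if delay is not None:
        limiter.set_domain_delay(domain, float(delay))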
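

# --- Usage sketch (illustrative only) -----------------------------------------
# A minimal demonstration of the limiter: three requests across two domains,
# with an assumed 0.5 s override for example.com. The URLs are placeholders;
# real fetching (e.g. with an HTTP client) would happen where print() is.
async def _demo() -> None:
    limiter = RateLimiter(default_delay=1.0)
    limiter.set_domain_delay("example.com", 0.5)  # assumed override for the demo

    urls = [
        "https://example.com/a",
        "https://example.com/b",  # waits ~0.5 s after /a
        "https://other.org/x",    # separate domain, no wait
    ]
    start = time.monotonic()
    for url in urls:
        await limiter.acquire(url)
        print(f"+{time.monotonic() - start:.2f}s  {url}")


if __name__ == "__main__":
    asyncio.run(_demo())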