"""Validate links by checking their HTTP status."""
import asyncio
import logging
from typing import Literal
import httpx
logger = logging.getLogger(__name__)
class LinkValidator:
    """Validate links by performing HTTP requests.

    HEAD is tried first for efficiency, falling back to GET when the
    server rejects HEAD (405 or 501). A link is "Good" when its final
    status code is 2xx or 3xx; timeouts, connection/DNS failures,
    invalid URLs, and 4xx/5xx responses yield "Bad".
    """

    def __init__(
        self,
        timeout: float = 10.0,
        max_redirects: int = 10,
        max_concurrency: int = 20,
    ):
        """
        Initialize the link validator.

        Args:
            timeout: Request timeout in seconds (default: 10.0)
            max_redirects: Maximum number of redirects to follow (default: 10)
            max_concurrency: Cap on simultaneous requests made by
                validate_links (default: 20)
        """
        self.timeout = timeout
        self.max_redirects = max_redirects
        self.max_concurrency = max_concurrency

    def _make_client(self) -> httpx.AsyncClient:
        """Build an AsyncClient configured with this validator's settings."""
        return httpx.AsyncClient(
            follow_redirects=True,
            max_redirects=self.max_redirects,
            timeout=self.timeout,
        )

    async def validate_link(
        self, url: str, client: "httpx.AsyncClient | None" = None
    ) -> Literal["Good", "Bad"]:
        """
        Validate a single link.

        Args:
            url: The URL to validate
            client: Optional shared AsyncClient to reuse connections;
                when None, a temporary client is created for this call.

        Returns:
            "Good" if the link is valid (2xx or 3xx), "Bad" otherwise
        """
        if client is not None:
            return await self._check(client, url)
        # No shared client supplied: create (and close) one for this call.
        async with self._make_client() as owned_client:
            return await self._check(owned_client, url)

    async def _check(self, client: httpx.AsyncClient, url: str) -> Literal["Good", "Bad"]:
        """Run the HEAD-with-GET-fallback check for *url* on *client*."""
        try:
            # Use HEAD request first for efficiency
            try:
                response = await client.head(url)
            except (httpx.UnsupportedProtocol, httpx.InvalidURL):
                # Malformed or non-HTTP(S) URL: nothing further to try.
                return "Bad"
            # Some servers reject HEAD (405 Method Not Allowed or
            # 501 Not Implemented) but serve the resource via GET.
            if response.status_code in (405, 501):
                response = await client.get(url)
            # 2xx and 3xx are considered good
            # (3xx shouldn't happen with follow_redirects=True, but just in case)
            if 200 <= response.status_code < 400:
                return "Good"
            logger.debug("Link %s returned status %s", url, response.status_code)
            return "Bad"
        except httpx.TimeoutException:
            logger.debug("Link %s timed out after %ss", url, self.timeout)
            return "Bad"
        except httpx.ConnectError:
            logger.debug("Link %s failed to connect (DNS or connection error)", url)
            return "Bad"
        except httpx.HTTPError as e:
            logger.debug("Link %s failed with HTTP error: %s", url, e)
            return "Bad"
        except Exception as e:
            # Best-effort boundary: never let one bad link crash a batch.
            logger.debug("Link %s failed with unexpected error: %s", url, e)
            return "Bad"

    async def validate_links(self, urls: list[str]) -> dict[str, Literal["Good", "Bad"]]:
        """
        Validate multiple links concurrently.

        A single client is shared across all requests (connection reuse)
        and a semaphore caps in-flight requests at max_concurrency.

        Args:
            urls: List of URLs to validate

        Returns:
            Dictionary mapping URLs to their validation status
        """
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async with self._make_client() as client:
            async def bounded(url: str) -> Literal["Good", "Bad"]:
                # Limit concurrency to avoid exhausting sockets/FDs.
                async with semaphore:
                    return await self._check(client, url)

            # gather preserves input order, so zip pairs URL with its result.
            results = await asyncio.gather(*(bounded(u) for u in urls))
        return dict(zip(urls, results))