URL Reputation and Validity Checker

by prismon
history.py (6.97 kB)
"""Domain history and reputation checking.""" import asyncio from datetime import datetime, timezone from typing import Any, Dict import tldextract import waybackpy import whois from .models import DomainHistory, URLValidationResult class DomainHistoryChecker: """Check domain history using various sources.""" def __init__(self, user_agent: str = "URL-Reputation-Checker/1.0"): self.user_agent = user_agent async def get_domain_history(self, url: str) -> DomainHistory: """Get comprehensive domain history.""" # Extract domain from URL extracted = tldextract.extract(url) domain = f"{extracted.domain}.{extracted.suffix}" # Run all checks concurrently results = await asyncio.gather( self._get_whois_info(domain), self._get_wayback_info(url), return_exceptions=True, ) whois_info = results[0] if not isinstance(results[0], Exception) else {} wayback_info = results[1] if not isinstance(results[1], Exception) else {} # Calculate domain age creation_date = whois_info.get("creation_date") age_days = None if creation_date: age_days = (datetime.now(timezone.utc) - creation_date).days return DomainHistory( domain=domain, creation_date=creation_date, expiration_date=whois_info.get("expiration_date"), registrar=whois_info.get("registrar"), wayback_first_snapshot=wayback_info.get("first_snapshot"), wayback_total_snapshots=wayback_info.get("total_snapshots", 0), age_days=age_days, ) async def _get_whois_info(self, domain: str) -> Dict[str, Any]: """Get WHOIS information for a domain.""" try: # Run WHOIS lookup in thread pool to avoid blocking loop = asyncio.get_event_loop() w = await loop.run_in_executor(None, whois.whois, domain) result = {} # Handle creation date if hasattr(w, "creation_date"): creation_date = w.creation_date if isinstance(creation_date, list): creation_date = creation_date[0] if creation_date: result["creation_date"] = self._ensure_timezone(creation_date) # Handle expiration date if hasattr(w, "expiration_date"): expiration_date = w.expiration_date if isinstance(expiration_date, list): expiration_date = expiration_date[0] if expiration_date: result["expiration_date"] = self._ensure_timezone(expiration_date) # Get registrar if hasattr(w, "registrar"): result["registrar"] = w.registrar return result except Exception: # WHOIS lookup can fail for many reasons return {} async def _get_wayback_info(self, url: str) -> Dict[str, Any]: """Get Wayback Machine information.""" try: # Create Wayback object loop = asyncio.get_event_loop() def get_wayback_data(): wb = waybackpy.Url(url, self.user_agent) # Get oldest archive try: oldest = wb.oldest() oldest_date = None if oldest and hasattr(oldest, "timestamp"): # Parse wayback timestamp (YYYYMMDDHHMMSS) ts = str(oldest.timestamp) if len(ts) >= 8: year = ts[0:4] month = ts[4:6] day = ts[6:8] oldest_date = datetime( int(year), int(month), int(day), tzinfo=timezone.utc ) except Exception: oldest_date = None # Get total number of snapshots try: cdx = wb.cdx_api() total = len(list(cdx)) except Exception: total = 0 return {"first_snapshot": oldest_date, "total_snapshots": total} # Run in executor to avoid blocking result = await loop.run_in_executor(None, get_wayback_data) return result except Exception: return {} def _ensure_timezone(self, dt: datetime) -> datetime: """Ensure datetime has timezone information.""" if dt and dt.tzinfo is None: return dt.replace(tzinfo=timezone.utc) return dt def calculate_reputation_score( self, domain_history: DomainHistory, validation_result: URLValidationResult ) -> float: """ Calculate reputation score based on multiple 
factors. Scoring breakdown: - Domain age: 0-30 points - Wayback presence: 0-20 points - Technical factors: 0-25 points - Consistency: 0-25 points """ score = 0.0 # Domain age score (0-30 points) if domain_history.age_days: if domain_history.age_days >= 365 * 5: # 5+ years score += 30 elif domain_history.age_days >= 365 * 2: # 2-5 years score += 20 elif domain_history.age_days >= 365: # 1-2 years score += 15 elif domain_history.age_days >= 180: # 6-12 months score += 10 elif domain_history.age_days >= 90: # 3-6 months score += 5 else: # Less than 3 months score += 2 # Wayback Machine presence (0-20 points) if domain_history.wayback_total_snapshots > 0: if domain_history.wayback_total_snapshots >= 100: score += 20 elif domain_history.wayback_total_snapshots >= 50: score += 15 elif domain_history.wayback_total_snapshots >= 20: score += 10 elif domain_history.wayback_total_snapshots >= 5: score += 5 else: score += 2 # Technical factors (0-25 points) if validation_result.ssl_valid: score += 10 if validation_result.response_time < 1.0: score += 10 elif validation_result.response_time < 2.0: score += 5 if validation_result.status_code == 200: score += 5 # Consistency factors (0-25 points) if len(validation_result.warnings) == 0: score += 25 elif len(validation_result.warnings) == 1: score += 15 elif len(validation_result.warnings) == 2: score += 10 elif len(validation_result.warnings) == 3: score += 5 return min(score, 100.0) # Cap at 100
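As a rough illustration of how this checker might be driven end to end, here is a minimal sketch. The import path (reputation_checker.history) is an assumption, and a SimpleNamespace stands in for a real URLValidationResult, supplying only the attributes that calculate_reputation_score reads; neither is part of the file above.

# Minimal usage sketch. The module path below is hypothetical; adjust it to
# wherever DomainHistoryChecker actually lives in this repo.
import asyncio
from types import SimpleNamespace

from reputation_checker.history import DomainHistoryChecker  # hypothetical path


async def main() -> None:
    checker = DomainHistoryChecker()

    # Look up WHOIS and Wayback Machine history for a URL.
    history = await checker.get_domain_history("https://example.com/some/page")
    print(history.domain, history.age_days, history.wayback_total_snapshots)

    # Stand-in for a URLValidationResult; only the attributes read by
    # calculate_reputation_score are provided here.
    validation = SimpleNamespace(
        ssl_valid=True,      # +10 technical points
        response_time=0.4,   # under 1.0 s -> +10 technical points
        status_code=200,     # +5 technical points
        warnings=[],         # no warnings -> +25 consistency points
    )

    score = checker.calculate_reputation_score(history, validation)
    print(f"Reputation score: {score:.1f}/100")


if __name__ == "__main__":
    asyncio.run(main())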

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/prismon/reputation-checker-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.