# lib.py•1.79 kB
import sys
import urllib.request
import urllib.error
from urllib.parse import urljoin, urlparse
import os
import re
def get_sitemap(website_url: str):
    """
    Fetches a sitemap from a website by probing common sitemap locations.

    Each candidate URL is requested in turn with a 10-second timeout and
    progress is printed to stdout. The first response with HTTP status 200
    is returned; failures are reported and the next location is tried.

    Args:
        website_url: The base URL of the website (e.g., 'https://example.com').
            When no '<scheme>://' prefix is present, 'https://' is assumed,
            so 'example.com' and 'example.com:8080' are both accepted.
    Returns:
        The sitemap content as a string, or None if not found.
    """
    website_url = website_url.strip()
    # Look for an explicit "<scheme>://" prefix rather than trusting
    # urlparse().scheme: for a bare "host:port" string urlparse can report
    # the host as the scheme, which would skip the prefix and leave urljoin
    # with nothing to resolve the sitemap paths against.
    if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', website_url):
        website_url = 'https://' + website_url
    # Common sitemap locations, tried in order.
    sitemap_paths = (
        '/sitemap.xml',
        '/sitemap_index.xml',
        '/sitemap',
    )
    for path in sitemap_paths:
        sitemap_url = urljoin(website_url, path)
        try:
            print(f"Trying: {sitemap_url}")
            with urllib.request.urlopen(sitemap_url, timeout=10) as response:
                if response.status != 200:
                    # Report instead of silently moving on to the next path.
                    print(f"  ✗ {response.status} - Unexpected status")
                    continue
                raw = response.read()
                try:
                    content = raw.decode('utf-8')
                except UnicodeDecodeError:
                    # Not UTF-8: fall back to the charset declared in the
                    # Content-Type header, if there is one. Without it the
                    # error propagates to the generic handler below.
                    charset = response.headers.get_content_charset()
                    if not charset:
                        raise
                    content = raw.decode(charset)
                print(f"\n✓ Found sitemap at: {sitemap_url}\n")
                return content
        except urllib.error.HTTPError as e:
            print(f"  ✗ {e.code} - Not found")
        except urllib.error.URLError as e:
            print(f"  ✗ Error: {e.reason}")
        except Exception as e:
            # Best-effort probing: report anything else (bad URL, decode
            # failure, ...) and keep going with the next candidate.
            print(f"  ✗ Error: {str(e)}")
    return None
def extract_json(text: str) -> str:
    """Extract a JSON object from text, handling markdown code blocks.

    Candidates are tried in this order:
      1. An object inside a fenced code block (``` or ```json).
      2. The span from the first "{" to the last "}", if it parses as JSON.
      3. The whole text, if it parses as JSON (e.g. a bare top-level array).
      4. The first "{" position from which a complete JSON object decodes.
      5. The span from step 2 even though it does not parse.

    Args:
        text: Arbitrary text that may contain a JSON object.
    Returns:
        The extracted JSON string, or ``text`` unchanged when it contains
        no "{...}" span at all.
    """
    import json  # local import: only this function needs it

    # Try to find JSON in code blocks first
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Greedy span from the first "{" to the last "}"
    span = re.search(r'\{.*\}', text, re.DOTALL)
    if span is None:
        return text
    candidate = span.group(0)
    try:
        json.loads(candidate)
        return candidate
    except ValueError:
        # The greedy span swallowed something that is not part of a single
        # object (several objects, braces in prose, ...). Look harder.
        pass

    # The text as a whole may already be valid JSON, e.g. a top-level array
    # of objects, which the object-only span above would have mangled.
    try:
        json.loads(text)
        return text
    except ValueError:
        pass

    # Scan each "{" and return the first position that decodes cleanly.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find('{', start + 1)
        else:
            return text[start:end]

    # Nothing parses: keep the historical behaviour and hand back the span.
    return candidate