# lib.py (1.79 kB)
import json
import os
import re
import sys
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlparse
def get_sitemap(website_url):
    """
    Fetches the sitemap.xml from a given website URL.

    A fixed list of common sitemap locations at the site root is tried in
    order, and the body of the first one answering with HTTP 200 is returned.
    Progress and per-location failures are printed to stdout; a failure for
    one location never raises, the next location is simply tried.

    Args:
        website_url: The base URL of the website (e.g., 'https://example.com').
            A value without a scheme, such as 'example.com' or
            'example.com:8080', is treated as an HTTPS host.

    Returns:
        The sitemap XML content as a string, or None if not found.
    """
    # Ensure the URL has a scheme. Checking the scheme alone is not enough:
    # urlparse('example.com:8080') reports 'example.com' as the scheme, so a
    # host:port input would otherwise be left without 'https://'.
    parsed = urlparse(website_url)
    if not (parsed.scheme and parsed.netloc):
        website_url = 'https://' + website_url

    # Try common sitemap locations
    sitemap_paths = [
        '/sitemap.xml',
        '/sitemap_index.xml',
        '/sitemap',
    ]

    for path in sitemap_paths:
        # The leading '/' makes urljoin resolve against the site root,
        # discarding any path component of website_url.
        sitemap_url = urljoin(website_url, path)
        try:
            print(f"Trying: {sitemap_url}")
            with urllib.request.urlopen(sitemap_url, timeout=10) as response:
                if response.status == 200:
                    # Honour the charset declared in Content-Type, falling
                    # back to UTF-8 when the server does not declare one.
                    charset = response.headers.get_content_charset() or 'utf-8'
                    content = response.read().decode(charset)
                    print(f"\n✓ Found sitemap at: {sitemap_url}\n")
                    return content
        except urllib.error.HTTPError as e:
            print(f" ✗ {e.code} - Not found")
        except urllib.error.URLError as e:
            print(f" ✗ Error: {e.reason}")
        except Exception as e:
            # Deliberate best-effort: any other failure (bad URL, decode
            # error, timeout) is reported and the next location is tried.
            print(f" ✗ Error: {str(e)}")

    return None
def extract_json(text: str) -> str:
    """Extract a JSON object from text, handling markdown code blocks.

    Lookup order:
      1. A ``{...}`` object inside a fenced code block (``` or ```json).
      2. The first substring starting at a ``{`` that parses as valid JSON.
      3. The span from the first ``{`` to the last ``}``, even if it is not
         valid JSON (kept so lenient callers still get a candidate).
      4. The input text unchanged when it contains no ``{...}`` span at all.

    Args:
        text: Arbitrary text that may embed a JSON object.

    Returns:
        The extracted JSON text, or ``text`` itself if nothing was found.
    """
    # Try to find JSON in code blocks first
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Greedy span from the first '{' to the last '}'; used only as a last
    # resort because braces in surrounding prose or a second object make it
    # invalid JSON.
    span = re.search(r'\{.*\}', text, re.DOTALL)
    if span is None:
        return text

    # Scan each '{' and return the first position where a complete, valid
    # JSON object starts. raw_decode stops at the end of that object, so
    # trailing text is ignored.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            start = text.find('{', start + 1)
        else:
            return text[start:end]

    # Nothing parsed as strict JSON: fall back to the greedy span.
    return span.group(0)