url_scraper.py
#!/usr/bin/env python3
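"""Scrape the Model Context Protocol specification into Markdown files.

Fetches the spec root page, collects the links found in the sidebar
navigation, converts each page's main content to Markdown with html2text,
and writes one .md file per page into an ``output`` directory.

Dependencies: requests, beautifulsoup4, html2text.
"""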
import requests
from bs4 import BeautifulSoup, Tag
import html2text
import sys
from urllib.parse import urljoin, urlparse
import os
def get_page_content(url):
"""Fetch page content and return BeautifulSoup object"""
    response = requests.get(url, timeout=30)  # bound the request so a slow server can't hang the run
response.raise_for_status()
return BeautifulSoup(response.content, 'html.parser')
def extract_sidebar_urls(soup, base_url):
"""Extract URLs from sidebar-group ul elements"""
urls = []
sidebar_groups = soup.find_all('ul', id='sidebar-group')
for group in sidebar_groups:
links = group.find_all('a', href=True)
for link in links:
            # Resolve relative hrefs against the page that was fetched rather than a hardcoded domain
            full_url = urljoin(base_url, link['href'])
urls.append(full_url)
return urls
def extract_content_to_markdown(soup):
"""Extract content between header and footer divs and convert to markdown"""
header = soup.find('div', id='header')
footer = soup.find('div', id='footer')
if not header or not footer:
# Fallback to body content if header/footer not found
content = soup.find('body') or soup
else:
# Extract content between header and footer
content_elements = []
        current = header.next_sibling
        # Walk the header's siblings until the footer is reached (assumes footer is a sibling of header)
        while current is not None and current is not footer:
            if isinstance(current, Tag):
                content_elements.append(str(current))
            current = current.next_sibling
content_html = ''.join(content_elements)
content = BeautifulSoup(content_html, 'html.parser')
# Convert to markdown
h = html2text.HTML2Text()
h.ignore_links = False
h.ignore_images = False
return h.handle(str(content))
def save_markdown(content, url, output_dir='output'):
"""Save markdown content to file"""
os.makedirs(output_dir, exist_ok=True)
# Create filename from URL
parsed = urlparse(url)
filename = parsed.path.strip('/').replace('/', '_') or 'index'
if not filename.endswith('.md'):
filename += '.md'
filepath = os.path.join(output_dir, filename)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(f"# {url}\n\n")
f.write(content)
print(f"Saved: {filepath}")
def main():
    # The spec root URL is hardcoded below, so no command-line argument is needed.
base_url = "https://modelcontextprotocol.io/specification/2025-06-18"
processed_urls = set()
try:
# Step 1 & 2: Navigate to page and extract sidebar URLs
print(f"Processing main page: {base_url}")
soup = get_page_content(base_url)
sidebar_urls = extract_sidebar_urls(soup, base_url)
print(f"Found {len(sidebar_urls)} sidebar URLs: {sidebar_urls}")
# Step 3: Extract and save main page content
markdown_content = extract_content_to_markdown(soup)
save_markdown(markdown_content, base_url)
processed_urls.add(base_url)
# Step 4: Process each sidebar URL
for url in sidebar_urls:
if url not in processed_urls:
print(f"Processing sidebar page: {url}")
try:
soup = get_page_content(url)
markdown_content = extract_content_to_markdown(soup)
save_markdown(markdown_content, url)
processed_urls.add(url)
except Exception as e:
print(f"Error processing {url}: {e}")
print(f"\nCompleted! Processed {len(processed_urls)} pages.")
except Exception as e:
print(f"Error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()