MCP Documentation Server

by McKhanster
url_scraper.py (3.72 kB)
#!/usr/bin/env python3
import os
import sys
from urllib.parse import urljoin, urlparse

import html2text
import requests
from bs4 import BeautifulSoup


def get_page_content(url):
    """Fetch a page and return it as a BeautifulSoup object."""
    # A timeout keeps a stalled request from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def extract_sidebar_urls(soup, base_url):
    """Extract URLs from sidebar-group ul elements."""
    urls = []
    sidebar_groups = soup.find_all('ul', id='sidebar-group')
    for group in sidebar_groups:
        links = group.find_all('a', href=True)
        for link in links:
            # Resolve relative hrefs against the page URL instead of a
            # hardcoded site root (the base_url parameter was unused before).
            full_url = urljoin(base_url, link['href'])
            urls.append(full_url)
    return urls


def extract_content_to_markdown(soup):
    """Extract content between the header and footer divs and convert it to markdown."""
    header = soup.find('div', id='header')
    footer = soup.find('div', id='footer')

    if not header or not footer:
        # Fall back to the whole body if the header/footer markers are missing.
        content = soup.find('body') or soup
    else:
        # Collect the siblings that sit between the header and footer divs,
        # keeping only nodes that expose a tag name.
        content_elements = []
        current = header.next_sibling
        while current and current != footer:
            if hasattr(current, 'name'):
                content_elements.append(str(current))
            current = current.next_sibling
        content_html = ''.join(content_elements)
        content = BeautifulSoup(content_html, 'html.parser')

    # Convert the extracted HTML to markdown.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = False
    return h.handle(str(content))


def save_markdown(content, url, output_dir='output'):
    """Save markdown content to a file named after the URL path."""
    os.makedirs(output_dir, exist_ok=True)

    # Derive a filename from the URL path, e.g. /a/b -> a_b.md.
    parsed = urlparse(url)
    filename = parsed.path.strip('/').replace('/', '_') or 'index'
    if not filename.endswith('.md'):
        filename += '.md'

    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"# {url}\n\n")
        f.write(content)
    print(f"Saved: {filepath}")


def main():
    # if len(sys.argv) != 2:
    #     print("Usage: python url_scraper.py <URL>")
    #     sys.exit(1)

    base_url = "https://modelcontextprotocol.io/specification/2025-06-18"
    processed_urls = set()

    try:
        # Steps 1 & 2: fetch the main page and collect the sidebar URLs.
        print(f"Processing main page: {base_url}")
        soup = get_page_content(base_url)
        sidebar_urls = extract_sidebar_urls(soup, base_url)
        print(f"Found {len(sidebar_urls)} sidebar URLs: {sidebar_urls}")

        # Step 3: extract and save the main page content.
        markdown_content = extract_content_to_markdown(soup)
        save_markdown(markdown_content, base_url)
        processed_urls.add(base_url)

        # Step 4: process each sidebar URL, skipping any already handled.
        for url in sidebar_urls:
            if url not in processed_urls:
                print(f"Processing sidebar page: {url}")
                try:
                    soup = get_page_content(url)
                    markdown_content = extract_content_to_markdown(soup)
                    save_markdown(markdown_content, url)
                    processed_urls.add(url)
                except Exception as e:
                    print(f"Error processing {url}: {e}")

        print(f"\nCompleted! Processed {len(processed_urls)} pages.")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
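
The helpers can also be imported and reused one page at a time. A minimal sketch, assuming the file above is saved as url_scraper.py on the import path; the example URL is an assumption for illustration, not one read from the live sidebar:

from url_scraper import get_page_content, extract_content_to_markdown, save_markdown

# Hypothetical single page to convert; substitute any spec page URL.
url = "https://modelcontextprotocol.io/specification/2025-06-18/architecture"
soup = get_page_content(url)
markdown = extract_content_to_markdown(soup)
# Writes single/specification_2025-06-18_architecture.md with the URL as a heading.
save_markdown(markdown, url, output_dir='single')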

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/McKhanster/mcp-mcp'
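
The same lookup works from Python. A minimal sketch, assuming only that the endpoint returns JSON; the response schema is not shown here:

import requests

resp = requests.get(
    "https://glama.ai/api/mcp/v1/servers/McKhanster/mcp-mcp",
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # payload shape is an assumption; inspect it before relying on fields
print(data)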

If you have feedback or need assistance with the MCP directory API, please join our Discord server.