# Usage Examples
Practical examples for using the Webpage MCP Server.
## Table of Contents
- [Basic Usage](#basic-usage)
- [Rate Limiting Examples](#rate-limiting-examples)
- [Integration Examples](#integration-examples)
- [Common Patterns](#common-patterns)
---
## Basic Usage
### Example 1: List All Pages
Get all available pages from the sitemap:
```python
from src.main import list_pages
# Get all pages
pages = list_pages()
print(f"Found {len(pages)} pages")
for page in pages:
print(f" - {page}")
# Output:
# Found 5 pages
# - /
# - /blog
# - /blog/yc-ankit-gupta-interview
# - /marketplace
# - /pricing
```
### Example 2: Fetch Homepage HTML
Retrieve the HTML content of the homepage:
```python
from src.main import get_page
# Fetch homepage
result = get_page("/")
if "error" not in result:
print(f"URL: {result['url']}")
print(f"Status: {result['status_code']}")
print(f"Content-Type: {result['content_type']}")
print(f"\nHTML Preview:")
print(result['html'][:200] + "...")
else:
print(f"Error: {result['message']}")
# Output:
# URL: https://example.com/
# Status: 200
# Content-Type: text/html
#
# HTML Preview:
# <!DOCTYPE html>
# <html>
# <head>
# <title>Example Domain</title>
# ...
```
### Example 3: Access Sitemap Resource
Get the raw sitemap XML:
```python
from src.main import get_sitemap
sitemap_content = get_sitemap()
print(sitemap_content)
# Output:
# <?xml version="1.0" encoding="UTF-8"?>
# <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
# <url>
# <loc>https://example.com/</loc>
# ...
# </url>
# </urlset>
```
---
## Rate Limiting Examples
### Example 4: Handle Rate Limits
Properly handle rate limit errors:
```python
from src.main import get_page
import time
def fetch_with_retry(path, user_id="default", max_retries=3):
    """Fetch page with automatic retry on rate limit"""
    for attempt in range(max_retries):
        result = get_page(path, user_id=user_id)
        if "error" in result and result["error"] == "Rate limit exceeded":
            wait_time = result["reset_in_seconds"]
            print(f"Rate limited. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        return result
    raise Exception("Max retries exceeded")
# Usage
page = fetch_with_retry("/blog", user_id="user123")
print(page["html"])
```
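If the rate-limit response ever arrives without `reset_in_seconds`, the retry above would raise a `KeyError`. A defensive variant (a sketch, assuming the same `error` field shown above) falls back to exponential backoff when the hint is missing:
```python
from src.main import get_page
import time

def fetch_with_backoff(path, user_id="default", max_retries=5, base_delay=1.0):
    """Retry on rate limits, preferring the server's reset hint over exponential backoff."""
    for attempt in range(max_retries):
        result = get_page(path, user_id=user_id)
        if "error" in result and result["error"] == "Rate limit exceeded":
            # Fall back to 1s, 2s, 4s, ... when reset_in_seconds is absent
            wait_time = result.get("reset_in_seconds", base_delay * (2 ** attempt))
            print(f"Rate limited. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        return result
    raise Exception("Max retries exceeded")
```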
### Example 5: Multi-User Rate Limiting
Different users have separate rate limits:
```python
from src.main import get_page
# User 1 makes requests
for i in range(5):
    result = get_page("/", user_id="user1")
    print(f"User1 request {i+1}: {result.get('status_code', 'rate limited')}")

# User 2 has their own limit
for i in range(5):
    result = get_page("/", user_id="user2")
    print(f"User2 request {i+1}: {result.get('status_code', 'rate limited')}")
# Both users can make 10 requests each
```
---
## Integration Examples
### Example 6: Parse Sitemap and Fetch All Pages
Fetch HTML for all pages in the sitemap:
```python
from src.main import list_pages, get_page
import time
def fetch_all_pages(delay=0.5):
    """Fetch all pages with delay between requests"""
    pages = list_pages()
    results = {}
    for page in pages:
        print(f"Fetching {page}...")
        result = get_page(page)
        if "error" in result:
            print(f"  Error: {result['message']}")
            results[page] = None
        else:
            print(f"  Success: {len(result['html'])} bytes")
            results[page] = result
        time.sleep(delay)  # Avoid rate limiting
    return results
# Fetch all pages
all_pages = fetch_all_pages()
print(f"\nSuccessfully fetched {sum(1 for v in all_pages.values() if v)} pages")
```
### Example 7: Extract Specific Data from Pages
Extract titles from all blog posts:
```python
from src.main import list_pages, get_page
from bs4 import BeautifulSoup
def get_blog_titles():
    """Extract titles from blog posts"""
    pages = list_pages()
    blog_posts = [p for p in pages if p.startswith('/blog/')]
    titles = {}
    for post in blog_posts:
        result = get_page(post)
        if "error" not in result:
            soup = BeautifulSoup(result['html'], 'html.parser')
            title = soup.find('title')
            if title:
                titles[post] = title.get_text()
    return titles
# Get all blog titles
titles = get_blog_titles()
for path, title in titles.items():
print(f"{path}: {title}")
```
### Example 8: Content Analysis
Analyze content across multiple pages:
```python
from src.main import list_pages, get_page
from collections import Counter
import re
def analyze_content():
    """Analyze word frequency across all pages"""
    pages = list_pages()
    all_words = []
    for page in pages:
        result = get_page(page)
        if "error" not in result:
            # Simple word extraction (remove HTML tags first)
            text = re.sub(r'<[^>]+>', '', result['html'])
            words = re.findall(r'\b\w+\b', text.lower())
            all_words.extend(words)
    # Get most common words
    word_freq = Counter(all_words)
    return word_freq.most_common(20)

# Analyze content
top_words = analyze_content()
print("Top 20 words:")
for word, count in top_words:
    print(f"  {word}: {count}")
```
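Note that the regex above strips tags but keeps the contents of `<script>` and `<style>` blocks, which can skew the counts. A variant using BeautifulSoup (a sketch, reusing the same dependency as Example 7) drops those elements before counting:
```python
from src.main import list_pages, get_page
from bs4 import BeautifulSoup
from collections import Counter
import re

def analyze_content_text_only():
    """Word frequency over visible text, ignoring script and style contents"""
    all_words = []
    for page in list_pages():
        result = get_page(page)
        if "error" not in result:
            soup = BeautifulSoup(result['html'], 'html.parser')
            # Remove script/style so embedded code doesn't inflate the counts
            for tag in soup(['script', 'style']):
                tag.decompose()
            words = re.findall(r'\b\w+\b', soup.get_text().lower())
            all_words.extend(words)
    return Counter(all_words).most_common(20)
```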
---
## Common Patterns
### Example 9: Caching Pages Locally
Cache fetched pages to avoid repeated requests:
```python
from src.main import get_page
from pathlib import Path
import json
import hashlib
class PageCache:
    def __init__(self, cache_dir="./cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _get_cache_path(self, path):
        # Create filename from path hash
        hash_key = hashlib.md5(path.encode()).hexdigest()
        return self.cache_dir / f"{hash_key}.json"

    def get(self, path, user_id=None):
        cache_path = self._get_cache_path(path)
        # Check cache
        if cache_path.exists():
            with open(cache_path) as f:
                return json.load(f)
        # Fetch and cache
        result = get_page(path, user_id)
        if "error" not in result:
            with open(cache_path, 'w') as f:
                json.dump(result, f)
        return result
# Usage
cache = PageCache()
page = cache.get("/blog") # Fetches from server
page = cache.get("/blog") # Returns from cache
```
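The cache above never expires, so a cached page is served indefinitely. If freshness matters, one approach (a sketch; the one-hour TTL is an arbitrary default) is to expire entries based on the cache file's modification time:
```python
from src.main import get_page
from pathlib import Path
import hashlib
import json
import time

class TTLPageCache:
    """PageCache variant whose entries expire after max_age_seconds"""
    def __init__(self, cache_dir="./cache", max_age_seconds=3600):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.max_age = max_age_seconds

    def _get_cache_path(self, path):
        return self.cache_dir / f"{hashlib.md5(path.encode()).hexdigest()}.json"

    def get(self, path, user_id=None):
        cache_path = self._get_cache_path(path)
        # Serve from cache only while the file is younger than the TTL
        if cache_path.exists() and time.time() - cache_path.stat().st_mtime < self.max_age:
            with open(cache_path) as f:
                return json.load(f)
        result = get_page(path, user_id)
        if "error" not in result:
            with open(cache_path, 'w') as f:
                json.dump(result, f)
        return result
```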
### Example 10: Batch Processing with Progress
Process multiple pages with progress tracking:
```python
from src.main import list_pages, get_page
from tqdm import tqdm
import time
def batch_process_pages(processor_func, delay=0.5):
    """Process pages in batches with progress bar"""
    pages = list_pages()
    results = []
    for page in tqdm(pages, desc="Processing pages"):
        result = get_page(page)
        if "error" not in result:
            processed = processor_func(result)
            results.append({
                'path': page,
                'data': processed
            })
        else:
            tqdm.write(f"Error on {page}: {result['message']}")
        time.sleep(delay)
    return results

# Example processor
def extract_metadata(page_result):
    return {
        'url': page_result['url'],
        'size': len(page_result['html']),
        'content_type': page_result['content_type']
    }
# Process all pages
metadata = batch_process_pages(extract_metadata)
```
### Example 11: Error Handling Pattern
Robust error handling for production use:
```python
from src.main import get_page
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def safe_get_page(path, user_id=None, max_retries=3):
    """Safely fetch page with comprehensive error handling"""
    for attempt in range(max_retries):
        try:
            result = get_page(path, user_id)
            # Handle rate limit
            if "error" in result:
                if result["error"] == "Rate limit exceeded":
                    wait_time = result["reset_in_seconds"]
                    logger.warning(f"Rate limited. Retry {attempt+1}/{max_retries}")
                    time.sleep(wait_time)
                    continue
                else:
                    logger.error(f"Error fetching {path}: {result['message']}")
                    return None
            logger.info(f"Successfully fetched {path}")
            return result
        except Exception as e:
            logger.error(f"Unexpected error on attempt {attempt+1}: {e}")
            if attempt == max_retries - 1:
                raise
            time.sleep(1)
    return None
# Usage
page = safe_get_page("/blog/post-1", user_id="user123")
if page:
print(f"Got {len(page['html'])} bytes")
```
### Example 12: Testing Server Responses
Test suite for validating server responses:
```python
from src.main import list_pages, get_page, get_sitemap
def test_server():
    """Test all server functionality"""
    # Test 1: List pages
    print("Test 1: Listing pages...")
    pages = list_pages()
    assert len(pages) > 0, "Should have pages"
    assert all(p.startswith('/') for p in pages), "All paths should start with /"
    print(f"✓ Found {len(pages)} pages")

    # Test 2: Get sitemap
    print("\nTest 2: Getting sitemap...")
    sitemap = get_sitemap()
    assert '<?xml' in sitemap, "Should be valid XML"
    assert 'urlset' in sitemap, "Should contain urlset"
    print(f"✓ Sitemap is {len(sitemap)} bytes")

    # Test 3: Fetch page
    print("\nTest 3: Fetching page...")
    result = get_page(pages[0])
    assert "error" not in result, "Should fetch successfully"
    assert result['status_code'] == 200, "Should return 200"
    assert len(result['html']) > 0, "Should have HTML content"
    print(f"✓ Page fetched: {result['url']}")

    # Test 4: Rate limiting
    print("\nTest 4: Testing rate limit...")
    for i in range(12):
        result = get_page("/", user_id="test")
        if "error" in result and result["error"] == "Rate limit exceeded":
            print(f"✓ Rate limit triggered at request {i+1}")
            break

    print("\n✅ All tests passed!")
# Run tests
test_server()
```
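If your project already uses pytest, the same checks can be split into separately reported test functions. A minimal sketch (assuming a file name like `test_server.py`, run with `pytest`):
```python
from src.main import list_pages, get_page, get_sitemap

def test_list_pages():
    pages = list_pages()
    assert len(pages) > 0, "Should have pages"
    assert all(p.startswith('/') for p in pages), "All paths should start with /"

def test_sitemap_is_xml():
    sitemap = get_sitemap()
    assert '<?xml' in sitemap and 'urlset' in sitemap, "Should be sitemap XML"

def test_fetch_first_page():
    result = get_page(list_pages()[0])
    assert "error" not in result, "Should fetch successfully"
    assert result['status_code'] == 200, "Should return 200"
    assert len(result['html']) > 0, "Should have HTML content"
```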
---
## Advanced Usage
### Example 13: Sitemap Parser
Custom sitemap parser with filtering:
```python
from src.main import get_sitemap
import xml.etree.ElementTree as ET
def parse_sitemap_advanced():
    """Parse sitemap with full metadata"""
    sitemap_xml = get_sitemap()
    root = ET.fromstring(sitemap_xml)
    # Define namespace
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    urls = []
    for url in root.findall('.//ns:url', ns):
        loc = url.find('ns:loc', ns)
        lastmod = url.find('ns:lastmod', ns)
        changefreq = url.find('ns:changefreq', ns)
        priority = url.find('ns:priority', ns)
        urls.append({
            'url': loc.text if loc is not None else None,
            'lastmod': lastmod.text if lastmod is not None else None,
            'changefreq': changefreq.text if changefreq is not None else None,
            'priority': float(priority.text) if priority is not None else 0.5
        })
    return urls

# Get all URL metadata
sitemap_data = parse_sitemap_advanced()

# Filter high-priority pages
high_priority = [u for u in sitemap_data if u['priority'] >= 0.9]
print(f"High priority pages: {len(high_priority)}")

# Filter recently updated
for url in sitemap_data:
    if url['lastmod']:
        print(f"{url['url']}: Last modified {url['lastmod']}")
```
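Building on `parse_sitemap_advanced()` above, you can also filter entries by recency. This sketch assumes `lastmod` values start with an ISO 8601 date (e.g. `2024-01-15`), which is the usual sitemap convention:
```python
from datetime import datetime, timedelta

def recently_updated(sitemap_data, days=30):
    """Return entries whose lastmod falls within the last `days` days"""
    cutoff = datetime.now() - timedelta(days=days)
    recent = []
    for entry in sitemap_data:
        if entry['lastmod']:
            # Parse only the date portion of the lastmod value
            modified = datetime.fromisoformat(entry['lastmod'][:10])
            if modified >= cutoff:
                recent.append(entry)
    return recent

# Pages modified in the last 90 days
recent_pages = recently_updated(parse_sitemap_advanced(), days=90)
print(f"Updated in the last 90 days: {len(recent_pages)}")
```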
### Example 14: Async-Style Processing
Process pages concurrently (using threading):
```python
from src.main import get_page
from concurrent.futures import ThreadPoolExecutor, as_completed
def fetch_pages_concurrent(paths, user_id_prefix="user", max_workers=5):
    """Fetch multiple pages concurrently"""
    results = {}

    def fetch_one(index, path):
        # Use different user_id for each thread to avoid rate limits
        user_id = f"{user_id_prefix}_{index}"
        return path, get_page(path, user_id=user_id)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = {
            executor.submit(fetch_one, i, path): path
            for i, path in enumerate(paths)
        }
        # Collect results
        for future in as_completed(futures):
            path, result = future.result()
            results[path] = result
            if "error" in result:
                print(f"✗ {path}: {result['message']}")
            else:
                print(f"✓ {path}: {result['status_code']}")
    return results
# Usage
from src.main import list_pages
pages = list_pages()
all_results = fetch_pages_concurrent(pages[:5])
```
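If the rest of your code is asynchronous, the same blocking calls can be driven from an event loop with `asyncio.to_thread` (Python 3.9+). A sketch:
```python
from src.main import list_pages, get_page
import asyncio

async def fetch_pages_async(paths):
    """Run the blocking get_page calls in worker threads from an event loop"""
    async def fetch_one(path):
        return path, await asyncio.to_thread(get_page, path)

    pairs = await asyncio.gather(*(fetch_one(p) for p in paths))
    return dict(pairs)

# Usage
all_results = asyncio.run(fetch_pages_async(list_pages()[:5]))
```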
This documentation provides comprehensive examples for using the Webpage MCP Server effectively!