# Usage Examples
Practical examples for using the Webpage MCP Server.
## Table of Contents
- [Basic Usage](#basic-usage)
- [Rate Limiting Examples](#rate-limiting-examples)
- [Integration Examples](#integration-examples)
- [Common Patterns](#common-patterns)
---
## Basic Usage
### Example 1: List All Pages
Get all available pages from the sitemap:
```python
from src.main import list_pages
# Get all pages
pages = list_pages()
print(f"Found {len(pages)} pages")
for page in pages:
print(f" - {page}")
# Output:
# Found 5 pages
# - /
# - /blog
# - /blog/yc-ankit-gupta-interview
# - /marketplace
# - /pricing
```
### Example 2: Fetch Homepage HTML
Retrieve the HTML content of the homepage:
```python
from src.main import get_page
# Fetch homepage
result = get_page("/")
if "error" not in result:
print(f"URL: {result['url']}")
print(f"Status: {result['status_code']}")
print(f"Content-Type: {result['content_type']}")
print(f"\nHTML Preview:")
print(result['html'][:200] + "...")
else:
print(f"Error: {result['message']}")
# Output:
# URL: https://example.com/
# Status: 200
# Content-Type: text/html
#
# HTML Preview:
# <!DOCTYPE html>
# <html>
# <head>
# <title>Example Domain</title>
# ...
```
### Example 3: Access Sitemap Resource
Get the raw sitemap XML:
```python
from src.main import get_sitemap
sitemap_content = get_sitemap()
print(sitemap_content)
# Output:
# <?xml version="1.0" encoding="UTF-8"?>
# <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
# <url>
# <loc>https://example.com/</loc>
# ...
# </url>
# </urlset>
```
---
## Rate Limiting Examples
### Example 4: Handle Rate Limits
Properly handle rate limit errors:
```python
from src.main import get_page
import time
def fetch_with_retry(path, user_id="default", max_retries=3):
    """Fetch page with automatic retry on rate limit"""
    for attempt in range(max_retries):
        result = get_page(path, user_id=user_id)
        if "error" in result and result["error"] == "Rate limit exceeded":
            wait_time = result["reset_in_seconds"]
            print(f"Rate limited. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        return result
    raise Exception("Max retries exceeded")
# Usage
page = fetch_with_retry("/blog", user_id="user123")
print(page["html"])
```
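If the rate-limit response ever arrives without `reset_in_seconds`, the retry above would raise a `KeyError`. A defensive variant (a sketch, assuming the same `error` field shown above) falls back to exponential backoff when the hint is missing:
```python
from src.main import get_page
import time

def fetch_with_backoff(path, user_id="default", max_retries=5, base_delay=1.0):
    """Retry on rate limits, preferring the server's reset hint over exponential backoff."""
    for attempt in range(max_retries):
        result = get_page(path, user_id=user_id)
        if "error" in result and result["error"] == "Rate limit exceeded":
            # Fall back to 1s, 2s, 4s, ... when reset_in_seconds is absent
            wait_time = result.get("reset_in_seconds", base_delay * (2 ** attempt))
            print(f"Rate limited. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            continue
        return result
    raise Exception("Max retries exceeded")
```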
### Example 5: Multi-User Rate Limiting
Different users have separate rate limits:
```python
from src.main import get_page
# User 1 makes requests
for i in range(5):
    result = get_page("/", user_id="user1")
    print(f"User1 request {i+1}: {result.get('status_code', 'rate limited')}")

# User 2 has their own limit
for i in range(5):
    result = get_page("/", user_id="user2")
    print(f"User2 request {i+1}: {result.get('status_code', 'rate limited')}")
# Both users can make 10 requests each
```
---
## Integration Examples
### Example 6: Parse Sitemap and Fetch All Pages
Fetch HTML for all pages in the sitemap:
```python
from src.main import list_pages, get_page
import time
def fetch_all_pages(delay=0.5):
    """Fetch all pages with delay between requests"""
    pages = list_pages()
    results = {}
    for page in pages:
        print(f"Fetching {page}...")
        result = get_page(page)
        if "error" in result:
            print(f"  Error: {result['message']}")
            results[page] = None
        else:
            print(f"  Success: {len(result['html'])} bytes")
            results[page] = result
        time.sleep(delay)  # Avoid rate limiting
    return results
# Fetch all pages
all_pages = fetch_all_pages()
print(f"\nSuccessfully fetched {sum(1 for v in all_pages.values() if v)} pages")
```
### Example 7: Extract Specific Data from Pages
Extract titles from all blog posts:
```python
from src.main import list_pages, get_page
from bs4 import BeautifulSoup
def get_blog_titles():
    """Extract titles from blog posts"""
    pages = list_pages()
    blog_posts = [p for p in pages if p.startswith('/blog/')]
    titles = {}
    for post in blog_posts:
        result = get_page(post)
        if "error" not in result:
            soup = BeautifulSoup(result['html'], 'html.parser')
            title = soup.find('title')
            if title:
                titles[post] = title.get_text()
    return titles
# Get all blog titles
titles = get_blog_titles()
for path, title in titles.items():
print(f"{path}: {title}")
```
### Example 8: Content Analysis
Analyze content across multiple pages:
```python
from src.main import list_pages, get_page
from collections import Counter
import re
def analyze_content():
    """Analyze word frequency across all pages"""
    pages = list_pages()
    all_words = []
    for page in pages:
        result = get_page(page)
        if "error" not in result:
            # Simple word extraction (remove HTML tags first)
            text = re.sub(r'<[^>]+>', '', result['html'])
            words = re.findall(r'\b\w+\b', text.lower())
            all_words.extend(words)
    # Get most common words
    word_freq = Counter(all_words)
    return word_freq.most_common(20)

# Analyze content
top_words = analyze_content()
print("Top 20 words:")
for word, count in top_words:
    print(f"  {word}: {count}")
```
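Note that the regex above strips tags but keeps the contents of `<script>` and `<style>` blocks, which can skew the counts. A variant using BeautifulSoup (a sketch, reusing the same dependency as Example 7) drops those elements before counting:
```python
from src.main import list_pages, get_page
from bs4 import BeautifulSoup
from collections import Counter
import re

def analyze_content_text_only():
    """Word frequency over visible text, ignoring script and style contents"""
    all_words = []
    for page in list_pages():
        result = get_page(page)
        if "error" not in result:
            soup = BeautifulSoup(result['html'], 'html.parser')
            # Remove script/style so embedded code doesn't inflate the counts
            for tag in soup(['script', 'style']):
                tag.decompose()
            words = re.findall(r'\b\w+\b', soup.get_text().lower())
            all_words.extend(words)
    return Counter(all_words).most_common(20)
```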
---
## Common Patterns
### Example 9: Caching Pages Locally
Cache fetched pages to avoid repeated requests:
```python
from src.main import get_page
from pathlib import Path
import json
import hashlib
class PageCache:
    def __init__(self, cache_dir="./cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _get_cache_path(self, path):
        # Create filename from path hash
        hash_key = hashlib.md5(path.encode()).hexdigest()
        return self.cache_dir / f"{hash_key}.json"

    def get(self, path, user_id=None):
        cache_path = self._get_cache_path(path)
        # Check cache
        if cache_path.exists():
            with open(cache_path) as f:
                return json.load(f)
        # Fetch and cache
        result = get_page(path, user_id)
        if "error" not in result:
            with open(cache_path, 'w') as f:
                json.dump(result, f)
        return result
# Usage
cache = PageCache()
page = cache.get("/blog") # Fetches from server
page = cache.get("/blog") # Returns from cache
```
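The cache above never expires, so a cached page is served indefinitely. If freshness matters, one approach (a sketch; the one-hour TTL is an arbitrary default) is to expire entries based on the cache file's modification time:
```python
from src.main import get_page
from pathlib import Path
import hashlib
import json
import time

class TTLPageCache:
    """PageCache variant whose entries expire after max_age_seconds"""
    def __init__(self, cache_dir="./cache", max_age_seconds=3600):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.max_age = max_age_seconds

    def _get_cache_path(self, path):
        return self.cache_dir / f"{hashlib.md5(path.encode()).hexdigest()}.json"

    def get(self, path, user_id=None):
        cache_path = self._get_cache_path(path)
        # Serve from cache only while the file is younger than the TTL
        if cache_path.exists() and time.time() - cache_path.stat().st_mtime < self.max_age:
            with open(cache_path) as f:
                return json.load(f)
        result = get_page(path, user_id)
        if "error" not in result:
            with open(cache_path, 'w') as f:
                json.dump(result, f)
        return result
```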
### Example 10: Batch Processing with Progress
Process multiple pages with progress tracking:
```python
from src.main import list_pages, get_page
from tqdm import tqdm
import time
def batch_process_pages(processor_func, delay=0.5):
    """Process pages in batches with progress bar"""
    pages = list_pages()
    results = []
    for page in tqdm(pages, desc="Processing pages"):
        result = get_page(page)
        if "error" not in result:
            processed = processor_func(result)
            results.append({
                'path': page,
                'data': processed
            })
        else:
            tqdm.write(f"Error on {page}: {result['message']}")
        time.sleep(delay)
    return results

# Example processor
def extract_metadata(page_result):
    return {
        'url': page_result['url'],
        'size': len(page_result['html']),
        'content_type': page_result['content_type']
    }
# Process all pages
metadata = batch_process_pages(extract_metadata)
```
### Example 11: Error Handling Pattern
Robust error handling for production use:
```python
from src.main import get_page
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def safe_get_page(path, user_id=None, max_retries=3):
    """Safely fetch page with comprehensive error handling"""
    for attempt in range(max_retries):
        try:
            result = get_page(path, user_id)
            # Handle rate limit
            if "error" in result:
                if result["error"] == "Rate limit exceeded":
                    wait_time = result["reset_in_seconds"]
                    logger.warning(f"Rate limited. Retry {attempt+1}/{max_retries}")
                    time.sleep(wait_time)
                    continue
                else:
                    logger.error(f"Error fetching {path}: {result['message']}")
                    return None
            logger.info(f"Successfully fetched {path}")
            return result
        except Exception as e:
            logger.error(f"Unexpected error on attempt {attempt+1}: {e}")
            if attempt == max_retries - 1:
                raise
            time.sleep(1)
    return None
# Usage
page = safe_get_page("/blog/post-1", user_id="user123")
if page:
print(f"Got {len(page['html'])} bytes")
```
### Example 12: Testing Server Responses
Test suite for validating server responses:
```python
from src.main import list_pages, get_page, get_sitemap
def test_server():
    """Test all server functionality"""
    # Test 1: List pages
    print("Test 1: Listing pages...")
    pages = list_pages()
    assert len(pages) > 0, "Should have pages"
    assert all(p.startswith('/') for p in pages), "All paths should start with /"
    print(f"✓ Found {len(pages)} pages")

    # Test 2: Get sitemap
    print("\nTest 2: Getting sitemap...")
    sitemap = get_sitemap()
    assert '<?xml' in sitemap, "Should be valid XML"
    assert 'urlset' in sitemap, "Should contain urlset"
    print(f"✓ Sitemap is {len(sitemap)} bytes")

    # Test 3: Fetch page
    print("\nTest 3: Fetching page...")
    result = get_page(pages[0])
    assert "error" not in result, "Should fetch successfully"
    assert result['status_code'] == 200, "Should return 200"
    assert len(result['html']) > 0, "Should have HTML content"
    print(f"✓ Page fetched: {result['url']}")

    # Test 4: Rate limiting
    print("\nTest 4: Testing rate limit...")
    for i in range(12):
        result = get_page("/", user_id="test")
        if "error" in result and result["error"] == "Rate limit exceeded":
            print(f"✓ Rate limit triggered at request {i+1}")
            break

    print("\n✅ All tests passed!")
# Run tests
test_server()
```
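If your project already uses pytest, the same checks can be split into separately reported test functions. A minimal sketch (assuming a file name like `test_server.py`, run with `pytest`):
```python
from src.main import list_pages, get_page, get_sitemap

def test_list_pages():
    pages = list_pages()
    assert len(pages) > 0, "Should have pages"
    assert all(p.startswith('/') for p in pages), "All paths should start with /"

def test_sitemap_is_xml():
    sitemap = get_sitemap()
    assert '<?xml' in sitemap and 'urlset' in sitemap, "Should be sitemap XML"

def test_fetch_first_page():
    result = get_page(list_pages()[0])
    assert "error" not in result, "Should fetch successfully"
    assert result['status_code'] == 200, "Should return 200"
    assert len(result['html']) > 0, "Should have HTML content"
```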
---
## Advanced Usage
### Example 13: Sitemap Parser
Custom sitemap parser with filtering:
```python
from src.main import get_sitemap
import xml.etree.ElementTree as ET
def parse_sitemap_advanced():
    """Parse sitemap with full metadata"""
    sitemap_xml = get_sitemap()
    root = ET.fromstring(sitemap_xml)
    # Define namespace
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    urls = []
    for url in root.findall('.//ns:url', ns):
        loc = url.find('ns:loc', ns)
        lastmod = url.find('ns:lastmod', ns)
        changefreq = url.find('ns:changefreq', ns)
        priority = url.find('ns:priority', ns)
        urls.append({
            'url': loc.text if loc is not None else None,
            'lastmod': lastmod.text if lastmod is not None else None,
            'changefreq': changefreq.text if changefreq is not None else None,
            'priority': float(priority.text) if priority is not None else 0.5
        })
    return urls

# Get all URL metadata
sitemap_data = parse_sitemap_advanced()

# Filter high-priority pages
high_priority = [u for u in sitemap_data if u['priority'] >= 0.9]
print(f"High priority pages: {len(high_priority)}")

# Filter recently updated
for url in sitemap_data:
    if url['lastmod']:
        print(f"{url['url']}: Last modified {url['lastmod']}")
```
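Building on `parse_sitemap_advanced()` above, you can also filter entries by recency. This sketch assumes `lastmod` values start with an ISO 8601 date (e.g. `2024-01-15`), which is the usual sitemap convention:
```python
from datetime import datetime, timedelta

def recently_updated(sitemap_data, days=30):
    """Return entries whose lastmod falls within the last `days` days"""
    cutoff = datetime.now() - timedelta(days=days)
    recent = []
    for entry in sitemap_data:
        if entry['lastmod']:
            # Parse only the date portion of the lastmod value
            modified = datetime.fromisoformat(entry['lastmod'][:10])
            if modified >= cutoff:
                recent.append(entry)
    return recent

# Pages modified in the last 90 days
recent_pages = recently_updated(parse_sitemap_advanced(), days=90)
print(f"Updated in the last 90 days: {len(recent_pages)}")
```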
### Example 14: Async-Style Processing
Process pages concurrently (using threading):
```python
from src.main import get_page
from concurrent.futures import ThreadPoolExecutor, as_completed
def fetch_pages_concurrent(paths, user_id_prefix="user", max_workers=5):
    """Fetch multiple pages concurrently"""
    results = {}

    def fetch_one(index, path):
        # Use different user_id for each thread to avoid rate limits
        user_id = f"{user_id_prefix}_{index}"
        return path, get_page(path, user_id=user_id)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = {
            executor.submit(fetch_one, i, path): path
            for i, path in enumerate(paths)
        }
        # Collect results
        for future in as_completed(futures):
            path, result = future.result()
            results[path] = result
            if "error" in result:
                print(f"✗ {path}: {result['message']}")
            else:
                print(f"✓ {path}: {result['status_code']}")
    return results
# Usage
from src.main import list_pages
pages = list_pages()
all_results = fetch_pages_concurrent(pages[:5])
```
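If the rest of your code is asynchronous, the same blocking calls can be driven from an event loop with `asyncio.to_thread` (Python 3.9+). A sketch:
```python
from src.main import list_pages, get_page
import asyncio

async def fetch_pages_async(paths):
    """Run the blocking get_page calls in worker threads from an event loop"""
    async def fetch_one(path):
        return path, await asyncio.to_thread(get_page, path)

    pairs = await asyncio.gather(*(fetch_one(p) for p in paths))
    return dict(pairs)

# Usage
all_results = asyncio.run(fetch_pages_async(list_pages()[:5]))
```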
This documentation provides comprehensive examples for using the Webpage MCP Server effectively!