"""
Webpage Server MCP Server
A Model Context Protocol server for querying webpages and page contents
"""
import os
import sys
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any, Dict, List, Optional
from collections import defaultdict
import time
from dotenv import load_dotenv
from urllib.parse import urljoin, urlparse
from browser_use import Agent, Browser, ChatOpenAI  # ChatOpenAI is re-exported by browser_use; the openai SDK itself does not provide it
import asyncio
from mcp.server.fastmcp import FastMCP
# Load environment variables
env_path = Path('.') / '.env.local'
if env_path.exists():
load_dotenv(env_path)
load_dotenv() # Also load .env if exists
# Get port from environment or command line args
port = int(os.environ.get('PORT', '8080'))
for i, arg in enumerate(sys.argv):
if arg == '--port' and i + 1 < len(sys.argv):
port = int(sys.argv[i + 1])
break
# Get host from environment or command line args
host = os.environ.get('HOST', '0.0.0.0')
for i, arg in enumerate(sys.argv):
if arg == '--host' and i + 1 < len(sys.argv):
host = sys.argv[i + 1]
break
mcp = FastMCP(
name='Webpage Server',
host=host,
port=port,
instructions="""This MCP server queries webpages and page contents of a specific website.
Available tools:
- list_pages(): List the paths of all webpages
Example: list_pages() -> ["/", "/blog", "/blog/post-1", "/marketplace"]
Parse the sitemap.xml to get the list of pages.
Return the page paths instead of the full URLs.
- get_page(path): Get the HTML content of a webpage
Example: get_page("/blog/post-1") -> "<html>...</html>"
Get the HTML content by visiting the full URL.
Resources:
- sitemap.xml: The sitemap of the website
This server includes rate limiting (10 requests/minute) to protect API keys.""",
)
# Configuration
BASE_URL = os.getenv('BASE_URL', 'https://example.com')
SITEMAP_PATH = Path(__file__).parent.parent / 'assets' / 'sitemap.xml'
# Rate limiting for API protection
class RateLimiter:
"""Simple rate limiter to protect API keys from abuse"""
def __init__(self, max_requests: int = 10, window_seconds: int = 60):
self.max_requests = max_requests
self.window_seconds = window_seconds
self.requests = defaultdict(list)
def is_allowed(self, identifier: str) -> bool:
"""Check if request is allowed for this identifier"""
now = time.time()
# Clean old requests outside window
self.requests[identifier] = [
req_time
for req_time in self.requests[identifier]
if now - req_time < self.window_seconds
]
# Check if under limit
if len(self.requests[identifier]) < self.max_requests:
self.requests[identifier].append(now)
return True
return False
def get_reset_time(self, identifier: str) -> int:
"""Get seconds until rate limit resets"""
if not self.requests[identifier]:
return 0
oldest = min(self.requests[identifier])
return max(0, int(self.window_seconds - (time.time() - oldest)))
# Initialize rate limiter (10 requests per minute)
rate_limiter = RateLimiter(max_requests=10, window_seconds=60)
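# Illustrative sketch (comments only, not executed): how the limiter behaves for one identifier
# with a hypothetical 2-request / 60-second window.
#   limiter = RateLimiter(max_requests=2, window_seconds=60)
#   limiter.is_allowed('client-a')   # True  -> 1st request recorded in the window
#   limiter.is_allowed('client-a')   # True  -> 2nd request recorded
#   limiter.is_allowed('client-a')   # False -> limit hit; get_reset_time('client-a') gives seconds to wait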
def fetch_sitemap() -> str:
"""Read sitemap.xml content from local file"""
if not SITEMAP_PATH.exists():
raise ValueError(f"Sitemap file not found at {SITEMAP_PATH}")
return SITEMAP_PATH.read_text()
def parse_sitemap_urls(sitemap_xml: str) -> List[str]:
"""Parse sitemap XML and extract URLs"""
try:
root = ET.fromstring(sitemap_xml)
# Handle namespace
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
# Try with namespace first
urls = root.findall('.//ns:url/ns:loc', namespace)
if not urls:
# Try without namespace
urls = root.findall('.//url/loc')
return [url.text for url in urls if url.text]
except Exception as e:
raise ValueError(f"Failed to parse sitemap: {str(e)}")
@mcp.resource('sitemap://sitemap.xml')
def get_sitemap() -> str:
"""
Serve the sitemap.xml file
Returns:
Content of the sitemap.xml file
"""
return fetch_sitemap()
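# Note: MCP clients fetch this resource with a standard resources/read request for the URI
# 'sitemap://sitemap.xml'; the exact call depends on the client SDK in use.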
@mcp.tool()
def list_pages() -> List[str]:
"""
    List the paths of all webpages from sitemap.xml
Returns:
List of page paths (e.g., ["/", "/blog", "/blog/post-1"])
"""
sitemap_content = fetch_sitemap()
urls = parse_sitemap_urls(sitemap_content)
# Convert full URLs to paths
paths = []
for url in urls:
parsed = urlparse(url)
path = parsed.path or '/'
paths.append(path)
return sorted(set(paths)) # Remove duplicates and sort
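# Illustrative path normalization (comments only): urlparse('https://example.com/blog/post-1').path
# yields '/blog/post-1', while a bare domain URL yields '' and is normalized to '/' above.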
@mcp.tool()
async def get_hackernews_top_story(include_comments: bool = False) -> Dict[str, Any]:
"""
Get the current top-ranked Hacker News story with full metadata and summary
Args:
include_comments: Whether to include a summary of top discussion comments
Returns:
Dictionary with story metadata, content summary, and optionally comments summary
"""
prompt = """
Objective
- Produce a single structured result describing the current top-ranked Hacker News story
(rank 1) from https://news.ycombinator.com. Include full metadata, a concise neutral summary
of the story's content, and, when include_comments is true, a brief synthesis of the most
relevant top-level discussion comments. Handle external link articles, self-posts (Ask
HN/Show HN/Tell HN), and error conditions.
Input
- include_comments (bool): Controls whether to include a comments_summary based on the item's
discussion thread.
What to identify and extract
1) Determine the top-ranked story
- Identify the story ranked #1 on the Hacker News front page.
- Extract its Hacker News item id (numeric) from the item/discussion link or metadata present
for that story.
2) Post metadata fields (required)
- id (int): The HN item id for the top-ranked story.
- title (str): The story title as shown on Hacker News.
- source_url (str): The external article URL if the post is a link post; for self-posts
(Ask/Show/Tell HN) with no external article, set to an empty string.
- discussion_url (str): The canonical Hacker News discussion thread URL for this item (e.g.,
https://news.ycombinator.com/item?id={id}).
- by (str): The submitter's username.
- points (int): The story points. If not displayed, set to 0.
- comments_count (int): The number of comments. If the thread shows "discuss" or no count,
set to 0.
- posted_at (str ISO 8601): The story's submission timestamp in ISO 8601 UTC format (e.g.,
2025-10-11T14:32:05Z).
- Prefer an absolute timestamp if available (e.g., a title/attribute containing an ISO-like
datetime on the HN page or discussion page).
- If only relative time ("X hours ago") is available, convert to an absolute ISO 8601 UTC
timestamp using the current time.
- Ensure the final value is an ISO 8601 string; do not return a relative string.
3) Content source determination
- If the item is a link post:
- Use the external article URL as the content source.
- Follow HTTP redirects to the final destination before extraction.
- Normalize URL encoding and strip tracking parameters only if clearly non-essential (e.g.,
utm_).
- If the item is a self-post (Ask HN/Show HN/Tell HN or other self text):
- Use the text content from the HN item/discussion page as the content source.
4) Article/content extraction rules (for link posts)
- Extract the main article text using a readability-like approach:
- Focus on the primary content region; exclude navigation, ads, footers, bylines beyond the
first mention, comments, and unrelated widgets.
- Remove scripts, styles, inline ads, share widgets, cookie banners, and repetitive
boilerplate.
- Normalize whitespace, resolve HTML entities, and ensure UTF-8 text.
- If the main content cannot be reliably extracted, the page is empty, or the content is
paywalled:
- Fallback to summarizing the title plus any available snippet from the page's metadata
(e.g., meta description, og:description, twitter:description).
- If no snippet is available, summarize based on the HN title alone.
5) Discussion comments collection (only when include_comments is true)
- Use the HN discussion page for the item.
- Collect a small set of top-level comments that are most representative:
- Prioritize the top-visible top-level comments as they appear by default (aim for
approximately 5–8 if available).
- Exclude off-topic content, low-signal remarks, and tangential debates.
- Clean comment text by removing quoted blocks, code blocks, link lists, boilerplate
signatures, and nested citations.
- If a comment is collapsed or deleted, ignore it.
Summarization requirements
- Summary (for article/self-post content):
- Concise, neutral, and factual.
- Capture the core topic, key points, and any important findings, results, or implications.
- Avoid personal opinions and extraneous context.
- Length guideline: roughly 3–6 sentences (about 80–180 words).
- Comments summary (only when include_comments is true):
- Synthesize the main consensus points and notable dissent across the selected top-level
comments.
- 2–4 sentences, neutral and balanced.
- Do not include usernames or direct quotes; do not elevate outlier takes unless they are a
significant dissenting viewpoint.
Normalization and data quality requirements
- Trim all strings; collapse repeated whitespace.
- Ensure URLs are absolute (https preferred if available).
- Ensure numeric fields (id, points, comments_count) are integers.
- Ensure posted_at is a valid ISO 8601 UTC string with a trailing Z.
- If the post has no external article (self-post), source_url must be an empty string.
- If comments are disabled or not present, comments_count is 0. Do not include
comments_summary unless include_comments is true.
Error handling
- If the Hacker News front page cannot be loaded, the top story cannot be determined, or the
item metadata cannot be parsed: return a clear error with type and message.
- If the external article fails to load, times out, is blocked, or content extraction
fails/paywalled: proceed with the fallback summarization as described; only error if both
primary and fallback sources fail to produce any content to summarize.
- If absolutely no top-ranked story can be found: return a not-found error.
- Error format:
- { "error": { "type": "<ErrorType>", "message": "<human-readable message>", "context": {
...optional details... } } }
- Use specific types when possible (e.g., NetworkError, TimeoutError, ParseError,
NotFoundError, ExtractionError, PaywallError).
"""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {'error': {'type': 'ConfigurationError', 'message': 'OPENAI_API_KEY is not set'}}
    llm = ChatOpenAI(model="gpt-4o", api_key=api_key, temperature=0)
    # Pass the include_comments flag through to the agent's task so the prompt's Input section is honored
    full_task = f"{prompt}\n\nInput:\n- include_comments = {include_comments}"
    browser = Browser(headless=False)  # headless=False needs a display; consider headless=True when running on a server
try:
await browser.start()
agent = Agent(task=full_task, browser=browser, llm=llm)
        result = await asyncio.wait_for(agent.run(max_steps=15), timeout=180)
        # agent.run() returns an AgentHistoryList; surface its final answer when available
        final = result.final_result() if hasattr(result, 'final_result') else result
        return {'result': final}
    except asyncio.TimeoutError:
        return {'error': {'type': 'TimeoutError', 'message': 'Browser agent exceeded 180 seconds'}}
    except Exception as e:
        return {'error': {'type': type(e).__name__, 'message': str(e)}}
finally:
await browser.stop()
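# Sketch of the shape this tool aims to return (hypothetical values; the agent's final answer is
# surfaced under 'result', so the exact structure depends on how it follows the prompt above):
#   {'result': '{"id": 12345678, "title": "...", "source_url": "https://...", "by": "someuser",
#               "points": 321, "comments_count": 120, "posted_at": "2025-10-11T14:32:05Z",
#               "summary": "...", "comments_summary": "..."}'}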
@mcp.tool()
def get_page(path: str, user_id: Optional[str] = None) -> Dict[str, Any]:
"""
Get the HTML content of a webpage
Args:
path: Path to the webpage (e.g., "/blog/post-1")
user_id: Optional user identifier for rate limiting
Returns:
Dictionary with HTML content and metadata
"""
# Rate limiting check
identifier = user_id or 'default'
if not rate_limiter.is_allowed(identifier):
reset_time = rate_limiter.get_reset_time(identifier)
return {
'error': 'Rate limit exceeded',
'message': f'Too many requests. Please wait {reset_time} seconds before trying again.',
'reset_in_seconds': reset_time,
'limit': '10 requests per minute',
}
# Construct full URL
if not path.startswith('/'):
path = '/' + path
full_url = urljoin(BASE_URL, path)
try:
import requests
response = requests.get(full_url, timeout=10)
response.raise_for_status()
return {
'path': path,
'url': full_url,
'html': response.text,
'status_code': response.status_code,
'content_type': response.headers.get('Content-Type', 'unknown'),
}
except Exception as e:
return {
'error': 'Failed to fetch page',
'path': path,
'url': full_url,
'message': str(e),
}
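# Illustrative successful response (comments only), assuming BASE_URL='https://example.com':
#   get_page('/blog/post-1') -> {
#       'path': '/blog/post-1',
#       'url': 'https://example.com/blog/post-1',
#       'html': '<html>...</html>',
#       'status_code': 200,
#       'content_type': 'text/html; charset=utf-8',
#   }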
def main():
"""Main entry point for the MCP server"""
import argparse
# Parse command line arguments
parser = argparse.ArgumentParser(description='Webpage Server MCP Server')
parser.add_argument('--port', type=int, help='Port for HTTP transport')
parser.add_argument(
'--host', type=str, default='0.0.0.0', help='Host for HTTP transport'
)
parser.add_argument('--stdio', action='store_true', help='Force STDIO transport')
parser.add_argument('--test', action='store_true', help='Test mode')
args = parser.parse_args()
# Check if running in test mode
if args.test:
# Test mode - just verify everything loads
print('Webpage Server MCP Server loaded successfully')
print(f'Base URL: {BASE_URL}')
print(f'Sitemap Path: {SITEMAP_PATH}')
        print('Tools available: list_pages, get_page, get_hackernews_top_story')
print('Resources available: sitemap.xml')
return 0
# Determine transport mode
if (args.port or os.environ.get('PORT')) and not args.stdio:
        # HTTP transport mode (host/port were already applied to the FastMCP instance above)
        print(f'Starting HTTP server on {host}:{port}')
        print(f'MCP endpoint: http://{host}:{port}/mcp')
mcp.run(transport='streamable-http')
else:
# STDIO transport (default for MCP)
mcp.run('stdio')
return 0
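# Example invocations (illustrative):
#   python main.py --test          # verify configuration loads without starting a server
#   python main.py --port 8080     # streamable HTTP transport; MCP endpoint served at /mcp
#   python main.py --stdio         # STDIO transport, e.g. when launched by an MCP client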
if __name__ == '__main__':
    sys.exit(main())