# Agent Construct (by batteryshark) - agent_construct/tools
"""
URL Scraper Tool - Converts webpage content to markdown, with support for JavaScript-rendered content
"""
import re
import logging
import asyncio
import requests
import urllib3
from typing import Any, Dict, Optional
from markdownify import markdownify
from requests.exceptions import RequestException
from http.client import HTTPException
from pydantic import BaseModel, Field
from mcp_server.utils.tool_decorator import mcp_tool
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
# Configure logging
logger = logging.getLogger(__name__)
# Tool name and shared default settings (used by both the decorator
# defaults and the runtime fallback in scrape_url)
TOOL_NAME = "url_scraper"
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
DEFAULT_TIMEOUT = 30  # Seconds; generous enough to cover JS rendering
class URLScraperInput(BaseModel):
"""Input model for URL scraping."""
url: str = Field(description="The URL of the webpage to scrape")
render_js: bool = Field(
default=False,
description="Whether to render JavaScript before scraping (slower but more accurate for dynamic content)"
)
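
# Illustrative payload for this model (example values only, not from the repo):
#   URLScraperInput(url="https://example.com", render_js=True)
# which corresponds to the JSON arguments an MCP client would send when
# calling this tool:
#   {"url": "https://example.com", "render_js": true}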
async def scrape_with_playwright(url: str, config: Dict[str, Any]) -> Dict:
    """Scrape content using Playwright with JavaScript rendering."""
    try:
        async with async_playwright() as p:
            # Launch a headless browser with a custom user agent
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent=config["user_agent"],
                    viewport={"width": 1920, "height": 1080}  # Standard desktop viewport
                )
                # Create a new page and navigate
                page = await context.new_page()
                try:
                    # First try waiting for the full load event
                    await page.goto(url, wait_until="load", timeout=config["timeout"] * 1000)
                except PlaywrightTimeout:
                    logger.warning("Initial page load timed out, retrying with domcontentloaded")
                    # If that fails, wait only for the DOM to be parsed
                    await page.goto(url, wait_until="domcontentloaded", timeout=config["timeout"] * 1000)
                try:
                    # Wait for the network to go quiet so dynamic content can settle
                    await page.wait_for_load_state("networkidle", timeout=5000)
                except PlaywrightTimeout:
                    logger.warning("Network idle wait timed out, proceeding with current state")
                try:
                    # Scroll to the bottom to trigger lazy-loaded content
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await asyncio.sleep(2)  # Brief pause for content to load
                except Exception as e:
                    logger.warning(f"Scroll attempt failed: {e}")
                # Capture the fully rendered HTML
                content = await page.content()
            finally:
                # Always release the browser, even if navigation raised
                await browser.close()
        # Convert to markdown and collapse runs of blank lines
        markdown_content = markdownify(content).strip()
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return {
            "status": "success",
            "content": markdown_content
        }
    except Exception as e:
        logger.error(f"Error scraping with Playwright: {str(e)}")
        return {
            "status": "error",
            "error": f"Error scraping with Playwright: {str(e)}"
        }
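
# Direct-invocation sketch for the Playwright path (assumes Chromium has been
# installed via `playwright install chromium`; values are examples only):
#   asyncio.run(scrape_with_playwright(
#       "https://example.com",
#       {"user_agent": DEFAULT_USER_AGENT, "timeout": 30},
#   ))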
def scrape_with_urllib3(url: str, config: Dict[str, Any]) -> Dict:
"""Fallback scraping implementation using urllib3 for cases where requests fails."""
try:
http = urllib3.PoolManager(
headers={"User-Agent": config["user_agent"]},
timeout=urllib3.Timeout(connect=config["timeout"], read=config["timeout"])
)
response = http.request("GET", url)
        if response.status >= 400:
            raise urllib3.exceptions.HTTPError(f"HTTP {response.status}")
        # Decode defensively: replace any bytes that are not valid UTF-8
        markdown_content = markdownify(response.data.decode("utf-8", errors="replace")).strip()
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
return {
"status": "success",
"content": markdown_content
}
except Exception as e:
logger.error(f"Error fetching webpage with urllib3: {str(e)}")
return {
"status": "error",
"error": f"Error fetching webpage with urllib3: {str(e)}"
}
def scrape_with_requests(url: str, config: Dict[str, Any]) -> Dict:
"""Simple scraping implementation using requests."""
try:
headers = {"User-Agent": config["user_agent"]}
response = requests.get(url, headers=headers, timeout=config["timeout"])
response.raise_for_status()
markdown_content = markdownify(response.text).strip()
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
return {
"status": "success",
"content": markdown_content
}
except (HTTPException, RequestException) as e:
# If we hit header limits or other request issues, try urllib3
logger.warning(f"Requests failed, falling back to urllib3: {str(e)}")
return scrape_with_urllib3(url, config)
except Exception as e:
logger.error(f"Unexpected error: {str(e)}")
return {
"status": "error",
"error": f"An unexpected error occurred: {str(e)}"
}
@mcp_tool(
name=TOOL_NAME,
description="Scrapes a webpage and returns its content as markdown, with optional JavaScript rendering support",
input_model=URLScraperInput,
required_env_vars=[],
    config_defaults={
        "user_agent": DEFAULT_USER_AGENT,
        "timeout": DEFAULT_TIMEOUT,
    },
rate_limit=50,
rate_limit_window=60
)
async def scrape_url(url: str, render_js: bool = False, config: Optional[Dict[str, Any]] = None) -> Dict:
"""Scrape content from a URL and convert it to markdown.
Args:
url: The URL to scrape
render_js: Whether to render JavaScript before scraping
config: Configuration dictionary containing user_agent and timeout settings
Returns:
Dictionary containing the scraped content and metadata
"""
    if config is None:
        config = {}
    # Fill in any missing settings from the module-level defaults
    default_config = {
        "user_agent": DEFAULT_USER_AGENT,
        "timeout": DEFAULT_TIMEOUT,
    }
    config = {**default_config, **config}
if render_js:
return await scrape_with_playwright(url, config)
else:
return scrape_with_requests(url, config)
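
# Minimal local usage sketch. This assumes scrape_url remains directly
# awaitable after the @mcp_tool decorator is applied; if the decorator wraps
# it into an MCP-only entry point, invoke it through the server instead.
if __name__ == "__main__":
    import sys

    async def _demo() -> None:
        target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
        result = await scrape_url(target, render_js=False)
        if result["status"] == "success":
            print(result["content"][:500])  # Preview the first 500 characters
        else:
            print(f"Scrape failed: {result['error']}")

    asyncio.run(_demo())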