MCP Web Tools Server

  • tools: Crawl4AI web scraping tool (module source below)
""" Crawl4AI web scraping tool for MCP server. This module provides advanced web scraping functionality using Crawl4AI. It extracts content from web pages, removes non-essential elements like navigation bars, footers, and sidebars, and returns well-formatted markdown that preserves document structure including headings, code blocks, tables, and image references. Features: - Clean content extraction with navigation, sidebar, and footer removal - Preserves document structure (headings, lists, tables, code blocks) - Automatic conversion to well-formatted markdown - Support for JavaScript-rendered content - Content filtering to focus on the main article/content - Comprehensive error handling """ import asyncio import os import re import logging from typing import Optional from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger("crawl4ai_scraper") async def crawl_and_extract_markdown(url: str, query: Optional[str] = None) -> str: """ Crawl a webpage and extract well-formatted markdown content. Args: url: The URL to crawl query: Optional search query to focus content on (if None, extracts main content) Returns: str: Well-formatted markdown content from the webpage Raises: Exception: If crawling fails or content extraction encounters errors """ try: # Configure the browser for optimal rendering browser_config = BrowserConfig( headless=True, viewport_width=1920, # Wider viewport to capture more content viewport_height=1080, # Taller viewport for the same reason java_script_enabled=True, text_mode=False, # Set to False to ensure all content is loaded user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) # Create a content filter for removing unwanted elements content_filter = PruningContentFilter( threshold=0.1, # Very low threshold to keep more content threshold_type="dynamic", # Dynamic threshold based on page content min_word_threshold=2 # Include very short text blocks for headings/code ) # Configure markdown generator with options for structure preservation markdown_generator = DefaultMarkdownGenerator( content_filter=content_filter, options={ "body_width": 0, # No wrapping "ignore_images": False, # Keep image references "citations": True, # Include link citations "escape_html": False, # Don't escape HTML in code blocks "include_sup_sub": True, # Preserve superscript/subscript "pad_tables": True, # Better table formatting "mark_code": True, # Better code block preservation "code_language": "", # Default code language "wrap_links": False # Preserve link formatting } ) # Configure the crawler run for optimal structure extraction run_config = CrawlerRunConfig( verbose=False, # Content filtering markdown_generator=markdown_generator, word_count_threshold=2, # Extremely low to include very short text blocks # Tag exclusions - remove unwanted elements excluded_tags=["nav", "footer", "aside"], excluded_selector=".nav, .navbar, .sidebar, .footer, #footer, #sidebar, " + ".ads, .advertisement, .navigation, #navigation, " + ".menu, #menu, .toc, .table-of-contents", # Wait conditions for JS content wait_until="networkidle", wait_for="css:pre, code, h1, h2, h3, table", # Wait for important structural elements 
page_timeout=60000, # Don't limit to specific selectors to get full content css_selector=None, # Other options remove_overlay_elements=True, # Remove modal popups remove_forms=True, # Remove forms scan_full_page=True, # Scan the full page scroll_delay=0.5, # Slower scroll for better content loading cache_mode=CacheMode.BYPASS # Bypass cache for fresh content ) # Create crawler and run it async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=url, config=run_config) if not result.success: raise Exception(f"Crawl failed: {result.error_message}") # Extract the title from metadata if available title = "Untitled Document" if result.metadata and "title" in result.metadata: title = result.metadata["title"] # Choose the best markdown content markdown_content = "" # Try to get the best version of the markdown if hasattr(result, "markdown_v2") and result.markdown_v2: if hasattr(result.markdown_v2, 'raw_markdown') and result.markdown_v2.raw_markdown: markdown_content = result.markdown_v2.raw_markdown elif hasattr(result.markdown_v2, 'markdown_with_citations') and result.markdown_v2.markdown_with_citations: markdown_content = result.markdown_v2.markdown_with_citations elif hasattr(result, "markdown") and result.markdown: if isinstance(result.markdown, str): markdown_content = result.markdown elif hasattr(result.markdown, 'raw_markdown'): markdown_content = result.markdown.raw_markdown elif result.cleaned_html: from html2text import html2text markdown_content = html2text(result.cleaned_html) # Post-process the markdown to fix common issues # 1. Fix code blocks - ensure they have proper formatting markdown_content = re.sub(r'```\s*\n', '```python\n', markdown_content) # 2. Fix broken headings - ensure space after # characters markdown_content = re.sub(r'^(#{1,6})([^#\s])', r'\1 \2', markdown_content, flags=re.MULTILINE) # 3. Add spacing between sections for readability markdown_content = re.sub(r'(\n#{1,6} .+?\n)(?=[^\n])', r'\1\n', markdown_content) # 4. Fix bullet points - ensure proper spacing markdown_content = re.sub(r'^\*([^\s])', r'* \1', markdown_content, flags=re.MULTILINE) # 5. 
Format the final content with title and URL final_content = f"Title: {title}\n\nURL Source: {result.url}\n\nMarkdown Content:\n{markdown_content}" return final_content except Exception as e: logger.error(f"Error crawling {url}: {str(e)}") raise Exception(f"Error crawling {url}: {str(e)}") # Standalone test functionality if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Extract structured markdown content from a webpage") parser.add_argument("url", nargs="?", default="https://docs.llamaindex.ai/en/stable/understanding/agent/", help="URL to crawl (default: https://docs.llamaindex.ai/en/stable/understanding/agent/)") parser.add_argument("--output", help="Output file to save the markdown (default: scraped_content.md)") parser.add_argument("--query", help="Optional search query to focus content") args = parser.parse_args() async def test(): url = args.url print(f"Scraping {url}...") try: if args.query: result = await crawl_and_extract_markdown(url, args.query) else: result = await crawl_and_extract_markdown(url) # Show preview of content preview_length = min(1000, len(result)) print("\nResult Preview (first 1000 chars):") print(result[:preview_length] + "...\n" if len(result) > preview_length else result) # Print statistics print(f"\nMarkdown length: {len(result)} characters") # Save to file output_file = args.output if args.output else "scraped_content.md" with open(output_file, "w", encoding="utf-8") as f: f.write(result) print(f"Full content saved to '{output_file}'") return 0 except Exception as e: print(f"Error: {str(e)}") return 1 # Run the test function in an async event loop exit_code = asyncio.run(test()) import sys sys.exit(exit_code)
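The module above only defines the scraping function and a standalone test entry point; the MCP server wiring itself is not shown in this section. As a minimal sketch of how the tool might be registered, assuming the official MCP Python SDK's FastMCP helper, a hypothetical server name "web-tools", and that the module is saved as crawl4ai_scraper.py (all three are assumptions, not taken from the source):

```python
# Hypothetical wiring sketch - the real server module is not part of this section.
# Assumes: pip install "mcp[cli]" crawl4ai, and the module above saved as
# crawl4ai_scraper.py; the server and tool names here are illustrative only.
from typing import Optional

from mcp.server.fastmcp import FastMCP

from crawl4ai_scraper import crawl_and_extract_markdown

mcp = FastMCP("web-tools")  # hypothetical server name


@mcp.tool()
async def scrape_webpage(url: str, query: Optional[str] = None) -> str:
    """Crawl a web page and return its main content as clean markdown."""
    return await crawl_and_extract_markdown(url, query)


if __name__ == "__main__":
    mcp.run()  # stdio transport by default
```

With a server like this running, any MCP-capable client can invoke the scrape_webpage tool and receive the Title / URL Source / Markdown Content text produced by crawl_and_extract_markdown.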