MCP Web Tools Server

  • tools: Crawl4AI web scraping tool (module source below)
""" Crawl4AI web scraping tool for MCP server. This module provides advanced web scraping functionality using Crawl4AI. It extracts content from web pages, removes non-essential elements like navigation bars, footers, and sidebars, and returns well-formatted markdown that preserves document structure including headings, code blocks, tables, and image references. Features: - Clean content extraction with navigation, sidebar, and footer removal - Preserves document structure (headings, lists, tables, code blocks) - Automatic conversion to well-formatted markdown - Support for JavaScript-rendered content - Content filtering to focus on the main article/content - Comprehensive error handling """ import asyncio import os import re import logging from typing import Optional from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger("crawl4ai_scraper") async def crawl_and_extract_markdown(url: str, query: Optional[str] = None) -> str: """ Crawl a webpage and extract well-formatted markdown content. Args: url: The URL to crawl query: Optional search query to focus content on (if None, extracts main content) Returns: str: Well-formatted markdown content from the webpage Raises: Exception: If crawling fails or content extraction encounters errors """ try: # Configure the browser for optimal rendering browser_config = BrowserConfig( headless=True, viewport_width=1920, # Wider viewport to capture more content viewport_height=1080, # Taller viewport for the same reason java_script_enabled=True, text_mode=False, # Set to False to ensure all content is loaded user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) # Create a content filter for removing unwanted elements content_filter = PruningContentFilter( threshold=0.1, # Very low threshold to keep more content threshold_type="dynamic", # Dynamic threshold based on page content min_word_threshold=2 # Include very short text blocks for headings/code ) # Configure markdown generator with options for structure preservation markdown_generator = DefaultMarkdownGenerator( content_filter=content_filter, options={ "body_width": 0, # No wrapping "ignore_images": False, # Keep image references "citations": True, # Include link citations "escape_html": False, # Don't escape HTML in code blocks "include_sup_sub": True, # Preserve superscript/subscript "pad_tables": True, # Better table formatting "mark_code": True, # Better code block preservation "code_language": "", # Default code language "wrap_links": False # Preserve link formatting } ) # Configure the crawler run for optimal structure extraction run_config = CrawlerRunConfig( verbose=False, # Content filtering markdown_generator=markdown_generator, word_count_threshold=2, # Extremely low to include very short text blocks # Tag exclusions - remove unwanted elements excluded_tags=["nav", "footer", "aside"], excluded_selector=".nav, .navbar, .sidebar, .footer, #footer, #sidebar, " + ".ads, .advertisement, .navigation, #navigation, " + ".menu, #menu, .toc, .table-of-contents", # Wait conditions for JS content wait_until="networkidle", wait_for="css:pre, code, h1, h2, h3, table", # Wait for important structural elements 
page_timeout=60000, # Don't limit to specific selectors to get full content css_selector=None, # Other options remove_overlay_elements=True, # Remove modal popups remove_forms=True, # Remove forms scan_full_page=True, # Scan the full page scroll_delay=0.5, # Slower scroll for better content loading cache_mode=CacheMode.BYPASS # Bypass cache for fresh content ) # Create crawler and run it async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=url, config=run_config) if not result.success: raise Exception(f"Crawl failed: {result.error_message}") # Extract the title from metadata if available title = "Untitled Document" if result.metadata and "title" in result.metadata: title = result.metadata["title"] # Choose the best markdown content markdown_content = "" # Try to get the best version of the markdown if hasattr(result, "markdown_v2") and result.markdown_v2: if hasattr(result.markdown_v2, 'raw_markdown') and result.markdown_v2.raw_markdown: markdown_content = result.markdown_v2.raw_markdown elif hasattr(result.markdown_v2, 'markdown_with_citations') and result.markdown_v2.markdown_with_citations: markdown_content = result.markdown_v2.markdown_with_citations elif hasattr(result, "markdown") and result.markdown: if isinstance(result.markdown, str): markdown_content = result.markdown elif hasattr(result.markdown, 'raw_markdown'): markdown_content = result.markdown.raw_markdown elif result.cleaned_html: from html2text import html2text markdown_content = html2text(result.cleaned_html) # Post-process the markdown to fix common issues # 1. Fix code blocks - ensure they have proper formatting markdown_content = re.sub(r'```\s*\n', '```python\n', markdown_content) # 2. Fix broken headings - ensure space after # characters markdown_content = re.sub(r'^(#{1,6})([^#\s])', r'\1 \2', markdown_content, flags=re.MULTILINE) # 3. Add spacing between sections for readability markdown_content = re.sub(r'(\n#{1,6} .+?\n)(?=[^\n])', r'\1\n', markdown_content) # 4. Fix bullet points - ensure proper spacing markdown_content = re.sub(r'^\*([^\s])', r'* \1', markdown_content, flags=re.MULTILINE) # 5. 
Format the final content with title and URL final_content = f"Title: {title}\n\nURL Source: {result.url}\n\nMarkdown Content:\n{markdown_content}" return final_content except Exception as e: logger.error(f"Error crawling {url}: {str(e)}") raise Exception(f"Error crawling {url}: {str(e)}") # Standalone test functionality if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Extract structured markdown content from a webpage") parser.add_argument("url", nargs="?", default="https://docs.llamaindex.ai/en/stable/understanding/agent/", help="URL to crawl (default: https://docs.llamaindex.ai/en/stable/understanding/agent/)") parser.add_argument("--output", help="Output file to save the markdown (default: scraped_content.md)") parser.add_argument("--query", help="Optional search query to focus content") args = parser.parse_args() async def test(): url = args.url print(f"Scraping {url}...") try: if args.query: result = await crawl_and_extract_markdown(url, args.query) else: result = await crawl_and_extract_markdown(url) # Show preview of content preview_length = min(1000, len(result)) print("\nResult Preview (first 1000 chars):") print(result[:preview_length] + "...\n" if len(result) > preview_length else result) # Print statistics print(f"\nMarkdown length: {len(result)} characters") # Save to file output_file = args.output if args.output else "scraped_content.md" with open(output_file, "w", encoding="utf-8") as f: f.write(result) print(f"Full content saved to '{output_file}'") return 0 except Exception as e: print(f"Error: {str(e)}") return 1 # Run the test function in an async event loop exit_code = asyncio.run(test()) import sys sys.exit(exit_code)
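The module above only defines the scraping function and a standalone test entry point; the MCP server wiring itself is not shown in this section. As a minimal sketch of how the tool might be registered, assuming the official MCP Python SDK's FastMCP helper, a hypothetical server name "web-tools", and that the module is saved as crawl4ai_scraper.py (all three are assumptions, not taken from the source):

```python
# Hypothetical wiring sketch - the real server module is not part of this section.
# Assumes: pip install "mcp[cli]" crawl4ai, and the module above saved as
# crawl4ai_scraper.py; the server and tool names here are illustrative only.
from typing import Optional

from mcp.server.fastmcp import FastMCP

from crawl4ai_scraper import crawl_and_extract_markdown

mcp = FastMCP("web-tools")  # hypothetical server name


@mcp.tool()
async def scrape_webpage(url: str, query: Optional[str] = None) -> str:
    """Crawl a web page and return its main content as clean markdown."""
    return await crawl_and_extract_markdown(url, query)


if __name__ == "__main__":
    mcp.run()  # stdio transport by default
```

With a server like this running, any MCP-capable client can invoke the scrape_webpage tool and receive the Title / URL Source / Markdown Content text produced by crawl_and_extract_markdown.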