MCP Windows Website Downloader Server
by angrysky56

- mcp-windows-website-downloader
  - src
    - mcp_windows_website_downloader
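The module below is the core of the downloader: it analyzes a site's structure to pick a crawl depth, crawls same-domain pages, rewrites asset and link references to point at local copies, and writes a small rag_index.json summary alongside the saved pages.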
"""
Core website downloading functionality.
"""
import logging
import asyncio
from pathlib import Path
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import json
import re
from typing import Dict, Any, Optional, Set
from .utils import clean_filename
logger = logging.getLogger(__name__)

class WebsiteDownloader:
    """Downloads and processes documentation websites"""

    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        logger.info(f"Downloader initialized with output directory: {self.output_dir}")
        if not self.output_dir.exists():
            self.output_dir.mkdir(parents=True)
            logger.info(f"Created output directory at {self.output_dir}")
        self.visited_urls = set()
        self.current_domain = None
        self.site_dir = None
        self.max_depth = 2  # Default, will be adjusted based on site analysis

    async def _analyze_site_structure(self, session: aiohttp.ClientSession, url: str) -> int:
        """
        Analyze the site structure to determine appropriate crawl depth.
        Returns recommended max depth.
        """
        try:
            logger.info("Analyzing site structure...")
            async with session.get(url) as response:
                if response.status != 200:
                    return self.max_depth

                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                # Look for common documentation patterns: semantic nav/menu tags plus
                # elements whose class suggests a sidebar, menu, or table of contents
                nav_elements = soup.find_all(['nav', 'menu'])
                nav_elements += soup.find_all(class_=re.compile(r'sidebar|menu|toc', re.I))
                has_nav = len(nav_elements) > 0

                # Check URL patterns
                path = urlparse(url).path
                is_docs_url = any(x in path.lower() for x in ['/docs/', '/documentation/', '/guide/', '/tutorial/'])

                # Check for documentation frameworks
                is_sphinx = bool(soup.find('div', {'class': 'sphinxsidebar'}))
                is_mkdocs = bool(soup.find('nav', {'class': 'md-nav'}))
                is_docusaurus = bool(soup.find('nav', {'class': 'menu'}))

                # Analyze link structure
                links = set()
                for a in soup.find_all('a', href=True):
                    href = a['href']
                    if not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
                        full_url = urljoin(url, href)
                        if urlparse(full_url).netloc == self.current_domain:
                            links.add(full_url)

                # Determine appropriate depth
                if is_sphinx or is_mkdocs or is_docusaurus:
                    # Known documentation frameworks usually need more depth
                    depth = 4
                elif is_docs_url and has_nav:
                    # Looks like structured documentation
                    depth = 3
                elif len(links) > 100:
                    # Large site, be conservative
                    depth = 2
                else:
                    # Small or unknown site structure
                    depth = 2

                logger.info(f"Site analysis complete. Recommended depth: {depth}")
                return depth

        except Exception as e:
            logger.warning(f"Site analysis failed: {str(e)}")
            return self.max_depth

    async def download(self, url: str) -> Dict[str, Any]:
        """Download a documentation website"""
        try:
            # Reset state
            self.visited_urls.clear()
            self.current_domain = urlparse(url).netloc

            # Ensure we're using the configured output directory
            logger.info(f"Using output directory: {self.output_dir}")

            # Create site directory inside the output directory
            self.site_dir = self.output_dir / clean_filename(self.current_domain)
            logger.info(f"Creating site directory at: {self.site_dir}")
            self.site_dir.mkdir(exist_ok=True)

            logger.info(f"Starting download of {url} to {self.site_dir}")

            # Create clean directory structure
            assets_dir = self.site_dir / "assets"
            assets_dir.mkdir(exist_ok=True)
            for dir_name in ["css", "js", "images", "fonts", "other"]:
                (assets_dir / dir_name).mkdir(exist_ok=True)

            # Configure session
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5'
            }

            async with aiohttp.ClientSession(headers=headers) as session:
                # Analyze site and set depth
                self.max_depth = await self._analyze_site_structure(session, url)
                logger.info(f"Using max depth of {self.max_depth} for this site")

                # Start download
                await self._process_page(session, url)

                # Create index
                index = {
                    "url": url,
                    "domain": self.current_domain,
                    "pages": len(self.visited_urls),
                    "path": str(self.site_dir),
                    "max_depth_used": self.max_depth
                }
                index_path = self.site_dir / "rag_index.json"
                with open(index_path, "w") as f:
                    json.dump(index, f, indent=2)

                logger.info(f"Download complete. {len(self.visited_urls)} pages saved to {self.site_dir}")
                return {
                    "status": "success",
                    "path": str(self.site_dir),
                    "pages": len(self.visited_urls),
                    "depth_used": self.max_depth
                }

        except asyncio.CancelledError:
            logger.info("Download cancelled")
            raise
        except Exception as e:
            logger.error(f"Download failed: {str(e)}", exc_info=True)
            return {
                "status": "error",
                "error": str(e)
            }

    async def _process_page(self, session: aiohttp.ClientSession, url: str, depth: int = 0) -> Optional[str]:
        """Process a single page and its assets"""
        if url in self.visited_urls or depth > self.max_depth:
            return None

        self.visited_urls.add(url)
        logger.info(f"Processing {url} (depth {depth}/{self.max_depth})")

        try:
            async with session.get(url) as response:
                if response.status != 200:
                    logger.warning(f"Failed to get {url}: {response.status}")
                    return None

                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                # Save processed page first
                save_path = self._get_save_path(url)
                if not save_path:
                    logger.warning(f"Invalid save path for {url}")
                    return None
                save_path.parent.mkdir(parents=True, exist_ok=True)

                # Handle assets before saving page
                await self._handle_assets(session, soup, url)

                # Process internal links
                await self._process_links(session, soup, url, depth)

                with open(save_path, "w", encoding="utf-8") as f:
                    f.write(str(soup))

                # Return a POSIX-style relative path so rewritten links render correctly
                # (str(Path) would yield backslashes on Windows)
                return save_path.relative_to(self.site_dir).as_posix()

        except Exception as e:
            logger.warning(f"Error processing {url}: {str(e)}")
            return None

    async def _handle_assets(self, session: aiohttp.ClientSession, soup: BeautifulSoup, base_url: str):
        """Download and update page assets"""
        for tag, attr in [("link", "href"), ("script", "src"), ("img", "src")]:
            for elem in soup.find_all(tag, {attr: True}):
                src = elem[attr]
                if src.startswith(("data:", "blob:", "javascript:", "#", "mailto:")):
                    continue

                try:
                    full_url = urljoin(base_url, src)
                    if urlparse(full_url).netloc != self.current_domain:
                        continue

                    async with session.get(full_url) as response:
                        if response.status == 200:
                            content = await response.read()
                            save_path = self._save_asset(full_url, content)
                            if save_path:
                                # Rewrite the reference with forward slashes so the saved
                                # HTML works in a browser even on Windows
                                elem[attr] = save_path.as_posix()
                except Exception as e:
                    logger.warning(f"Asset error ({src}): {str(e)}")

    async def _process_links(self, session: aiohttp.ClientSession, soup: BeautifulSoup, url: str, depth: int):
        """Process and update page links"""
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith(("#", "mailto:", "tel:", "javascript:")):
                continue

            try:
                full_url = urljoin(url, href)
                if urlparse(full_url).netloc == self.current_domain:
                    if new_path := await self._process_page(session, full_url, depth + 1):
                        a["href"] = f"/{new_path}"
            except Exception as e:
                logger.warning(f"Link error ({href}): {str(e)}")

    def _save_asset(self, url: str, content: bytes) -> Optional[Path]:
        """Save an asset file"""
        try:
            # Get clean filename from URL
            path = urlparse(url).path.lstrip("/")
            if not path:
                return None
            filename = clean_filename(unquote(path))

            # Determine asset type and directory
            if url.endswith((".css", ".scss")):
                asset_dir = "css"
            elif url.endswith((".js", ".mjs")):
                asset_dir = "js"
            elif url.endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp")):
                asset_dir = "images"
            elif url.endswith((".woff", ".woff2", ".ttf", ".eot")):
                asset_dir = "fonts"
            else:
                asset_dir = "other"

            # Create relative asset path ensuring it's under the configured directory
            rel_path = Path("assets") / asset_dir / filename
            full_path = self.site_dir / rel_path

            # Ensure we're not trying to write outside the site directory
            if not str(full_path).startswith(str(self.site_dir)):
                logger.warning(f"Attempted to write asset outside site directory: {full_path}")
                return None

            # Ensure parent directory exists
            full_path.parent.mkdir(parents=True, exist_ok=True)

            # Write content
            with open(full_path, "wb") as f:
                f.write(content)

            return rel_path

        except Exception as e:
            logger.warning(f"Failed to save asset {url}: {str(e)}")
            return None

    def _get_save_path(self, url: str) -> Optional[Path]:
        """Get file system path for saving page"""
        try:
            # Get clean path from URL
            path = urlparse(url).path.lstrip("/")
            if not path:
                path = "index.html"
            elif not path.endswith((".html", ".htm")):
                path = f"{path}.html"

            # Clean the path and create Path object
            clean_path = clean_filename(unquote(path))
            save_path = self.site_dir / clean_path

            # Safety check: ensure we're not writing outside the site directory.
            # (The user already controls the output location via their app config JSON,
            # so this is arguably redundant, but it is kept as a safety net.)
            if not str(save_path).startswith(str(self.site_dir)):
                logger.warning(f"Attempted to save page outside site directory: {save_path}")
                return None

            return save_path

        except Exception as e:
            logger.warning(f"Invalid save path for {url}: {str(e)}")
            return None
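
For reference, here is a minimal sketch of driving the downloader directly, outside the MCP server. The import path, output directory, and URL below are placeholders, since only the package layout (not the module's filename) is shown above.

import asyncio
from pathlib import Path

# Hypothetical import: adjust to the actual module filename inside
# mcp_windows_website_downloader, which is not shown above.
from mcp_windows_website_downloader import WebsiteDownloader

async def main():
    # Placeholder output directory and documentation URL
    downloader = WebsiteDownloader(Path("C:/docs-downloads"))
    result = await downloader.download("https://example.com/docs/")
    print(result)  # e.g. {"status": "success", "path": "...", "pages": 12, "depth_used": 3}

if __name__ == "__main__":
    asyncio.run(main())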