Skip to main content
Glama
angrysky56

MCP Windows Website Downloader Server

download

Download entire documentation websites for offline RAG indexing by providing a URL, enabling configurable depth and asset capture.

Instructions

Download documentation website for RAG indexing

Input Schema

View as: Table | JSON Schema

| Name | Required | Description            | Default |
|------|----------|------------------------|---------|
| url  | Yes      | Documentation site URL | —       |

Implementation Reference

  • Registers the 'download' MCP tool and defines its input schema (requires 'url' parameter).
    @self.server.list_tools()
    async def handle_list_tools() -> List[types.Tool]:
        """Advertise the tools this server exposes: a single 'download' tool.

        Returns:
            A one-element list containing the 'download' tool, whose input
            schema requires a single string property, 'url'.
        """
        logger.info("Listing tools")
        # JSON Schema for the tool input: an object with one required string.
        download_schema = {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Documentation site URL"
                }
            },
            "required": ["url"]
        }
        available = [
            types.Tool(
                name="download",
                description="Download documentation website for RAG indexing",
                inputSchema=download_schema,
            )
        ]
        logger.info(f"Returning {len(available)} tools")
        return available
  • MCP call_tool handler for 'download': validates input, creates async task calling WebsiteDownloader.download(url), returns result.
    # Only the 'download' tool is supported by this server.
    if name != "download":
        raise ValueError(f"Unknown tool: {name}")
        
    # Validate input against the declared schema: 'url' is required.
    if not arguments or "url" not in arguments:
        raise ValueError("URL is required")
    
    url = arguments["url"]
    
    # Create download task with progress tracking
    async def download_with_progress():
        # Thin wrapper around the downloader that logs start/finish and
        # re-raises both cancellation and failure to the awaiting caller.
        try:
            logger.info(f"Starting download of {url}")
            result = await self.downloader.download(url)
            logger.info("Download complete")
            return result
        except asyncio.CancelledError:
            # Propagate cancellation so the task is cancelled cleanly.
            logger.info("Download task cancelled")
            raise
        except Exception as e:
            logger.error(f"Download failed: {str(e)}")
            raise
    
    # Track the task in self._tasks (presumably so the server can cancel
    # in-flight downloads on shutdown — TODO confirm against the class).
    task = asyncio.create_task(download_with_progress())
    self._tasks.add(task)
    try:
        result = await task
    finally:
        # Always drop the task from the registry, even on failure/cancel.
        self._tasks.remove(task)
        
    # MCP tools return content items; serialize the result dict as text.
    return [types.TextContent(
        type="text", 
        text=str(result)
    )]
  • Core implementation of website download: analyzes site, crawls pages up to max depth, downloads and rewrites assets/links, saves to directory with index.
    async def download(self, url: str) -> Dict[str, Any]:
        """Download a documentation website for offline RAG indexing.

        Crawls the site rooted at *url* up to an automatically chosen max
        depth, saving pages and assets under a per-domain directory inside
        ``self.output_dir``, then writes a ``rag_index.json`` summary.

        Args:
            url: Root URL of the documentation site to download.

        Returns:
            On success: ``{"status": "success", "path", "pages", "depth_used"}``.
            On failure: ``{"status": "error", "error": <message>}``.

        Raises:
            asyncio.CancelledError: Propagated so task cancellation works.
        """
        try:
            # Reset per-run crawl state.
            self.visited_urls.clear()
            self.current_domain = urlparse(url).netloc
            
            # Ensure we're using the configured output directory
            logger.info(f"Using output directory: {self.output_dir}")
            
            # Create site directory inside the output directory.
            # parents=True also creates output_dir itself if it is missing,
            # instead of failing with FileNotFoundError.
            self.site_dir = self.output_dir / clean_filename(self.current_domain)
            logger.info(f"Creating site directory at: {self.site_dir}")
            self.site_dir.mkdir(parents=True, exist_ok=True)
            
            logger.info(f"Starting download of {url} to {self.site_dir}")
            
            # Clean directory structure: assets grouped by type.
            assets_dir = self.site_dir / "assets"
            assets_dir.mkdir(exist_ok=True)
            for dir_name in ["css", "js", "images", "fonts", "other"]:
                (assets_dir / dir_name).mkdir(exist_ok=True)
    
            # Browser-like headers — some documentation hosts reject
            # requests with unknown user agents.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5'
            }
            
            async with aiohttp.ClientSession(headers=headers) as session:
                # Analyze site structure to choose a crawl depth, then crawl.
                self.max_depth = await self._analyze_site_structure(session, url)
                logger.info(f"Using max depth of {self.max_depth} for this site")
                
                # Start the crawl from the root page.
                await self._process_page(session, url)
                
            # Summary index consumed by downstream RAG tooling.
            index = {
                "url": url,
                "domain": self.current_domain,
                "pages": len(self.visited_urls),
                "path": str(self.site_dir),
                "max_depth_used": self.max_depth
            }
            
            index_path = self.site_dir / "rag_index.json"
            # Explicit encoding so the index is portable across platforms
            # (the default locale encoding on Windows is not UTF-8).
            with open(index_path, "w", encoding="utf-8") as f:
                json.dump(index, f, indent=2)
            
            logger.info(f"Download complete. {len(self.visited_urls)} pages saved to {self.site_dir}")
            
            return {
                "status": "success",
                "path": str(self.site_dir),
                "pages": len(self.visited_urls),
                "depth_used": self.max_depth
            }
            
        except asyncio.CancelledError:
            # Re-raise so the enclosing asyncio task can be cancelled cleanly.
            logger.info("Download cancelled")
            raise
        except Exception as e:
            # Boundary handler: report failure as a result dict rather than
            # raising, so the MCP tool call returns a structured error.
            logger.error(f"Download failed: {str(e)}", exc_info=True)
            return {
                "status": "error",
                "error": str(e)
            }
  • JSON Schema definition for the 'download' tool input: object with required 'url' string.
    # Declaration of the MCP 'download' tool. The inputSchema is JSON
    # Schema: an object with a single required string property, 'url'.
    types.Tool(
        name="download",
        description="Download documentation website for RAG indexing",
        inputSchema={
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Documentation site URL"
                }
            },
            "required": ["url"]
        }
    )
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/angrysky56/mcp-windows-website-downloader'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.