download

download

Download entire documentation websites for offline RAG indexing by providing a URL, enabling configurable depth and asset capture.

Instructions

Download documentation website for RAG indexing

Input Schema

TableJSON Schema

Name	Required	Description	Default
`url`	Yes	Documentation site URL

Implementation Reference

src/mcp_windows_website_downloader/server.py:40-60 (registration)
Registers the 'download' MCP tool and defines its input schema (requires 'url' parameter).
@self.server.list_tools() async def handle_list_tools() -> List[types.Tool]: logger.info("Listing tools") tools = [ types.Tool( name="download", description="Download documentation website for RAG indexing", inputSchema={ "type": "object", "properties": { "url": { "type": "string", "description": "Documentation site URL" } }, "required": ["url"] } ) ] logger.info(f"Returning {len(tools)} tools") return tools
src/mcp_windows_website_downloader/server.py:71-103 (handler)
MCP call_tool handler for 'download': validates input, creates async task calling WebsiteDownloader.download(url), returns result.
if name != "download": raise ValueError(f"Unknown tool: {name}") if not arguments or "url" not in arguments: raise ValueError("URL is required") url = arguments["url"] # Create download task with progress tracking async def download_with_progress(): try: logger.info(f"Starting download of {url}") result = await self.downloader.download(url) logger.info("Download complete") return result except asyncio.CancelledError: logger.info("Download task cancelled") raise except Exception as e: logger.error(f"Download failed: {str(e)}") raise task = asyncio.create_task(download_with_progress()) self._tasks.add(task) try: result = await task finally: self._tasks.remove(task) return [types.TextContent( type="text", text=str(result) )]
src/mcp_windows_website_downloader/downloader.py:88-157 (handler)
Core implementation of website download: analyzes site, crawls pages up to max depth, downloads and rewrites assets/links, saves to directory with index.
async def download(self, url: str) -> Dict[str, Any]: """Download a documentation website""" try: # Reset state self.visited_urls.clear() self.current_domain = urlparse(url).netloc # Ensure we're using the configured output directory logger.info(f"Using output directory: {self.output_dir}") # Create site directory inside the output directory self.site_dir = self.output_dir / clean_filename(self.current_domain) logger.info(f"Creating site directory at: {self.site_dir}") self.site_dir.mkdir(exist_ok=True) logger.info(f"Starting download of {url} to {self.site_dir}") # Create clean directory structure assets_dir = self.site_dir / "assets" assets_dir.mkdir(exist_ok=True) for dir_name in ["css", "js", "images", "fonts", "other"]: (assets_dir / dir_name).mkdir(exist_ok=True) # Configure session headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5' } async with aiohttp.ClientSession(headers=headers) as session: # Analyze site and set depth self.max_depth = await self._analyze_site_structure(session, url) logger.info(f"Using max depth of {self.max_depth} for this site") # Start download await self._process_page(session, url) # Create index index = { "url": url, "domain": self.current_domain, "pages": len(self.visited_urls), "path": str(self.site_dir), "max_depth_used": self.max_depth } index_path = self.site_dir / "rag_index.json" with open(index_path, "w") as f: json.dump(index, f, indent=2) logger.info(f"Download complete. {len(self.visited_urls)} pages saved to {self.site_dir}") return { "status": "success", "path": str(self.site_dir), "pages": len(self.visited_urls), "depth_used": self.max_depth } except asyncio.CancelledError: logger.info("Download cancelled") raise except Exception as e: logger.error(f"Download failed: {str(e)}", exc_info=True) return { "status": "error", "error": str(e) }
src/mcp_windows_website_downloader/server.py:44-57 (schema)
JSON Schema definition for the 'download' tool input: object with required 'url' string.
types.Tool( name="download", description="Download documentation website for RAG indexing", inputSchema={ "type": "object", "properties": { "url": { "type": "string", "description": "Documentation site URL" } }, "required": ["url"] } )

MCP Windows Website Downloader Server

Instructions

Input Schema

Implementation Reference

Other Tools

Latest Blog Posts

MCP directory API