download

Download an entire documentation website for offline RAG indexing from a single URL; the crawl depth is chosen automatically from the site's structure, and page assets (CSS, JS, images, fonts) are captured alongside the pages.

Instructions

Download documentation website for RAG indexing

Input Schema

Table | JSON Schema

Name	Required	Description	Default
`url`	Yes	Documentation site URL

Implementation Reference

src/mcp_windows_website_downloader/server.py:40-60 (registration)

Registers the 'download' MCP tool and defines its input schema (requires 'url' parameter).

@self.server.list_tools()
async def handle_list_tools() -> List[types.Tool]:
    """Advertise the tools this server exposes.

    A single tool is offered: ``download``, whose only (and required)
    argument is the ``url`` string.
    """
    logger.info("Listing tools")

    # JSON Schema describing the arguments accepted by the tool.
    download_schema = {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "Documentation site URL",
            },
        },
        "required": ["url"],
    }
    download_tool = types.Tool(
        name="download",
        description="Download documentation website for RAG indexing",
        inputSchema=download_schema,
    )
    available = [download_tool]

    logger.info(f"Returning {len(available)} tools")
    return available

src/mcp_windows_website_downloader/server.py:71-103 (handler)

MCP call_tool handler for 'download': validates input, creates async task calling WebsiteDownloader.download(url), returns result.

# Only the "download" tool is handled here; any other name is rejected.
if name != "download":
    raise ValueError(f"Unknown tool: {name}")
    
# "url" is mandatory: fail when no arguments were passed or the key is absent.
if not arguments or "url" not in arguments:
    raise ValueError("URL is required")

url = arguments["url"]

# Create download task with progress tracking
async def download_with_progress():
    """Run the download for ``url``, logging start, completion, cancellation and failure."""
    try:
        logger.info(f"Starting download of {url}")
        result = await self.downloader.download(url)
        logger.info("Download complete")
        return result
    except asyncio.CancelledError:
        # Log, then re-raise so the cancellation still reaches the awaiter.
        logger.info("Download task cancelled")
        raise
    except Exception as e:
        # Log, then re-raise the original exception unchanged.
        logger.error(f"Download failed: {str(e)}")
        raise

# Keep the task registered in self._tasks while it is in flight; the
# `finally` below unregisters it whether the await succeeds, fails or is
# cancelled.
# NOTE(review): presumably self._tasks is a set used to cancel in-flight
# downloads on shutdown — confirm against the rest of the server class.
task = asyncio.create_task(download_with_progress())
self._tasks.add(task)
try:
    result = await task
finally:
    self._tasks.remove(task)
    
# The downloader's result is stringified and returned as a single text item.
return [types.TextContent(
    type="text", 
    text=str(result)
)]

src/mcp_windows_website_downloader/downloader.py:88-157 (handler)

Core implementation of website download: analyzes site, crawls pages up to max depth, downloads and rewrites assets/links, saves to directory with index.

async def download(self, url: str) -> Dict[str, Any]:
    """Download a documentation website into the configured output directory.

    The site is stored under ``self.output_dir / clean_filename(<domain>)``
    together with an ``assets`` tree (css, js, images, fonts, other). Once
    crawling finishes, a ``rag_index.json`` summary is written next to it.

    Args:
        url: Absolute ``http``/``https`` URL of the documentation site.

    Returns:
        On success, a dict with ``status`` ("success"), ``path``, ``pages``
        and ``depth_used``. On any failure other than cancellation, a dict
        with ``status`` ("error") and ``error`` (the exception message).

    Raises:
        asyncio.CancelledError: Re-raised so task cancellation propagates.
    """
    try:
        # Validate before touching the filesystem: a URL without an http(s)
        # scheme or host yields an empty netloc, which would otherwise create
        # directories for an empty domain before the request itself fails.
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            raise ValueError(
                f"Invalid URL (expected http(s)://host/...): {url!r}"
            )

        # Reset state
        self.visited_urls.clear()
        self.current_domain = parsed.netloc

        # Ensure we're using the configured output directory
        logger.info(f"Using output directory: {self.output_dir}")

        # Create site directory inside the output directory. parents=True
        # so a not-yet-existing output directory does not abort the download.
        self.site_dir = self.output_dir / clean_filename(self.current_domain)
        logger.info(f"Creating site directory at: {self.site_dir}")
        self.site_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Starting download of {url} to {self.site_dir}")

        # Create clean directory structure
        assets_dir = self.site_dir / "assets"
        assets_dir.mkdir(exist_ok=True)
        for dir_name in ["css", "js", "images", "fonts", "other"]:
            (assets_dir / dir_name).mkdir(exist_ok=True)

        # Configure session
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            # Analyze site and set depth
            self.max_depth = await self._analyze_site_structure(session, url)
            logger.info(f"Using max depth of {self.max_depth} for this site")

            # Start download
            await self._process_page(session, url)

        page_count = len(self.visited_urls)

        # Create index
        index = {
            "url": url,
            "domain": self.current_domain,
            "pages": page_count,
            "path": str(self.site_dir),
            "max_depth_used": self.max_depth
        }

        # Explicit encoding so the index does not depend on the platform's
        # default code page.
        index_path = self.site_dir / "rag_index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2)

        logger.info(f"Download complete. {page_count} pages saved to {self.site_dir}")

        return {
            "status": "success",
            "path": str(self.site_dir),
            "pages": page_count,
            "depth_used": self.max_depth
        }

    except asyncio.CancelledError:
        logger.info("Download cancelled")
        raise
    except Exception as e:
        # Failures are reported to the caller as data rather than raised.
        logger.error(f"Download failed: {str(e)}", exc_info=True)
        return {
            "status": "error",
            "error": str(e)
        }

src/mcp_windows_website_downloader/server.py:44-57 (schema)

JSON Schema definition for the 'download' tool input: object with required 'url' string.

# Definition of the "download" tool: its input is a JSON Schema object with
# a single string property, "url", which is required.
types.Tool(
    name="download",
    description="Download documentation website for RAG indexing",
    inputSchema={
        "type": "object",
        "properties": {
            # The only accepted argument.
            "url": {
                "type": "string",
                "description": "Documentation site URL"
            }
        },
        # Calls without "url" do not satisfy the schema.
        "required": ["url"]
    }
)

MCP Windows Website Downloader Server

download

Instructions

Input Schema

Implementation Reference

Tool Definition Quality

Other Tools

Latest Blog Posts

MCP directory API