MCP Windows Website Downloader Server
by angrysky56

- mcp-windows-website-downloader
  - src
    - mcp_windows_website_downloader
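The module below is the core of the downloader: it analyzes a site's structure to pick a crawl depth, crawls same-domain pages, rewrites asset and link references to point at local copies, and writes a small rag_index.json summary alongside the saved pages.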
"""
Core website downloading functionality.
"""
import logging
import asyncio
from pathlib import Path
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import json
import re
from typing import Dict, Any, Optional, Set
from .utils import clean_filename
logger = logging.getLogger(__name__)

class WebsiteDownloader:
    """Downloads and processes documentation websites"""

    def __init__(self, output_dir: Path):
        self.output_dir = output_dir
        logger.info(f"Downloader initialized with output directory: {self.output_dir}")
        if not self.output_dir.exists():
            self.output_dir.mkdir(parents=True)
            logger.info(f"Created output directory at {self.output_dir}")
        self.visited_urls = set()
        self.current_domain = None
        self.site_dir = None
        self.max_depth = 2  # Default, will be adjusted based on site analysis

    async def _analyze_site_structure(self, session: aiohttp.ClientSession, url: str) -> int:
        """
        Analyze the site structure to determine appropriate crawl depth.
        Returns recommended max depth.
        """
        try:
            logger.info("Analyzing site structure...")
            async with session.get(url) as response:
                if response.status != 200:
                    return self.max_depth

                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                # Look for common documentation patterns: semantic nav/menu tags plus
                # elements whose class suggests a sidebar, menu, or table of contents
                nav_elements = soup.find_all(['nav', 'menu'])
                nav_elements += soup.find_all(class_=re.compile(r'sidebar|menu|toc', re.I))
                has_nav = len(nav_elements) > 0

                # Check URL patterns
                path = urlparse(url).path
                is_docs_url = any(x in path.lower() for x in ['/docs/', '/documentation/', '/guide/', '/tutorial/'])

                # Check for documentation frameworks
                is_sphinx = bool(soup.find('div', {'class': 'sphinxsidebar'}))
                is_mkdocs = bool(soup.find('nav', {'class': 'md-nav'}))
                is_docusaurus = bool(soup.find('nav', {'class': 'menu'}))

                # Analyze link structure
                links = set()
                for a in soup.find_all('a', href=True):
                    href = a['href']
                    if not href.startswith(('#', 'mailto:', 'tel:', 'javascript:')):
                        full_url = urljoin(url, href)
                        if urlparse(full_url).netloc == self.current_domain:
                            links.add(full_url)

                # Determine appropriate depth
                if is_sphinx or is_mkdocs or is_docusaurus:
                    # Known documentation frameworks usually need more depth
                    depth = 4
                elif is_docs_url and has_nav:
                    # Looks like structured documentation
                    depth = 3
                elif len(links) > 100:
                    # Large site, be conservative
                    depth = 2
                else:
                    # Small or unknown site structure
                    depth = 2

                logger.info(f"Site analysis complete. Recommended depth: {depth}")
                return depth

        except Exception as e:
            logger.warning(f"Site analysis failed: {str(e)}")
            return self.max_depth

    async def download(self, url: str) -> Dict[str, Any]:
        """Download a documentation website"""
        try:
            # Reset state
            self.visited_urls.clear()
            self.current_domain = urlparse(url).netloc

            # Ensure we're using the configured output directory
            logger.info(f"Using output directory: {self.output_dir}")

            # Create site directory inside the output directory
            self.site_dir = self.output_dir / clean_filename(self.current_domain)
            logger.info(f"Creating site directory at: {self.site_dir}")
            self.site_dir.mkdir(exist_ok=True)

            logger.info(f"Starting download of {url} to {self.site_dir}")

            # Create clean directory structure
            assets_dir = self.site_dir / "assets"
            assets_dir.mkdir(exist_ok=True)
            for dir_name in ["css", "js", "images", "fonts", "other"]:
                (assets_dir / dir_name).mkdir(exist_ok=True)

            # Configure session
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/91.0.4472.124',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5'
            }

            async with aiohttp.ClientSession(headers=headers) as session:
                # Analyze site and set depth
                self.max_depth = await self._analyze_site_structure(session, url)
                logger.info(f"Using max depth of {self.max_depth} for this site")

                # Start download
                await self._process_page(session, url)

                # Create index
                index = {
                    "url": url,
                    "domain": self.current_domain,
                    "pages": len(self.visited_urls),
                    "path": str(self.site_dir),
                    "max_depth_used": self.max_depth
                }
                index_path = self.site_dir / "rag_index.json"
                with open(index_path, "w") as f:
                    json.dump(index, f, indent=2)

                logger.info(f"Download complete. {len(self.visited_urls)} pages saved to {self.site_dir}")
                return {
                    "status": "success",
                    "path": str(self.site_dir),
                    "pages": len(self.visited_urls),
                    "depth_used": self.max_depth
                }

        except asyncio.CancelledError:
            logger.info("Download cancelled")
            raise
        except Exception as e:
            logger.error(f"Download failed: {str(e)}", exc_info=True)
            return {
                "status": "error",
                "error": str(e)
            }

    async def _process_page(self, session: aiohttp.ClientSession, url: str, depth: int = 0) -> Optional[str]:
        """Process a single page and its assets"""
        if url in self.visited_urls or depth > self.max_depth:
            return None

        self.visited_urls.add(url)
        logger.info(f"Processing {url} (depth {depth}/{self.max_depth})")

        try:
            async with session.get(url) as response:
                if response.status != 200:
                    logger.warning(f"Failed to get {url}: {response.status}")
                    return None

                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                # Save processed page first
                save_path = self._get_save_path(url)
                if not save_path:
                    logger.warning(f"Invalid save path for {url}")
                    return None
                save_path.parent.mkdir(parents=True, exist_ok=True)

                # Handle assets before saving page
                await self._handle_assets(session, soup, url)

                # Process internal links
                await self._process_links(session, soup, url, depth)

                with open(save_path, "w", encoding="utf-8") as f:
                    f.write(str(soup))

                # Return a POSIX-style relative path so rewritten links render correctly
                # (str(Path) would yield backslashes on Windows)
                return save_path.relative_to(self.site_dir).as_posix()

        except Exception as e:
            logger.warning(f"Error processing {url}: {str(e)}")
            return None

    async def _handle_assets(self, session: aiohttp.ClientSession, soup: BeautifulSoup, base_url: str):
        """Download and update page assets"""
        for tag, attr in [("link", "href"), ("script", "src"), ("img", "src")]:
            for elem in soup.find_all(tag, {attr: True}):
                src = elem[attr]
                if src.startswith(("data:", "blob:", "javascript:", "#", "mailto:")):
                    continue

                try:
                    full_url = urljoin(base_url, src)
                    if urlparse(full_url).netloc != self.current_domain:
                        continue

                    async with session.get(full_url) as response:
                        if response.status == 200:
                            content = await response.read()
                            save_path = self._save_asset(full_url, content)
                            if save_path:
                                # Rewrite the reference with forward slashes so the saved
                                # HTML works in a browser even on Windows
                                elem[attr] = save_path.as_posix()
                except Exception as e:
                    logger.warning(f"Asset error ({src}): {str(e)}")

    async def _process_links(self, session: aiohttp.ClientSession, soup: BeautifulSoup, url: str, depth: int):
        """Process and update page links"""
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith(("#", "mailto:", "tel:", "javascript:")):
                continue

            try:
                full_url = urljoin(url, href)
                if urlparse(full_url).netloc == self.current_domain:
                    if new_path := await self._process_page(session, full_url, depth + 1):
                        a["href"] = f"/{new_path}"
            except Exception as e:
                logger.warning(f"Link error ({href}): {str(e)}")

    def _save_asset(self, url: str, content: bytes) -> Optional[Path]:
        """Save an asset file"""
        try:
            # Get clean filename from URL
            path = urlparse(url).path.lstrip("/")
            if not path:
                return None
            filename = clean_filename(unquote(path))

            # Determine asset type and directory
            if url.endswith((".css", ".scss")):
                asset_dir = "css"
            elif url.endswith((".js", ".mjs")):
                asset_dir = "js"
            elif url.endswith((".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp")):
                asset_dir = "images"
            elif url.endswith((".woff", ".woff2", ".ttf", ".eot")):
                asset_dir = "fonts"
            else:
                asset_dir = "other"

            # Create relative asset path ensuring it's under the configured directory
            rel_path = Path("assets") / asset_dir / filename
            full_path = self.site_dir / rel_path

            # Ensure we're not trying to write outside the site directory
            if not str(full_path).startswith(str(self.site_dir)):
                logger.warning(f"Attempted to write asset outside site directory: {full_path}")
                return None

            # Ensure parent directory exists
            full_path.parent.mkdir(parents=True, exist_ok=True)

            # Write content
            with open(full_path, "wb") as f:
                f.write(content)

            return rel_path

        except Exception as e:
            logger.warning(f"Failed to save asset {url}: {str(e)}")
            return None

    def _get_save_path(self, url: str) -> Optional[Path]:
        """Get file system path for saving page"""
        try:
            # Get clean path from URL
            path = urlparse(url).path.lstrip("/")
            if not path:
                path = "index.html"
            elif not path.endswith((".html", ".htm")):
                path = f"{path}.html"

            # Clean the path and create Path object
            clean_path = clean_filename(unquote(path))
            save_path = self.site_dir / clean_path

            # Safety check: ensure we're not writing outside the site directory.
            # (The user already controls the output location via their app config JSON,
            # so this is arguably redundant, but it is kept as a safety net.)
            if not str(save_path).startswith(str(self.site_dir)):
                logger.warning(f"Attempted to save page outside site directory: {save_path}")
                return None

            return save_path

        except Exception as e:
            logger.warning(f"Invalid save path for {url}: {str(e)}")
            return None
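
For reference, here is a minimal sketch of driving the downloader directly, outside the MCP server. The import path, output directory, and URL below are placeholders, since only the package layout (not the module's filename) is shown above.

import asyncio
from pathlib import Path

# Hypothetical import: adjust to the actual module filename inside
# mcp_windows_website_downloader, which is not shown above.
from mcp_windows_website_downloader import WebsiteDownloader

async def main():
    # Placeholder output directory and documentation URL
    downloader = WebsiteDownloader(Path("C:/docs-downloads"))
    result = await downloader.download("https://example.com/docs/")
    print(result)  # e.g. {"status": "success", "path": "...", "pages": 12, "depth_used": 3}

if __name__ == "__main__":
    asyncio.run(main())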