MCP Windows Website Downloader Server

  • src
      • mcp_windows_website_downloader
"""Core downloader functionality for the website downloader.""" import os import asyncio import logging from typing import List, Dict, Optional import aiohttp from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse from ..utils.validators import is_valid_url, is_downloadable_url from ..utils.file_handlers import save_content, create_directory_structure logger = logging.getLogger(__name__) class WebsiteDownloader: def __init__(self, base_url: str, output_dir: str, max_depth: int = 2, concurrent_downloads: int = 5, include_media: bool = True): """Initialize the website downloader.""" self.base_url = base_url self.output_dir = output_dir self.max_depth = max_depth self.concurrent_downloads = concurrent_downloads self.include_media = include_media self.visited_urls = set() self.session = None self.semaphore = None async def __aenter__(self): """Set up async context.""" self.session = aiohttp.ClientSession() self.semaphore = asyncio.Semaphore(self.concurrent_downloads) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Clean up async context.""" if self.session: await self.session.close() async def download_page(self, url: str, depth: int = 0) -> Optional[str]: """Download a single page and its assets.""" if not is_valid_url(url) or url in self.visited_urls or depth > self.max_depth: return None self.visited_urls.add(url) try: async with self.semaphore: async with self.session.get(url) as response: if response.status != 200: logger.warning(f"Failed to download {url}: {response.status}") return None content = await response.text() # Parse and process content soup = BeautifulSoup(content, 'html.parser') await self._process_assets(soup, url) # Save the modified content relative_path = urlparse(url).path.lstrip('/') save_path = os.path.join(self.output_dir, relative_path) save_content(save_path, str(soup)) return content except Exception as e: logger.error(f"Error downloading {url}: {str(e)}") return None async def _process_assets(self, soup: BeautifulSoup, base_url: str): """Process and download page assets.""" tasks = [] # Process images if self.include_media: for img in soup.find_all('img'): src = img.get('src') if src: absolute_url = urljoin(base_url, src) if is_downloadable_url(absolute_url): tasks.append(self._download_asset(absolute_url)) # Process stylesheets for link in soup.find_all('link', rel='stylesheet'): href = link.get('href') if href: absolute_url = urljoin(base_url, href) if is_downloadable_url(absolute_url): tasks.append(self._download_asset(absolute_url)) # Process scripts for script in soup.find_all('script', src=True): src = script.get('src') if src: absolute_url = urljoin(base_url, src) if is_downloadable_url(absolute_url): tasks.append(self._download_asset(absolute_url)) if tasks: await asyncio.gather(*tasks) async def _download_asset(self, url: str): """Download an asset file.""" try: async with self.semaphore: async with self.session.get(url) as response: if response.status == 200: content = await response.read() relative_path = urlparse(url).path.lstrip('/') save_path = os.path.join(self.output_dir, 'assets', relative_path) save_content(save_path, content, binary=True) except Exception as e: logger.error(f"Error downloading asset {url}: {str(e)}") async def start(self): """Start the download process.""" logger.info(f"Starting download of {self.base_url}") create_directory_structure(self.output_dir) await self.download_page(self.base_url)