"""Web scraping adapter."""
import asyncio
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse
import httpx
from bs4 import BeautifulSoup
from readability import Document as ReadabilityDocument
from devlens.models.document import Document, PageSummary, Section
from devlens.models.errors import ScrapingError
class ScraperAdapter:
"""Adapter for web scraping using crawl4ai with httpx fallback."""
def __init__(self, timeout: float = 30.0):
"""Initialize scraper adapter.
Args:
timeout: Request timeout in seconds.
"""
self._timeout = timeout
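        # Browser-like User-Agent: some sites reject requests from obvious
        # non-browser clients.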
self._headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
self._crawl4ai_available = True
async def fetch(self, url: str, *, retry: int = 2) -> Document:
"""Fetch and parse a URL into a Document.
        Uses crawl4ai for JavaScript-heavy pages and falls back to
        httpx + readability when crawl4ai is unavailable or fails.
Args:
url: URL to fetch.
            retry: Number of retries after the initial attempt fails
                (at most retry + 1 attempts are made in total).
Returns:
Document with markdown content.
Raises:
ScrapingError: If fetching or parsing fails after all retries.
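        Example:
            Illustrative usage; assumes an async caller and the Document
            fields set in this module (url, title, content, fetched_at):
                adapter = ScraperAdapter(timeout=10.0)
                doc = await adapter.fetch("https://example.com/docs", retry=1)
                print(doc.title, doc.fetched_at)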
"""
if not url.strip():
raise ScrapingError(url, "URL cannot be empty")
# Validate URL format
parsed = urlparse(url)
if not parsed.scheme or not parsed.netloc:
raise ScrapingError(url, "Invalid URL format")
last_error = None
for attempt in range(retry + 1):
try:
                # Try crawl4ai first for better JS support
                if self._crawl4ai_available:
                    try:
                        return await self._fetch_with_crawl4ai(url)
                    except ImportError:
                        # crawl4ai is not installed; skip it on future calls
                        self._crawl4ai_available = False
                    except Exception as e:
                        # Remember the error and fall back to httpx below
                        last_error = e
                # Fall back to httpx + readability
return await self._fetch_with_httpx(url)
except ScrapingError as e:
if attempt == retry:
raise
last_error = e
                # Wait before retrying (exponential backoff: 1s, 2s, 4s, ...)
                await asyncio.sleep(2**attempt)
except Exception as e:
last_error = e
if attempt == retry:
raise ScrapingError(
url, f"Failed after {retry + 1} attempts: {e}"
) from e
await asyncio.sleep(2**attempt)
raise ScrapingError(url, f"Failed after all retries: {last_error}")
async def _fetch_with_crawl4ai(self, url: str) -> Document:
"""Fetch using crawl4ai (handles JavaScript).
Args:
url: URL to fetch.
Returns:
Document with markdown content.
"""
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
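        # Imported lazily so the adapter works without crawl4ai installed;
        # an ImportError here is caught by the caller, which falls back to httpx.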
browser_config = BrowserConfig(
headless=True,
verbose=False,
)
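        # CrawlerRunConfig takes page_timeout in milliseconds, hence the
        # seconds-to-milliseconds conversion below.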
run_config = CrawlerRunConfig(
wait_until="domcontentloaded",
page_timeout=int(self._timeout * 1000),
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=run_config)
if not result.success:
raise ScrapingError(url, result.error_message or "Crawl failed")
title = result.metadata.get("title", "") if result.metadata else ""
content = result.markdown or ""
# Add source attribution
markdown = f"# {title}\n\n> Source: {url}\n\n{content}"
return Document(
url=url,
title=title,
content=markdown,
fetched_at=datetime.now(),
)
async def _fetch_with_httpx(self, url: str) -> Document:
"""Fetch using httpx + readability (fallback for static pages).
Args:
url: URL to fetch.
Returns:
Document with markdown content.
"""
try:
async with httpx.AsyncClient(
timeout=self._timeout, follow_redirects=True
) as client:
response = await client.get(url, headers=self._headers)
response.raise_for_status()
html = response.text
# Use readability to extract main content
doc = ReadabilityDocument(html)
title = doc.title()
content_html = doc.summary()
# Convert to markdown-ish format
content = self._html_to_markdown(content_html)
# Add source attribution
markdown = f"# {title}\n\n> Source: {url}\n\n{content}"
return Document(
url=url,
title=title,
content=markdown,
fetched_at=datetime.now(),
)
except httpx.TimeoutException:
raise ScrapingError(url, "Request timed out")
except httpx.HTTPStatusError as e:
raise ScrapingError(url, f"HTTP {e.response.status_code}")
except Exception as e:
raise ScrapingError(url, str(e)) from e
async def summarize(self, url: str) -> PageSummary:
"""Extract page structure without full content.
Args:
url: URL to summarize.
Returns:
PageSummary with sections.
Raises:
ScrapingError: If fetching or parsing fails.
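        Example:
            Illustrative usage; Section fields (heading, summary) follow the
            model as used in this module:
                adapter = ScraperAdapter()
                summary = await adapter.summarize("https://example.com/docs")
                for section in summary.sections:
                    print(section.heading, "->", section.summary)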
"""
        # Use crawl4ai first for better JS support
        if self._crawl4ai_available:
            try:
                return await self._summarize_with_crawl4ai(url)
            except ImportError:
                # crawl4ai is not installed; skip it on future calls
                self._crawl4ai_available = False
            except Exception:
                # Any other crawl4ai failure: fall through to the httpx path
                pass
        # Fall back to httpx + BeautifulSoup
return await self._summarize_with_httpx(url)
async def _summarize_with_crawl4ai(self, url: str) -> PageSummary:
"""Summarize using crawl4ai."""
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(
wait_until="domcontentloaded",
page_timeout=int(self._timeout * 1000),
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=run_config)
if not result.success:
raise ScrapingError(url, result.error_message or "Crawl failed")
title = result.metadata.get("title", "") if result.metadata else ""
html = result.html or ""
soup = BeautifulSoup(html, "html.parser")
sections: list[Section] = []
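            # Heuristic: treat the first <p>/<div> sibling after each heading as
            # that section's summary; doc pages often put a lead paragraph there.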
for heading in soup.find_all(["h1", "h2", "h3"]):
text = heading.get_text(strip=True)
if text:
summary = ""
next_elem = heading.find_next_sibling(["p", "div"])
if next_elem:
summary = next_elem.get_text(strip=True)[:200]
sections.append(Section(heading=text, summary=summary))
return PageSummary(url=url, title=title, sections=sections)
async def _summarize_with_httpx(self, url: str) -> PageSummary:
"""Summarize using httpx (fallback)."""
try:
async with httpx.AsyncClient(
timeout=self._timeout, follow_redirects=True
) as client:
response = await client.get(url, headers=self._headers)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
title_tag = soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else ""
sections: list[Section] = []
for heading in soup.find_all(["h1", "h2", "h3"]):
text = heading.get_text(strip=True)
if text:
summary = ""
next_elem = heading.find_next_sibling(["p", "div"])
if next_elem:
summary = next_elem.get_text(strip=True)[:200]
sections.append(Section(heading=text, summary=summary))
return PageSummary(url=url, title=title, sections=sections)
except httpx.TimeoutException:
raise ScrapingError(url, "Request timed out")
except httpx.HTTPStatusError as e:
raise ScrapingError(url, f"HTTP {e.response.status_code}")
except Exception as e:
raise ScrapingError(url, str(e)) from e
def _html_to_markdown(self, html: str) -> str:
"""Convert HTML to basic markdown."""
soup = BeautifulSoup(html, "html.parser")
for elem in soup(["script", "style", "nav", "footer", "header"]):
elem.decompose()
for i, tag in enumerate(["h1", "h2", "h3", "h4", "h5", "h6"]):
for heading in soup.find_all(tag):
prefix = "#" * (i + 1)
heading.replace_with(f"\n\n{prefix} {heading.get_text(strip=True)}\n\n")
for link in soup.find_all("a"):
href = link.get("href", "")
text = link.get_text(strip=True)
if href and text:
link.replace_with(f"[{text}]({href})")
for tag in soup.find_all(["strong", "b"]):
tag.replace_with(f"**{tag.get_text(strip=True)}**")
for tag in soup.find_all(["em", "i"]):
tag.replace_with(f"*{tag.get_text(strip=True)}*")
for tag in soup.find_all("code"):
tag.replace_with(f"`{tag.get_text(strip=True)}`")
for ul in soup.find_all("ul"):
items = ul.find_all("li")
md_list = "\n".join(f"- {li.get_text(strip=True)}" for li in items)
ul.replace_with(f"\n{md_list}\n")
for ol in soup.find_all("ol"):
items = ol.find_all("li")
md_list = "\n".join(
f"{i + 1}. {li.get_text(strip=True)}" for i, li in enumerate(items)
)
ol.replace_with(f"\n{md_list}\n")
text = soup.get_text()
text = re.sub(r"\n{3,}", "\n\n", text)
text = re.sub(r" {2,}", " ", text)
return text.strip()
def get_same_domain_links(self, html: str, base_url: str) -> list[str]:
"""Extract same-domain links from HTML."""
soup = BeautifulSoup(html, "html.parser")
base_domain = urlparse(base_url).netloc
links: list[str] = []
for a in soup.find_all("a", href=True):
href = a["href"]
absolute_url = urljoin(base_url, href)
parsed = urlparse(absolute_url)
if (
parsed.netloc == base_domain
and parsed.scheme in ("http", "https")
and parsed.path
):
clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
if clean_url not in links:
links.append(clean_url)
return links