
Documentation Search MCP Server

web_scraper.py (4.21 kB)
#!/usr/bin/env python3
"""
An advanced web scraper using Playwright to handle dynamic, JS-heavy sites.
"""
import sys
from typing import Optional

from bs4 import BeautifulSoup
import httpx
from playwright.async_api import Browser, async_playwright


class PlaywrightScraper:
    """A web scraper that uses a real browser to render pages."""

    _browser: Optional[Browser] = None
    _playwright = None
    _disabled_reason: Optional[str] = None

    async def _get_browser(self) -> Browser:
        """Initializes and returns a shared browser instance."""
        if self._disabled_reason:
            raise RuntimeError(self._disabled_reason)
        if self._browser is None or not self._browser.is_connected():
            try:
                self._playwright = await async_playwright().start()
                self._browser = await self._playwright.chromium.launch()
            except Exception as e:
                self._disabled_reason = f"Playwright disabled: {e}"
                if self._playwright:
                    try:
                        await self._playwright.stop()
                    except Exception:
                        pass
                self._playwright = None
                self._browser = None
                raise
        return self._browser

    async def scrape_url(self, url: str) -> str:
        """
        Scrapes a URL using Playwright, returning the clean, readable text content.
        This method can handle dynamic content, as it waits for the page to fully
        load and can execute scripts if needed.
        """
        page = None
        try:
            if self._disabled_reason:
                return await self._scrape_url_fallback(url)
            browser = await self._get_browser()
            page = await browser.new_page()

            # Navigate to the page and wait for it to be fully loaded
            await page.goto(url, wait_until="networkidle", timeout=60000)

            # Scroll to the bottom to trigger lazy-loaded content
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(1000)  # Wait for any new content to load

            # Get the page content after JavaScript has rendered
            html_content = await page.content()

            # Use BeautifulSoup to parse and clean the final HTML
            soup = BeautifulSoup(html_content, "html.parser")

            # Remove non-content elements
            for element in soup(
                ["script", "style", "nav", "footer", "header", "aside"]
            ):
                element.decompose()

            # Get clean text
            text = soup.get_text(separator=" ", strip=True)
            return text
        except Exception as e:
            print(f"Failed to scrape {url}: {e}", file=sys.stderr)
            return await self._scrape_url_fallback(url)
        finally:
            if page is not None:
                await page.close()

    async def _scrape_url_fallback(self, url: str) -> str:
        """Fallback fetcher when Playwright cannot launch (e.g., sandboxed environments)."""
        headers = {"User-Agent": "docs-app/1.0"}
        try:
            async with httpx.AsyncClient(
                timeout=httpx.Timeout(15.0, read=30.0),
                follow_redirects=True,
                headers=headers,
            ) as client:
                response = await client.get(url)
                response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            for element in soup(
                ["script", "style", "nav", "footer", "header", "aside"]
            ):
                element.decompose()
            return soup.get_text(separator=" ", strip=True)
        except Exception as e:
            print(f"Fallback fetch failed for {url}: {e}", file=sys.stderr)
            return f"Error: Could not retrieve content from {url}."

    async def close(self):
        """Closes the browser instance."""
        if self._browser and self._browser.is_connected():
            await self._browser.close()
        if self._playwright:
            await self._playwright.stop()


scraper = PlaywrightScraper()
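For orientation, a minimal sketch of how the module-level scraper instance might be driven from an async entry point. The target URL is a placeholder, and the import path assumes the file lives as web_scraper.py on the import path:

import asyncio

from web_scraper import scraper


async def main() -> None:
    # scrape_url() transparently falls back to a plain httpx fetch
    # if Chromium cannot be launched in this environment.
    text = await scraper.scrape_url("https://example.com/docs")
    print(text[:500])

    # Release the shared Chromium instance and Playwright driver.
    await scraper.close()


if __name__ == "__main__":
    asyncio.run(main())

Because _browser and _playwright are class attributes, every PlaywrightScraper in the process shares a single Chromium instance, so repeated scrapes avoid the cost of relaunching the browser.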


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/anton-prosterity/documentation-search-mcp'
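For example, the same lookup from Python with httpx (assuming the endpoint returns JSON; the exact response fields are not documented here):

import httpx

url = "https://glama.ai/api/mcp/v1/servers/anton-prosterity/documentation-search-mcp"
response = httpx.get(url, timeout=10.0)
response.raise_for_status()
print(response.json())  # server metadata as returned by the directory API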

If you have feedback or need assistance with the MCP directory API, please join our Discord server.