Documentation Search MCP Server

web_scraper.py (2.91 kB)
#!/usr/bin/env python3 """ An advanced web scraper using Playwright to handle dynamic, JS-heavy sites. """ from playwright.async_api import async_playwright, Browser from bs4 import BeautifulSoup from typing import Optional class PlaywrightScraper: """A web scraper that uses a real browser to render pages.""" _browser: Optional[Browser] = None _playwright = None async def _get_browser(self) -> Browser: """Initializes and returns a shared browser instance.""" if self._browser is None or not self._browser.is_connected(): self._playwright = await async_playwright().start() self._browser = await self._playwright.chromium.launch() return self._browser async def scrape_url(self, url: str) -> str: """ Scrapes a URL using Playwright, returning the clean, readable text content. This method can handle dynamic content, as it waits for the page to fully load and can execute scripts if needed. """ browser = await self._get_browser() page = await browser.new_page() try: # Navigate to the page and wait for it to be fully loaded await page.goto(url, wait_until="networkidle", timeout=60000) # Additional waits can be added here if needed, for example: # await page.wait_for_selector("main-content-selector", timeout=5000) # Scroll to the bottom to trigger lazy-loaded content await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) # Wait for any new content to load # Get the page content after JavaScript has rendered html_content = await page.content() # Use BeautifulSoup to parse and clean the final HTML soup = BeautifulSoup(html_content, "html.parser") # Remove non-content elements for element in soup( ["script", "style", "nav", "footer", "header", "aside"] ): element.decompose() # Get clean text text = soup.get_text(separator=" ", strip=True) return text except Exception as e: print(f"Failed to scrape {url}: {e}") return f"Error: Could not retrieve content from {url}." finally: await page.close() async def close(self): """Closes the browser instance.""" if self._browser and self._browser.is_connected(): await self._browser.close() if self._playwright: await self._playwright.stop() # --- Global Scraper Instance --- scraper = PlaywrightScraper() # Recommended: Add a cleanup function to your main application's exit handler # to ensure the browser is closed gracefully. # Example: # @atexit.register # def cleanup(): # asyncio.run(scraper.close())

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/anton-prosterity/documentation-search-mcp'
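If you would rather call the directory API from code than from curl, a minimal Python equivalent might look like the sketch below. It assumes only that the endpoint answers a GET request with a JSON body; the response schema is not documented here, so the sketch just pretty-prints whatever comes back.

import json
import urllib.request

URL = "https://glama.ai/api/mcp/v1/servers/anton-prosterity/documentation-search-mcp"

with urllib.request.urlopen(URL) as response:
    # Parse the response as JSON; no particular schema is assumed.
    data = json.load(response)

print(json.dumps(data, indent=2))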

If you have feedback or need assistance with the MCP directory API, please join our Discord server.