Farnsworth

Overview Schema Related Servers Score Discussions

crawlee.py•3.38 KiB

"""
Farnsworth Universal Scraper (Crawlee Integration)
--------------------------------------------------

"Info-sucking mosquitoes, go!"

This module uses Crawlee for Python (a port of the JS library) to robustly scrape
dynamic content from socials and streaming platforms.
It handles retry logic and browser fingerprinting automatically.

NOTE: the implementation in this file currently drives Playwright directly.
Crawlee's request queue, retry logic and proxy rotation are not wired in yet
(see the mocked import below), so each scrape is a single attempt.
"""

import asyncio
from typing import List, Dict, Optional
from loguru import logger

# Mocking Crawlee import since it might not be installed in the env yet
# In prod: from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
try:
    from playwright.async_api import async_playwright
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError
    CRAWLEE_AVAILABLE = True
except ImportError:
    CRAWLEE_AVAILABLE = False
    logger.warning("Playwright/Crawlee dependencies missing. Install with 'pip install playwright crawlee'")


class UniversalScraper:
    """Best-effort page scraper built on Playwright's async Chromium driver.

    Every public method returns a plain dict. Failures are reported as
    ``{"error": "<message>"}`` rather than raised.
    """

    def __init__(self, headless: bool = True):
        """Store the browser mode.

        Args:
            headless: Passed straight to ``chromium.launch``; ``True`` runs the
                browser without a visible window.
        """
        self.headless = headless

    async def scrape_social_profile(self, url: str) -> Dict:
        """
        Scrape public social media profiles (X, Instagram, LinkedIn)
        using Playwright/Crawlee logic to bypass basic bot detection.

        Args:
            url: Page to load.

        Returns:
            On success, a dict with the keys ``url``, ``title``,
            ``description`` (the ``<meta name="description">`` content, or
            ``None`` when the tag is absent), ``stats`` (see
            ``_extract_stats``) and ``snapshot_status`` set to ``"SUCCESS"``.
            On any failure, ``{"error": "<message>"}``.
        """
        if not CRAWLEE_AVAILABLE:
            return {"error": "Dependencies missing"}

        logger.info(f"Scraper: Targeting {url}")

        # Simplified Logic (Real Crawlee adds request queues/proxy rotation)
        # The outer try also covers driver start-up, browser launch and page
        # creation, so those failures produce the same error dict as a failed
        # navigation instead of escaping to the caller.
        try:
            async with async_playwright() as p:
                # Use stealth args
                browser = await p.chromium.launch(
                    headless=self.headless,
                    args=["--disable-blink-features=AutomationControlled"],
                )
                try:
                    page = await browser.new_page()
                    await page.goto(url, timeout=30000)

                    # Waiting for network idle is only a best-effort settle
                    # step: pages that keep polling or streaming may never go
                    # idle, and the document is already loaded once goto()
                    # has returned, so a timeout here is not fatal.
                    try:
                        await page.wait_for_load_state("networkidle")
                    except PlaywrightTimeoutError:
                        logger.warning(
                            f"Scraper: network never went idle for {url}; "
                            "extracting from the current page state"
                        )

                    # Extract generic metadata
                    title = await page.title()

                    # Basic meta description
                    desc = await page.evaluate(
                        "() => document.querySelector('meta[name=\"description\"]')?.content"
                    )

                    # Social specific extraction (Heuristic)
                    stats = await self._extract_stats(page, url)

                    return {
                        "url": url,
                        "title": title,
                        "description": desc,
                        "stats": stats,
                        "snapshot_status": "SUCCESS",
                    }
                finally:
                    # Runs whenever the browser was launched, including when
                    # new_page() itself fails.
                    await browser.close()
        except Exception as e:
            logger.error(f"Scrape failed: {e}")
            return {"error": str(e)}

    async def _extract_stats(self, page, url: str) -> Dict:
        """Heuristic extractor for follower counts etc.

        Args:
            page: The Playwright page that has already been navigated.
            url: Accepted for future per-domain selectors; currently unused.

        Returns:
            A dict with ``text_content_length`` (length of the page's HTML as
            returned by ``page.content()``) and ``links_found`` (number of
            ``<a>`` elements on the page).
        """
        # This is brittle by nature, but works as a "Universal" attempt
        content = await page.content()

        # Very naive scraping logic for demo
        # In production this uses specific CSS selectors per domain
        anchors = await page.query_selector_all("a")
        return {
            "text_content_length": len(content),
            "links_found": len(anchors),
        }

    async def scrape_video_metadata(self, url: str) -> Dict:
        """Scrape streaming platforms (Twitch/YouTube) for live stats.

        Currently delegates to ``scrape_social_profile`` and returns its
        result unchanged; no 'live' indicators are extracted yet.
        """
        # Logic similar to social, but looking for 'live' indicators
        return await self.scrape_social_profile(url)


# Shared module-level instance using the default headless mode.
universal_scraper = UniversalScraper()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/timowhite88/Farnsworth'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

crawlee.py•3.38 KiB