"""
Firecrawl service for web scraping and crawling
"""
import os
from typing import Any, Dict, List, Optional
try:
from firecrawl import AsyncFirecrawlApp
FIRECRAWL_AVAILABLE = True
except ImportError:
FIRECRAWL_AVAILABLE = False
class FirecrawlService:
"""Service for interacting with Firecrawl API"""
def __init__(self, api_key: Optional[str] = None):
if not FIRECRAWL_AVAILABLE:
raise ImportError("Firecrawl library not available. Install with: pip install firecrawl-py")
self.api_key = api_key or os.getenv("FIRECRAWL_API_KEY")
if not self.api_key:
raise ValueError("Firecrawl API key is required")
self._client = AsyncFirecrawlApp(api_key=self.api_key)
async def scrape_url(
self,
url: str,
formats: Optional[List[str]] = None,
actions: Optional[List[Dict[str, Any]]] = None,
agent: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Scrape a single URL using Firecrawl
Args:
url: URL to scrape
formats: List of formats to return (markdown, html, rawHtml, links, screenshot)
actions: Browser actions to perform before scraping
agent: Agent configuration for FIRE-1 browsing
Returns:
Dictionary with scraped data
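
        Example (a minimal sketch; assumes a valid FIRECRAWL_API_KEY in the environment):

            service = FirecrawlService()
            result = await service.scrape_url(
                "https://example.com",
                formats=["markdown", "links"],
            )
            print(result.get("markdown", ""))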
"""
params = {}
        # Output formats requested from Firecrawl
if formats:
params["formats"] = formats
# Browser actions
if actions:
params["actions"] = actions
if agent:
params["agent"] = agent
# Perform scraping
result = await self._client.scrape_url(url, **params)
# Extract data from ScrapeResponse object
response_data = {
"success": getattr(result, 'success', True),
"url": url
}
        # Copy whichever optional fields the response carries
        for field in ("markdown", "html", "rawHtml", "links", "screenshot", "metadata", "extract"):
            if hasattr(result, field):
                response_data[field] = getattr(result, field)
return response_data
async def take_screenshot(
self,
url: str,
actions: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
"""
Take a screenshot of a URL using Firecrawl
Args:
url: URL to screenshot
actions: Browser actions to perform before taking screenshot
Returns:
Dictionary with screenshot data and URL
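
        Example (a sketch; the screenshot value is whatever Firecrawl returns for the
        screenshot format, commonly a hosted image URL):

            service = FirecrawlService()
            shot = await service.take_screenshot("https://example.com")
            if shot["screenshot"]:
                print("Screenshot captured for", shot["url"])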
"""
params = {
"formats": ["screenshot"]
}
# Browser actions
if actions:
params["actions"] = actions
# Perform scraping for screenshot
result = await self._client.scrape_url(url, **params)
# Extract screenshot data
response_data = {
"success": getattr(result, 'success', True),
"url": url,
"screenshot": getattr(result, 'screenshot', None)
}
return response_data
    # NOTE: browse_with_agent is currently broken because the async Firecrawl
    # client does not yet support the `agent` parameter. Re-enable once
    # upstream support lands.
async def browse_with_agent(
self,
url: str,
instructions: str,
formats: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Use FIRE-1 AI agent to browse and interact with websites
Args:
url: Starting URL to browse
instructions: Detailed instructions for the agent on what to do
formats: List of formats to return (markdown, html, etc.)
Returns:
Dictionary with browsing results
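
        Example (intended usage once the async client supports `agent`; currently
        broken, per the note above; the instructions string is illustrative):

            service = FirecrawlService()
            result = await service.browse_with_agent(
                "https://example.com/careers",
                instructions="List the titles of all open engineering roles",
            )
            print(result.get("markdown", ""))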
"""
# Set default formats
formats = formats or ["markdown"]
print(f"Scraping URL: {url} with instructions: {instructions}")
print(f"Formats: {formats}")
print(f"Instructions: {instructions}")
# Use the service's own scrape_url method with agent
result = await self.scrape_url(
url,
formats=formats,
agent={
'model': 'FIRE-1',
'prompt': instructions
}
)
# Add instructions to the response
result["instructions"] = instructions
return result
async def crawl_url(
self,
url: str,
formats: Optional[List[str]] = None,
limit: int = 10,
include_paths: Optional[List[str]] = None,
exclude_paths: Optional[List[str]] = None,
max_depth: Optional[int] = None,
allow_backward_links: bool = False,
allow_external_links: bool = False,
delay: Optional[int] = None,
only_main_content: bool = True,
include_tags: Optional[List[str]] = None,
exclude_tags: Optional[List[str]] = None,
headers: Optional[Dict[str, str]] = None
) -> Dict[str, Any]:
"""
Crawl a website using Firecrawl
Args:
url: Starting URL to crawl
formats: List of formats to return for each page
limit: Maximum number of pages to crawl
include_paths: Regex patterns for URLs to include
exclude_paths: Regex patterns for URLs to exclude
max_depth: Maximum crawl depth
            allow_backward_links: Whether to allow crawling pages outside the start URL's direct path hierarchy
            allow_external_links: Whether to allow following links to external domains
delay: Delay between requests in milliseconds
only_main_content: Whether to extract only main content
include_tags: HTML tags/classes/ids to include
exclude_tags: HTML tags/classes/ids to exclude
headers: HTTP headers to send
Returns:
Dictionary with crawled data
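
        Example (a minimal sketch; path patterns and limits are illustrative):

            service = FirecrawlService()
            crawl = await service.crawl_url(
                "https://example.com/docs",
                limit=5,
                include_paths=["^/docs/.*"],
                max_depth=2,
            )
            for page in crawl["pages"]:
                print(page["url"])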
"""
params = {
"limit": limit
}
# Set formats
if formats:
params["formats"] = formats
else:
params["formats"] = ["markdown"]
# Crawl options
if include_paths:
params["includePaths"] = include_paths
if exclude_paths:
params["excludePaths"] = exclude_paths
if max_depth is not None:
params["maxDepth"] = max_depth
if allow_backward_links:
params["allowBackwardLinks"] = allow_backward_links
if allow_external_links:
params["allowExternalLinks"] = allow_external_links
if delay is not None:
params["delay"] = delay
# Content selection
params["onlyMainContent"] = only_main_content
# Tag filtering
if include_tags:
params["includeTags"] = include_tags
if exclude_tags:
params["excludeTags"] = exclude_tags
# Headers
if headers:
params["headers"] = headers
# Perform crawling
result = await self._client.crawl_url(url, **params)
# Extract data from CrawlResponse object
success = getattr(result, 'success', True)
        data = getattr(result, 'data', []) or []
        # Build per-page records, copying whichever fields each page carries
        pages = []
        for page in data:
            page_data = {"url": page.get("url", "")}
            for field in ("markdown", "html", "rawHtml", "links", "screenshot", "metadata", "extract"):
                if field in page:
                    page_data[field] = page[field]
            pages.append(page_data)
return {
"success": success,
"start_url": url,
"total_pages": len(pages),
"crawl_limit": limit,
"pages": pages
}
def get_firecrawl_service(api_key: Optional[str] = None) -> Optional[FirecrawlService]:
"""
Get a Firecrawl service instance
Args:
        api_key: Optional API key; falls back to the FIRECRAWL_API_KEY environment variable
Returns:
FirecrawlService instance or None if not available
"""
try:
return FirecrawlService(api_key)
except (ImportError, ValueError):
return None
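

# Minimal smoke test (a sketch, not part of the service API). Assumes
# FIRECRAWL_API_KEY is set in the environment; the target URL is illustrative.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        service = get_firecrawl_service()
        if service is None:
            print("Firecrawl unavailable: install firecrawl-py and set FIRECRAWL_API_KEY")
            return
        result = await service.scrape_url("https://example.com", formats=["markdown"])
        print(result.get("markdown", "")[:500])

    asyncio.run(_demo())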