firecrawl.py (9.35 kB)
""" Firecrawl service for web scraping and crawling """ import os from typing import Any, Dict, List, Optional try: from firecrawl import AsyncFirecrawlApp FIRECRAWL_AVAILABLE = True except ImportError: FIRECRAWL_AVAILABLE = False class FirecrawlService: """Service for interacting with Firecrawl API""" def __init__(self, api_key: Optional[str] = None): if not FIRECRAWL_AVAILABLE: raise ImportError("Firecrawl library not available. Install with: pip install firecrawl-py") self.api_key = api_key or os.getenv("FIRECRAWL_API_KEY") if not self.api_key: raise ValueError("Firecrawl API key is required") self._client = AsyncFirecrawlApp(api_key=self.api_key) async def scrape_url( self, url: str, formats: Optional[List[str]] = None, actions: Optional[List[Dict[str, Any]]] = None, agent: Optional[Dict[str, Any]] = None ) -> Dict[str, Any]: """ Scrape a single URL using Firecrawl Args: url: URL to scrape formats: List of formats to return (markdown, html, rawHtml, links, screenshot) actions: Browser actions to perform before scraping agent: Agent configuration for FIRE-1 browsing Returns: Dictionary with scraped data """ params = {} # Set formats - keep it simple if formats: params["formats"] = formats # Browser actions if actions: params["actions"] = actions if agent: params["agent"] = agent # Perform scraping result = await self._client.scrape_url(url, **params) # Extract data from ScrapeResponse object response_data = { "success": getattr(result, 'success', True), "url": url } # Add available data if hasattr(result, 'markdown'): response_data["markdown"] = result.markdown if hasattr(result, 'html'): response_data["html"] = result.html if hasattr(result, 'rawHtml'): response_data["rawHtml"] = result.rawHtml if hasattr(result, 'links'): response_data["links"] = result.links if hasattr(result, 'screenshot'): response_data["screenshot"] = result.screenshot if hasattr(result, 'metadata'): response_data["metadata"] = result.metadata if hasattr(result, 'extract'): response_data["extract"] = result.extract return response_data async def take_screenshot( self, url: str, actions: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, Any]: """ Take a screenshot of a URL using Firecrawl Args: url: URL to screenshot actions: Browser actions to perform before taking screenshot Returns: Dictionary with screenshot data and URL """ params = { "formats": ["screenshot"] } # Browser actions if actions: params["actions"] = actions # Perform scraping for screenshot result = await self._client.scrape_url(url, **params) # Extract screenshot data response_data = { "success": getattr(result, 'success', True), "url": url, "screenshot": getattr(result, 'screenshot', None) } return response_data ### NB!!! this is broken for now because async version of firecrawl client is not yet supporting agent. it will, so let's wait. async def browse_with_agent( self, url: str, instructions: str, formats: Optional[List[str]] = None ) -> Dict[str, Any]: """ Use FIRE-1 AI agent to browse and interact with websites Args: url: Starting URL to browse instructions: Detailed instructions for the agent on what to do formats: List of formats to return (markdown, html, etc.) 
Returns: Dictionary with browsing results """ # Set default formats formats = formats or ["markdown"] print(f"Scraping URL: {url} with instructions: {instructions}") print(f"Formats: {formats}") print(f"Instructions: {instructions}") # Use the service's own scrape_url method with agent result = await self.scrape_url( url, formats=formats, agent={ 'model': 'FIRE-1', 'prompt': instructions } ) # Add instructions to the response result["instructions"] = instructions return result async def crawl_url( self, url: str, formats: Optional[List[str]] = None, limit: int = 10, include_paths: Optional[List[str]] = None, exclude_paths: Optional[List[str]] = None, max_depth: Optional[int] = None, allow_backward_links: bool = False, allow_external_links: bool = False, delay: Optional[int] = None, only_main_content: bool = True, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, headers: Optional[Dict[str, str]] = None ) -> Dict[str, Any]: """ Crawl a website using Firecrawl Args: url: Starting URL to crawl formats: List of formats to return for each page limit: Maximum number of pages to crawl include_paths: Regex patterns for URLs to include exclude_paths: Regex patterns for URLs to exclude max_depth: Maximum crawl depth allow_backward_links: Whether to allow crawling backward links allow_external_links: Whether to allow crawling external links delay: Delay between requests in milliseconds only_main_content: Whether to extract only main content include_tags: HTML tags/classes/ids to include exclude_tags: HTML tags/classes/ids to exclude headers: HTTP headers to send Returns: Dictionary with crawled data """ params = { "limit": limit } # Set formats if formats: params["formats"] = formats else: params["formats"] = ["markdown"] # Crawl options if include_paths: params["includePaths"] = include_paths if exclude_paths: params["excludePaths"] = exclude_paths if max_depth is not None: params["maxDepth"] = max_depth if allow_backward_links: params["allowBackwardLinks"] = allow_backward_links if allow_external_links: params["allowExternalLinks"] = allow_external_links if delay is not None: params["delay"] = delay # Content selection params["onlyMainContent"] = only_main_content # Tag filtering if include_tags: params["includeTags"] = include_tags if exclude_tags: params["excludeTags"] = exclude_tags # Headers if headers: params["headers"] = headers # Perform crawling result = await self._client.crawl_url(url, **params) # Extract data from CrawlResponse object success = getattr(result, 'success', True) data = getattr(result, 'data', []) if hasattr(result, 'data') else [] # Build response pages = [] for page in data: page_data = {"url": page.get("url", "")} # Add available data for each page if "markdown" in page: page_data["markdown"] = page["markdown"] if "html" in page: page_data["html"] = page["html"] if "rawHtml" in page: page_data["rawHtml"] = page["rawHtml"] if "links" in page: page_data["links"] = page["links"] if "screenshot" in page: page_data["screenshot"] = page["screenshot"] if "metadata" in page: page_data["metadata"] = page["metadata"] if "extract" in page: page_data["extract"] = page["extract"] pages.append(page_data) return { "success": success, "start_url": url, "total_pages": len(pages), "crawl_limit": limit, "pages": pages } def get_firecrawl_service(api_key: Optional[str] = None) -> Optional[FirecrawlService]: """ Get a Firecrawl service instance Args: api_key: Optional API key. 
If not provided, will try to get from environment Returns: FirecrawlService instance or None if not available """ try: return FirecrawlService(api_key) except (ImportError, ValueError): return None
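
Below is a minimal usage sketch, not part of firecrawl.py. It assumes FIRECRAWL_API_KEY is set in the environment, that the module is importable as `services.firecrawl` (the actual import path depends on the project layout), and uses https://example.com as a placeholder URL.

# example_usage.py -- illustrative sketch only, not part of firecrawl.py
# Assumptions: FIRECRAWL_API_KEY is set, the import path below is hypothetical,
# and https://example.com stands in for a real target URL.
import asyncio

from services.firecrawl import get_firecrawl_service  # adjust to the actual module path


async def main() -> None:
    # Returns None if firecrawl-py is missing or no API key is configured
    service = get_firecrawl_service()
    if service is None:
        print("Firecrawl unavailable: install firecrawl-py and set FIRECRAWL_API_KEY")
        return

    # Scrape a single page as markdown and show the first few characters
    page = await service.scrape_url("https://example.com", formats=["markdown"])
    print(page.get("markdown", "")[:200])

    # Crawl a couple of pages starting from the same URL
    crawl = await service.crawl_url("https://example.com", limit=2)
    print(f"Crawled {crawl['total_pages']} page(s)")


if __name__ == "__main__":
    asyncio.run(main())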
