ScrapeGraph MCP Server

```python
#!/usr/bin/env python3
"""
MCP server for ScrapeGraph API integration.

This server exposes methods to use ScrapeGraph's AI-powered web scraping services:

- markdownify: Convert any webpage into clean, formatted markdown
- smartscraper: Extract structured data from any webpage using AI
- searchscraper: Perform AI-powered web searches with structured results
- smartcrawler_initiate: Initiate intelligent multi-page web crawling with
  AI extraction or markdown conversion
- smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
"""

import os
from typing import Any, Dict, Optional

import httpx
from mcp.server.fastmcp import FastMCP


class ScrapeGraphClient:
    """Client for interacting with the ScrapeGraph API."""

    BASE_URL = "https://api.scrapegraphai.com/v1"

    def __init__(self, api_key: str):
        """
        Initialize the ScrapeGraph API client.

        Args:
            api_key: API key for the ScrapeGraph API
        """
        self.api_key = api_key
        self.headers = {
            "SGAI-APIKEY": api_key,
            "Content-Type": "application/json"
        }
        self.client = httpx.Client(timeout=60.0)

    def markdownify(self, website_url: str) -> Dict[str, Any]:
        """
        Convert a webpage into clean, formatted markdown.

        Args:
            website_url: URL of the webpage to convert

        Returns:
            Dictionary containing the markdown result
        """
        url = f"{self.BASE_URL}/markdownify"
        data = {"website_url": website_url}

        response = self.client.post(url, headers=self.headers, json=data)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
            raise Exception(error_msg)

        return response.json()

    def smartscraper(
        self,
        user_prompt: str,
        website_url: str,
        number_of_scrolls: Optional[int] = None,
        markdown_only: Optional[bool] = None
    ) -> Dict[str, Any]:
        """
        Extract structured data from a webpage using AI.

        Args:
            user_prompt: Instructions for what data to extract
            website_url: URL of the webpage to scrape
            number_of_scrolls: Number of infinite scrolls to perform (optional)
            markdown_only: Whether to return only markdown content without AI processing (optional)

        Returns:
            Dictionary containing the extracted data or markdown content
        """
        url = f"{self.BASE_URL}/smartscraper"
        data = {
            "user_prompt": user_prompt,
            "website_url": website_url
        }

        # Add number_of_scrolls to the request if provided
        if number_of_scrolls is not None:
            data["number_of_scrolls"] = number_of_scrolls

        # Add markdown_only to the request if provided
        if markdown_only is not None:
            data["markdown_only"] = markdown_only

        response = self.client.post(url, headers=self.headers, json=data)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
            raise Exception(error_msg)

        return response.json()

    def searchscraper(
        self,
        user_prompt: str,
        num_results: Optional[int] = None,
        number_of_scrolls: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Perform AI-powered web searches with structured results.

        Args:
            user_prompt: Search query or instructions
            num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
            number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

        Returns:
            Dictionary containing search results and reference URLs
        """
        url = f"{self.BASE_URL}/searchscraper"
        data = {"user_prompt": user_prompt}

        # Add num_results to the request if provided
        if num_results is not None:
            data["num_results"] = num_results

        # Add number_of_scrolls to the request if provided
        if number_of_scrolls is not None:
            data["number_of_scrolls"] = number_of_scrolls

        response = self.client.post(url, headers=self.headers, json=data)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
            raise Exception(error_msg)

        return response.json()

    def smartcrawler_initiate(
        self,
        url: str,
        prompt: Optional[str] = None,
        extraction_mode: str = "ai",
        depth: Optional[int] = None,
        max_pages: Optional[int] = None,
        same_domain_only: Optional[bool] = None
    ) -> Dict[str, Any]:
        """
        Initiate a SmartCrawler request for multi-page web crawling.

        SmartCrawler supports two modes:
        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown

        SmartCrawler processes the request asynchronously and returns a request ID.
        Pass that ID to smartcrawler_fetch_results and keep polling until the
        status is "completed".

        Args:
            url: Starting URL to crawl
            prompt: AI prompt for data extraction (required for AI mode)
            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
            depth: Maximum link traversal depth (optional)
            max_pages: Maximum number of pages to crawl (optional)
            same_domain_only: Whether to crawl only within the same domain (optional)

        Returns:
            Dictionary containing the request ID for async processing
        """
        endpoint = f"{self.BASE_URL}/crawl"
        data = {"url": url}

        # Handle extraction mode
        if extraction_mode == "markdown":
            data["markdown_only"] = True
        elif extraction_mode == "ai":
            if prompt is None:
                raise ValueError("prompt is required when extraction_mode is 'ai'")
            data["prompt"] = prompt
        else:
            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")

        if depth is not None:
            data["depth"] = depth
        if max_pages is not None:
            data["max_pages"] = max_pages
        if same_domain_only is not None:
            data["same_domain_only"] = same_domain_only

        response = self.client.post(endpoint, headers=self.headers, json=data)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
            raise Exception(error_msg)

        return response.json()

    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
        """
        Fetch the results of a SmartCrawler operation.

        Args:
            request_id: The request ID returned by smartcrawler_initiate

        Returns:
            Dictionary containing the crawled data (structured extraction or
            markdown) and metadata about processed pages

        Note:
            Crawling is asynchronous. While the request is still processing,
            this call returns its current status; keep polling until the status
            is "completed", at which point the results are included.
        """
        endpoint = f"{self.BASE_URL}/crawl/{request_id}"

        response = self.client.get(endpoint, headers=self.headers)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
            raise Exception(error_msg)

        return response.json()

    def close(self) -> None:
        """Close the HTTP client."""
        self.client.close()


# Create MCP server
mcp = FastMCP("ScrapeGraph API MCP Server")

# Default API key, read from the environment at import time
default_api_key = os.environ.get("SGAI_API_KEY")
scrapegraph_client = ScrapeGraphClient(default_api_key) if default_api_key else None


# Add tool for markdownify
@mcp.tool()
def markdownify(website_url: str) -> Dict[str, Any]:
    """
    Convert a webpage into clean, formatted markdown.

    Args:
        website_url: URL of the webpage to convert

    Returns:
        Dictionary containing the markdown result
    """
    if scrapegraph_client is None:
        return {"error": "ScrapeGraph client not initialized. Please provide an API key."}

    try:
        return scrapegraph_client.markdownify(website_url)
    except Exception as e:
        return {"error": str(e)}


# Add tool for smartscraper
@mcp.tool()
def smartscraper(
    user_prompt: str,
    website_url: str,
    number_of_scrolls: Optional[int] = None,
    markdown_only: Optional[bool] = None
) -> Dict[str, Any]:
    """
    Extract structured data from a webpage using AI.

    Args:
        user_prompt: Instructions for what data to extract
        website_url: URL of the webpage to scrape
        number_of_scrolls: Number of infinite scrolls to perform (optional)
        markdown_only: Whether to return only markdown content without AI processing (optional)

    Returns:
        Dictionary containing the extracted data or markdown content
    """
    if scrapegraph_client is None:
        return {"error": "ScrapeGraph client not initialized. Please provide an API key."}

    try:
        return scrapegraph_client.smartscraper(user_prompt, website_url, number_of_scrolls, markdown_only)
    except Exception as e:
        return {"error": str(e)}


# Add tool for searchscraper
@mcp.tool()
def searchscraper(
    user_prompt: str,
    num_results: Optional[int] = None,
    number_of_scrolls: Optional[int] = None
) -> Dict[str, Any]:
    """
    Perform AI-powered web searches with structured results.

    Args:
        user_prompt: Search query or instructions
        num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
        number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

    Returns:
        Dictionary containing search results and reference URLs
    """
    if scrapegraph_client is None:
        return {"error": "ScrapeGraph client not initialized. Please provide an API key."}

    try:
        return scrapegraph_client.searchscraper(user_prompt, num_results, number_of_scrolls)
    except Exception as e:
        return {"error": str(e)}


# Add tool for SmartCrawler initiation
@mcp.tool()
def smartcrawler_initiate(
    url: str,
    prompt: Optional[str] = None,
    extraction_mode: str = "ai",
    depth: Optional[int] = None,
    max_pages: Optional[int] = None,
    same_domain_only: Optional[bool] = None
) -> Dict[str, Any]:
    """
    Initiate a SmartCrawler request for intelligent multi-page web crawling.

    SmartCrawler supports two modes:
    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown

    Args:
        url: Starting URL to crawl
        prompt: AI prompt for data extraction (required for AI mode)
        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
        depth: Maximum link traversal depth (optional)
        max_pages: Maximum number of pages to crawl (optional)
        same_domain_only: Whether to crawl only within the same domain (optional)

    Returns:
        Dictionary containing the request ID for async processing
    """
    if scrapegraph_client is None:
        return {"error": "ScrapeGraph client not initialized. Please provide an API key."}

    try:
        return scrapegraph_client.smartcrawler_initiate(
            url=url,
            prompt=prompt,
            extraction_mode=extraction_mode,
            depth=depth,
            max_pages=max_pages,
            same_domain_only=same_domain_only
        )
    except Exception as e:
        return {"error": str(e)}


# Add tool for fetching SmartCrawler results
@mcp.tool()
def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
    """
    Fetch the results of a SmartCrawler operation.

    Args:
        request_id: The request ID returned by smartcrawler_initiate

    Returns:
        Dictionary containing the crawled data (structured extraction or
        markdown) and metadata about processed pages
    """
    if scrapegraph_client is None:
        return {"error": "ScrapeGraph client not initialized. Please provide an API key."}

    try:
        return scrapegraph_client.smartcrawler_fetch_results(request_id)
    except Exception as e:
        return {"error": str(e)}


def main() -> None:
    """Run the ScrapeGraph MCP server."""
    print("Starting ScrapeGraph MCP server!")
    # Run the server
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
```
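The two SmartCrawler tools are designed to be used together: smartcrawler_initiate returns a request ID, and smartcrawler_fetch_results must then be polled until the job reports completion. Below is a minimal sketch of that polling loop, run directly against the client class above; the request_id/id response field names and the 5-second poll interval are assumptions for illustration, not guaranteed by the code.

```python
import os
import time

# Sketch only: assumes the initiate response exposes the ID under
# "request_id" or "id", and that polling every 5 seconds is acceptable.
client = ScrapeGraphClient(os.environ["SGAI_API_KEY"])

job = client.smartcrawler_initiate(
    url="https://example.com",
    prompt="Extract the title and main topic of each page",
    extraction_mode="ai",
    max_pages=5,
)
request_id = job.get("request_id") or job.get("id")

# Keep polling until the crawl reports completion
while True:
    result = client.smartcrawler_fetch_results(request_id)
    if result.get("status") == "completed":
        break
    time.sleep(5)

print(result)
client.close()
```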
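Since the server communicates over stdio (per mcp.run(transport="stdio")) and reads its API key from the SGAI_API_KEY environment variable, an MCP client can launch it from its configuration file. A sketch of a typical mcpServers entry follows, assuming the file above is saved as server.py; the command, path, and placeholder key are illustrative, not prescribed by this repository.

```json
{
  "mcpServers": {
    "scrapegraph": {
      "command": "python",
      "args": ["/path/to/server.py"],
      "env": {
        "SGAI_API_KEY": "your-api-key-here"
      }
    }
  }
}
```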

MCP directory API

We provide all the information about MCP servers via our MCP API.

```bash
curl -X GET 'https://glama.ai/api/mcp/v1/servers/ScrapeGraphAI/scrapegraph-mcp'
```

If you have feedback or need assistance with the MCP directory API, please join our Discord server.