Skip to main content
Glama

Trafilatura MCP Server

by fvanevski
trafilatura_mcp.py (4.52 kB)
# trafilatura_mcp.py
"""
MCP server exposing Trafilatura tools for web scraping.

This script provides an MCP server that allows you to extract main content,
metadata, and comments from web pages using the Trafilatura library. It
communicates over standard input/output (stdio) and is designed to be run as
a standalone command-line script.

Tools:
- fetch_and_extract: Fetches a URL and extracts the main content.

Run:
    python3 trafilatura_mcp.py
"""

from __future__ import annotations

import asyncio
import json
import logging
from typing import Any, Dict, List, Optional

import trafilatura
from pydantic import BaseModel, Field

from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.shared.exceptions import McpError
from mcp.types import (
    ErrorData,
    TextContent,
    Tool,
    INTERNAL_ERROR,
    INVALID_PARAMS,
)

# --- Basic Logging Setup ---
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


# --- Pydantic Model for Tool Input ---
class TrafilaturaInput(BaseModel):
    """Input model for the fetch_and_extract tool."""

    url: str = Field(..., description="The URL of the web page to process.")
    include_comments: bool = Field(
        default=False,
        description="Whether to include comment sections at the bottom of articles.",
    )
    include_tables: bool = Field(
        default=False,
        description="Extract text from HTML <table> elements.",
    )


# --- Trafilatura Interaction ---
def perform_trafilatura(args: TrafilaturaInput) -> str:
    """
    Fetch a URL with Trafilatura and extract its content as JSON.

    This function is synchronous: it performs the download and the
    extraction before returning.

    Args:
        args: A TrafilaturaInput object containing the URL and extraction
            options.

    Returns:
        The JSON string produced by ``trafilatura.extract``. When the
        extractor returns ``None``, a minimal JSON document with a null
        ``main_content`` and empty ``metadata`` is returned instead.

    Raises:
        McpError: With code INTERNAL_ERROR when the download yields nothing
            ("Failed to download content from URL: ...") or when any other
            exception is raised while fetching or extracting
            ("Unexpected error: ...").
    """
    logging.info("Executing fetch_and_extract for URL: '%s'", args.url)

    try:
        # Fetch and extract the content from the given URL
        downloaded = trafilatura.fetch_url(args.url)
        if downloaded is None:
            raise McpError(
                ErrorData(
                    code=INTERNAL_ERROR,
                    message=f"Failed to download content from URL: {args.url}",
                )
            )

        # Extract the main content and metadata as a JSON string
        json_output = trafilatura.extract(
            downloaded,
            include_comments=args.include_comments,
            include_tables=args.include_tables,
            output_format="json",
            with_metadata=True,
            url=args.url,
        )
    except McpError:
        # Already a well-formed protocol error (the download failure above);
        # let it through untouched instead of re-wrapping it as "unexpected".
        raise
    except Exception as e:
        logging.error(
            "An unexpected error occurred during Trafilatura processing: %s",
            e,
            exc_info=True,
        )
        raise McpError(
            ErrorData(code=INTERNAL_ERROR, message=f"Unexpected error: {e}")
        ) from e

    if json_output is None:
        # If trafilatura returns nothing, build a minimal JSON response
        return json.dumps({"main_content": None, "metadata": {}}, indent=4)

    return json_output


# --- MCP Server Implementation ---
async def serve():
    """Sets up and runs the MCP server using stdio."""
    server = Server("trafilatura")

    @server.list_tools()
    async def list_tools() -> list[Tool]:
        """Advertise the single ``fetch_and_extract`` tool and its schema."""
        return [
            Tool(
                name="fetch_and_extract",
                description=(
                    "Fetches a URL and extracts the main content, metadata, and comments. "
                    "Returns a JSON object with the extracted data."
                ),
                inputSchema=TrafilaturaInput.model_json_schema(),
            )
        ]

    @server.call_tool()
    async def call_tool(name: str, arguments: dict) -> list[TextContent]:
        """
        Validate the arguments and run ``fetch_and_extract``.

        Raises:
            McpError: With code INVALID_PARAMS for an unknown tool name or
                arguments that fail model validation; extraction errors
                from ``perform_trafilatura`` propagate unchanged.
        """
        if name != "fetch_and_extract":
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Unknown tool: {name}")
            )

        try:
            # Tolerate a missing arguments mapping; validation then reports
            # the absent required field instead of a TypeError on ``**None``.
            args = TrafilaturaInput(**(arguments or {}))
        except ValueError as e:
            raise McpError(ErrorData(code=INVALID_PARAMS, message=str(e))) from e

        # perform_trafilatura blocks on network I/O and parsing; run it in a
        # worker thread so the event loop keeps servicing the stdio streams.
        result_json_string = await asyncio.to_thread(perform_trafilatura, args)

        # Return the result as a JSON string
        return [TextContent(type="text", text=result_json_string)]

    options = server.create_initialization_options()
    async with stdio_server() as (read_stream, write_stream):
        await server.run(read_stream, write_stream, options)


if __name__ == "__main__":
    asyncio.run(serve())

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/fvanevski/trafilatura_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.