"""MCP tool for extracting content from various sources."""
import logging
import time
from typing import Dict, Any, Optional
from mcp_server_article_quadrant.models.content import ContentSource, ContentExtractionOptions, ContentExtractionResult
from mcp_server_article_quadrant.utils.content_extractor import ContentExtractor
from mcp_server_article_quadrant.utils.error_handling import handle_error, ErrorContext
logger = logging.getLogger(__name__)
# Global content extractor instance
_content_extractor: Optional[ContentExtractor] = None
def get_content_extractor() -> ContentExtractor:
"""Get or create global content extractor instance."""
global _content_extractor
if _content_extractor is None:
_content_extractor = ContentExtractor()
return _content_extractor
async def extract_article_content(
source: Dict[str, Any],
options: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Extract clean article content from URLs, local files, or direct text input with automatic preprocessing and metadata extraction.
This tool supports multiple content sources:
- URLs: Web pages, news articles, blog posts, WeChat public accounts
- Files: PDF, TXT, MD, DOCX documents
- Images: PNG, JPG, WEBP screenshots (with OCR)
- Direct text: Manual text input
Args:
source: Content source specification containing:
- type: Source type ("url", "file_path", "direct_text")
- content: URL, file path, or direct text content
- encoding: Text encoding for file processing (default: "utf-8")
options: Extraction options including:
- extract_metadata: Extract title, author, publication date (default: true)
- clean_html: Remove HTML tags and clean content (default: true)
- min_length: Minimum content length in characters (default: 100)
- timeout: Timeout in seconds for URL processing (default: 30)
- max_content_length: Maximum content length to process, in characters (default: 50000)
- max_file_size_mb: Maximum file size in MB (default: 50)
- language: Content language for OCR/analysis (default: "auto")
Returns:
Dictionary containing:
- success: Whether extraction was successful
- content: Extracted content with title, text, and metadata (if successful)
- processing_time: Time taken for extraction in seconds
- warnings: Processing warnings (if any)
- error: Error details (if extraction failed)
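Example return value on success (field values are illustrative):
{
"success": True,
"content": {"title": "...", "text": "...", "metadata": {...},
"sections": None, "links": None},
"processing_time": 0.42,
"warnings": []
}
On failure, "success" is False and an "error" dict (with "type",
"message", and "details") is returned instead of "content".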
Example usage:
# Extract from URL
result = await extract_article_content({
"type": "url",
"content": "https://example.com/article"
})
# Extract from file
result = await extract_article_content({
"type": "file_path",
"content": "/path/to/document.pdf",
"encoding": "utf-8"
})
# Extract from image (OCR)
result = await extract_article_content({
"type": "file_path",
"content": "/path/to/screenshot.png"
})
# Direct text input
result = await extract_article_content({
"type": "direct_text",
"content": "This is the article content to analyze..."
})
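# Combine any source with custom options (option values here are
# illustrative; see the Args section above for ranges and defaults)
result = await extract_article_content(
{"type": "url", "content": "https://example.com/article"},
options={"timeout": 60, "min_length": 200}
)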
"""
start_time = time.time()
options = options or {}
try:
# Validate the source shape first, before it is used for logging/context;
# calling source.get() on a non-dict would raise before validation ran
if not isinstance(source, dict):
return {
"success": False,
"error": {
"type": "ValidationError",
"message": "Source must be a dictionary",
"details": {"received_type": type(source).__name__}
},
"processing_time": time.time() - start_time
}
with ErrorContext("extract_article_content", context={"source_type": source.get("type")}):
logger.info(f"Extracting content from source type: {source.get('type')}")
# Validate required fields
source_type = source.get("type")
content = source.get("content")
if not source_type:
return {
"success": False,
"error": {
"type": "ValidationError",
"message": "Source must include 'type' field",
"details": {"missing_field": "type"}
},
"processing_time": time.time() - start_time
}
if not content:
return {
"success": False,
"error": {
"type": "ValidationError",
"message": "Source must include 'content' field",
"details": {"missing_field": "content"}
},
"processing_time": time.time() - start_time
}
# Validate source type
valid_types = ["url", "file_path", "direct_text"]
if source_type not in valid_types:
return {
"success": False,
"error": {
"type": "ValidationError",
"message": f"Invalid source type '{source_type}'. Valid types: {valid_types}",
"details": {"valid_types": valid_types, "received_type": source_type}
},
"processing_time": time.time() - start_time
}
# Set default options
default_options = {
"extract_metadata": True,
"clean_html": True,
"min_length": 100,
"timeout": 30,
"max_content_length": 50000,
"max_file_size_mb": 50,
"encoding": "utf-8",
"language": "auto"
}
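# User-supplied options override these defaults; a source-level
# encoding (handled below) overrides both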
extraction_options = {**default_options, **options}
# Add source encoding to options if specified
if "encoding" in source:
extraction_options["encoding"] = source["encoding"]
# Get content extractor and extract content
extractor = get_content_extractor()
result = await extractor.extract_content(source, extraction_options)
# Calculate processing time
processing_time = time.time() - start_time
if result.get("success"):
# Standardize successful response
content_data = result.get("content", {})
# Normalize the content payload to a dict so field access below is safe
if not isinstance(content_data, dict):
content_data = {}
standardized_response = {
"success": True,
"content": {
"title": content_data.get("title"),
"text": content_data.get("text", ""),
"metadata": content_data.get("metadata", {}),
"sections": content_data.get("sections"),
"links": content_data.get("links")
},
"processing_time": processing_time,
"warnings": result.get("warnings", [])
}
# Annotate extraction metadata unconditionally; the previous truthiness
# check skipped annotation whenever the extractor returned no metadata,
# silently dropping source_type and processing_options
metadata = standardized_response["content"]["metadata"]
metadata["extraction_timestamp"] = time.time()
metadata["source_type"] = source_type
metadata["processing_options"] = extraction_options
logger.info(f"Successfully extracted content from {source_type} source in {processing_time:.2f}s")
return standardized_response
else:
# Standardize error response; coerce non-dict errors (e.g. plain
# strings) into the dict shape so callers always see the same fields
error_data = result.get("error", {})
if not isinstance(error_data, dict):
error_data = {"type": "ExtractionError", "message": str(error_data)}
error_data.update({
"processing_time": processing_time,
"source_type": source_type
})
return {
"success": False,
"error": error_data,
"processing_time": processing_time
}
except Exception as e:
logger.error(f"Unexpected error in extract_article_content: {e}")
processing_time = time.time() - start_time
return handle_error(
e,
context={
"source": source,
"options": options,
"processing_time": processing_time
},
logger=logger
)
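# Minimal manual smoke test; a sketch assuming the package imports above
# resolve. The `_demo` coroutine is illustrative, not part of the tool API.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await extract_article_content(
            {"type": "direct_text", "content": "Sample article text. " * 20},
            options={"min_length": 50}
        )
        print(result.get("success"), result.get("processing_time"))

    asyncio.run(_demo())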
# Tool metadata for FastMCP
TOOL_METADATA = {
"name": "extract_article_content",
"description": """
Extract clean article content from URLs, local files, or direct text input with automatic preprocessing and metadata extraction.
Supports multiple content sources:
- URLs: Web pages, news articles, blog posts, WeChat public accounts
- Files: PDF, TXT, MD, DOCX documents
- Images: PNG, JPG, WEBP screenshots (with OCR)
- Direct text: Manual text input
Features:
- Automatic content cleaning and preprocessing
- Metadata extraction (title, author, publication date)
- OCR support for images and scanned documents
- Multi-language support (English, Chinese, etc.)
- Configurable processing limits and timeouts
""",
"annotations": {
"readOnlyHint": False, # May modify content through cleaning
"destructiveHint": False, # Content extraction is non-destructive
"idempotentHint": True, # Same input produces same output
"openWorldHint": True # External URLs introduce open-world assumptions
},
"input_schema": {
"type": "object",
"properties": {
"source": {
"type": "object",
"description": "Content source specification",
"properties": {
"type": {
"type": "string",
"enum": ["url", "file_path", "direct_text"],
"description": "Type of content source",
"examples": ["url", "file_path", "direct_text"]
},
"content": {
"type": "string",
"description": "URL, file path, or direct text content",
"examples": [
"https://example.com/article",
"/path/to/document.pdf",
"This is the article content to analyze..."
]
},
"encoding": {
"type": "string",
"default": "utf-8",
"description": "Text encoding for file processing",
"examples": ["utf-8", "gbk", "latin-1"]
}
},
"required": ["type", "content"]
},
"options": {
"type": "object",
"description": "Extraction options",
"properties": {
"extract_metadata": {
"type": "boolean",
"default": True,
"description": "Extract title, author, publication date"
},
"clean_html": {
"type": "boolean",
"default": True,
"description": "Remove HTML tags and clean content"
},
"min_length": {
"type": "integer",
"default": 100,
"minimum": 10,
"maximum": 1000,
"description": "Minimum content length in characters"
},
"timeout": {
"type": "integer",
"default": 30,
"minimum": 5,
"maximum": 300,
"description": "Timeout in seconds for URL processing"
},
"max_content_length": {
"type": "integer",
"default": 50000,
"minimum": 1000,
"maximum": 200000,
"description": "Maximum content length to process"
},
"max_file_size_mb": {
"type": "integer",
"default": 50,
"minimum": 1,
"maximum": 200,
"description": "Maximum file size in MB"
},
"language": {
"type": "string",
"default": "auto",
"enum": ["auto", "en", "zh", "es", "fr", "de", "ja"],
"description": "Content language for OCR/analysis"
}
}
}
},
"required": ["source"]
},
"examples": [
{
"description": "Extract content from a news article URL",
"input": {
"source": {
"type": "url",
"content": "https://www.bbc.com/news/technology-123456"
},
"options": {
"timeout": 30,
"max_content_length": 10000
}
}
},
{
"description": "Extract text from a PDF document",
"input": {
"source": {
"type": "file_path",
"content": "/path/to/research.pdf"
},
"options": {
"max_file_size_mb": 20,
"extract_metadata": True
}
}
},
{
"description": "Extract text from a screenshot using OCR",
"input": {
"source": {
"type": "file_path",
"content": "/path/to/screenshot.png"
},
"options": {
"language": "auto"
}
}
},
{
"description": "Process direct text input",
"input": {
"source": {
"type": "direct_text",
"content": """
Artificial intelligence is transforming healthcare by enabling more accurate diagnoses,
personalized treatment plans, and predictive analytics. Machine learning algorithms can
analyze medical images, detect patterns in patient data, and assist in drug discovery.
These technologies have the potential to improve patient outcomes while reducing costs.
""".strip()
},
"options": {
"extract_metadata": True
}
}
}
]
}
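# A minimal registration sketch, assuming the FastMCP server from the MCP
# Python SDK; registration APIs vary across SDK versions, so treat this as
# illustrative rather than as this project's actual wiring:
#
#     from mcp.server.fastmcp import FastMCP
#
#     mcp = FastMCP("article-quadrant")
#     mcp.add_tool(
#         extract_article_content,
#         name=TOOL_METADATA["name"],
#         description=TOOL_METADATA["description"],
#     )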