PDF Knowledgebase MCP Server

summarizer_remote.py•10.9 KiB

"""Remote LLM-based document summarization service using OpenAI-compatible APIs."""

import json
import logging
import re
from typing import Dict

from .config import ServerConfig
from .exceptions import EmbeddingError
from .summarizer_base import DocumentSummary, SummarizerService

logger = logging.getLogger(__name__)

# Greedy match from the first "{" to the last "}" in a model reply. This strips
# any prose or code fences the model wrapped around the JSON object.
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)


class RemoteSummarizerService(SummarizerService):
    """Remote summarization service using OpenAI-compatible APIs."""

    def __init__(self, config: ServerConfig):
        """Initialize the remote summarizer service.

        No network access happens here; the API client is created lazily by
        ``initialize()``.

        Args:
            config: Server configuration.
        """
        self.config = config
        # Summarizer-specific settings take precedence over the general OpenAI ones.
        self.api_key = config.summarizer_api_key or config.openai_api_key
        self.api_base = config.summarizer_api_base or config.openai_api_base
        self.model_name = config.summarizer_model
        self.max_pages = config.summarizer_max_pages
        self._initialized = False

        # OpenAI client will be initialized in initialize()
        self.client = None

    async def initialize(self) -> None:
        """Initialize the remote summarizer service.

        Calling this again after a successful initialization is a no-op.

        Raises:
            EmbeddingError: If no usable API key is configured, the ``openai``
                package is not installed, or the client cannot be created.
        """
        if self._initialized:
            return

        # The dummy key is rejected explicitly: it is a placeholder, not a credential.
        if not self.api_key or self.api_key == "sk-local-embeddings-dummy-key":
            raise EmbeddingError(
                "OpenAI API key required for remote summarizer. Set PDFKB_OPENAI_API_KEY or PDFKB_SUMMARIZER_API_KEY",
                self.model_name,
            )

        try:
            # Imported lazily so the module can be loaded without the package installed.
            from openai import AsyncOpenAI

            # Initialize OpenAI client with custom base URL if provided
            client_kwargs = {"api_key": self.api_key}
            if self.api_base:
                client_kwargs["base_url"] = self.api_base
                logger.info(f"Using custom API base: {self.api_base}")

            self.client = AsyncOpenAI(**client_kwargs)
            logger.info(f"Remote summarizer service initialized with model: {self.model_name}")
            self._initialized = True
        except ImportError as e:
            raise EmbeddingError(
                f"OpenAI package not installed. Install with: pip install openai: {e}",
                self.model_name,
            ) from e
        except Exception as e:
            raise EmbeddingError(
                f"Failed to initialize remote summarizer service: {e}", self.model_name, e
            ) from e

    def _create_summarization_prompt(self, content: str, filename: str = "") -> str:
        """Create a comprehensive prompt for document summarization.

        Args:
            content: Document content to summarize.
            filename: Optional filename for context.

        Returns:
            Formatted prompt for the LLM.
        """
        filename_context = f" The document filename is: {filename}." if filename else ""

        return (
            f"You are an expert document analyst. Your task is to analyze the provided document "
            f"and create a comprehensive summary with three components: a title, a short description, "
            f"and a long description.{filename_context}\n\n"
            f"Please analyze the following document content and provide:\n\n"
            f"1. **Title**: A clear, descriptive title that captures the main subject/purpose (max 80 characters)\n"
            f"2. **Short Description**: A concise 1-2 sentence summary highlighting "
            f"the key topic and purpose (max 200 characters)\n"
            f"3. **Long Description**: A detailed paragraph explaining the document's content, "
            f"key points, methodology, findings, or conclusions (max 500 characters)\n\n"
            f"**Important**: Return your response as a valid JSON object with exactly these keys: "
            f'"title", "short_description", "long_description". Do not include any other text outside the JSON.\n\n'
            f"Document content:\n{content}"
        )

    def _truncate_content(self, content: str, max_tokens: int = 30000) -> str:
        """Truncate content to fit within the API's context window.

        Args:
            content: Original document content.
            max_tokens: Maximum tokens to allow for content.

        Returns:
            The content unchanged if it is short enough; otherwise a prefix of
            it followed by a truncation marker.
        """
        # Rough estimation: 4 characters per token
        max_content_chars = max_tokens * 4

        if len(content) <= max_content_chars:
            return content

        truncated = content[:max_content_chars]

        # Prefer ending on a sentence boundary, but only if a period occurs in
        # the last 20% of the kept text, so little content is given up for it.
        last_period = truncated.rfind(".")
        if last_period > max_content_chars * 0.8:
            truncated = truncated[: last_period + 1]

        return truncated + "\n\n[Content truncated due to length...]"

    async def _request_summary(self, content: str, filename: str = "") -> str:
        """Send one summarization request and return the model's raw reply.

        Unlike ``summarize_document`` this does not swallow errors, so callers
        can tell a working API from a failing one.

        Args:
            content: The document content to summarize (truncated here if needed).
            filename: Optional filename for context.

        Returns:
            The text of the first completion choice, or "" if the API returned
            no text for it.

        Raises:
            Exception: Whatever the API client raises, or ``IndexError`` if the
                response contains no choices.
        """
        prompt = self._create_summarization_prompt(self._truncate_content(content), filename)

        response = await self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an expert document analyst. Always respond with valid JSON "
                        "containing title, short_description, and long_description keys."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=1024,
        )

        return response.choices[0].message.content or ""

    async def summarize_document(self, content: str, filename: str = "") -> DocumentSummary:
        """Summarize a document using the remote LLM.

        This is best-effort: once the service is initialized, API and parsing
        failures are logged and replaced by a placeholder summary rather than
        raised.

        Args:
            content: The document content to summarize.
            filename: Optional filename for context.

        Returns:
            DocumentSummary with title, short description, and long description.

        Raises:
            ValueError: If ``content`` is empty or whitespace-only.
            EmbeddingError: If the service is not yet initialized and
                initialization fails.
        """
        if not content or not content.strip():
            raise ValueError("Content cannot be empty")

        if not self._initialized:
            await self.initialize()

        try:
            response_content = await self._request_summary(content, filename)
        except Exception as e:
            logger.error(f"Failed to summarize document with remote API: {e}")
            # Return a basic fallback summary
            title = filename if filename else "Document"
            return DocumentSummary(
                title=title,
                short_description="Document summary unavailable due to API error",
                long_description=(
                    "This document could not be automatically summarized due to an API error. "
                    f"Original content length: {len(content)} characters."
                ),
            )

        try:
            return self._parse_summary_response(response_content, filename)
        except Exception as e:
            logger.warning(f"Failed to parse API response, using fallback: {e}")
            return self._create_fallback_summary(content, filename)

    @staticmethod
    def _load_json_object(text: str) -> dict:
        """Extract and decode the JSON object embedded in a model reply.

        Args:
            text: The model's reply, possibly with text around the JSON.

        Returns:
            The decoded JSON object.

        Raises:
            json.JSONDecodeError: If no decodable JSON is found.
            ValueError: If the decoded JSON is not an object.
        """
        match = _JSON_OBJECT_RE.search(text)
        candidate = match.group(0) if match else text

        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # The greedy match over-captures when text after the object also
            # contains "}". Retry by decoding a single JSON value starting at
            # each "{" in turn; raw_decode ignores whatever follows the value.
            decoder = json.JSONDecoder()
            parsed = None
            start = text.find("{")
            while start != -1:
                try:
                    parsed, _ = decoder.raw_decode(text, start)
                    break
                except json.JSONDecodeError:
                    start = text.find("{", start + 1)
            if parsed is None:
                raise

        if not isinstance(parsed, dict):
            raise ValueError("Response JSON is not an object")
        return parsed

    @staticmethod
    def _text_field(parsed: dict, key: str) -> str:
        """Return ``parsed[key]`` stripped, or "" if it is absent or not a string."""
        value = parsed.get(key)
        return value.strip() if isinstance(value, str) else ""

    def _parse_summary_response(self, response: str, filename: str = "") -> DocumentSummary:
        """Parse the API's JSON response into a DocumentSummary.

        Args:
            response: The API's response string.
            filename: Optional filename for fallback. Currently unused; kept
                for interface compatibility.

        Returns:
            Parsed DocumentSummary, with each field cut to the length limit
            requested in the prompt (80 / 200 / 500 characters).

        Raises:
            json.JSONDecodeError: If the response contains no decodable JSON.
            ValueError: If the JSON is not an object, or any of the three
                fields is missing, empty, or not a string.
        """
        try:
            parsed = self._load_json_object((response or "").strip())

            # Validate required fields
            title = self._text_field(parsed, "title")
            short_desc = self._text_field(parsed, "short_description")
            long_desc = self._text_field(parsed, "long_description")

            if not title or not short_desc or not long_desc:
                raise ValueError("Missing required fields in response")
        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logger.warning(f"Failed to parse JSON response: {e}")
            raise

        # Slicing is a no-op for values already within the limit.
        return DocumentSummary(
            title=title[:80],
            short_description=short_desc[:200],
            long_description=long_desc[:500],
        )

    def _create_fallback_summary(self, content: str, filename: str = "") -> DocumentSummary:
        """Create a basic fallback summary when API parsing fails.

        Args:
            content: Original document content.
            filename: Optional filename.

        Returns:
            Basic DocumentSummary built from the filename and simple counts.
        """
        # Use filename as title if available, otherwise generic
        title = filename.replace(".pdf", "").replace("_", " ").title() if filename else "Document"

        # Create basic descriptions
        word_count = len(content.split())
        char_count = len(content)

        short_desc = f"Document with {word_count} words"
        long_desc = (
            f"This document contains {word_count} words and {char_count} characters. "
            f"Automatic summarization was not available, but the document appears to contain "
            f"structured content suitable for analysis."
        )

        return DocumentSummary(
            title=title,
            short_description=short_desc,
            long_description=long_desc,
        )

    async def test_connection(self) -> bool:
        """Test the remote summarizer service.

        The request and the parser are called directly rather than through
        ``summarize_document``. That method replaces every failure with a
        non-empty placeholder summary, which would make this check always pass.

        Returns:
            True if the API answered and the answer parsed into a complete
            summary, False otherwise.
        """
        try:
            if not self._initialized:
                await self.initialize()

            # Test with simple content
            raw_response = await self._request_summary(
                "This is a test document for validating the summarization service.", "test.pdf"
            )
            test_summary = self._parse_summary_response(raw_response, "test.pdf")

            return bool(test_summary.title and test_summary.short_description and test_summary.long_description)
        except Exception as e:
            logger.error(f"Remote summarizer service test failed: {e}")
            return False

    def get_model_info(self) -> Dict:
        """Get information about the current summarizer model.

        Returns:
            Dictionary with model information.
        """
        return {
            "provider": "remote",
            "model": self.model_name,
            # With no custom base configured, report the public OpenAI endpoint.
            "api_base": self.api_base or "https://api.openai.com/v1",
            "description": "Remote LLM via OpenAI-compatible API",
            "max_pages": self.max_pages,
        }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

summarizer_remote.py•10.9 KiB