Skip to main content
Glama
juanqui
by juanqui
summarizer_remote.py (11.1 kB)
"""Remote LLM-based document summarization service using OpenAI-compatible APIs."""

import json
import logging
import re
from typing import Dict

from .config import ServerConfig
from .exceptions import EmbeddingError
from .summarizer_base import DocumentSummary, SummarizerService

logger = logging.getLogger(__name__)


class RemoteSummarizerService(SummarizerService):
    """Remote summarization service using OpenAI-compatible APIs."""

    def __init__(self, config: ServerConfig):
        """Initialize the remote summarizer service.

        No network or client setup happens here; the OpenAI client is created
        lazily by ``initialize()``.

        Args:
            config: Server configuration.
        """
        self.config = config
        # Summarizer-specific settings take precedence over the general OpenAI ones.
        self.api_key = config.summarizer_api_key or config.openai_api_key
        self.api_base = config.summarizer_api_base or config.openai_api_base
        self.model_name = config.summarizer_model
        self.max_pages = config.summarizer_max_pages
        self._initialized = False

        # OpenAI client will be initialized in initialize()
        self.client = None

    async def initialize(self) -> None:
        """Initialize the remote summarizer service.

        Creates the ``AsyncOpenAI`` client. Calling this again after a
        successful initialization is a no-op.

        Raises:
            EmbeddingError: If no usable API key is configured, the ``openai``
                package is not installed, or the client cannot be constructed.
        """
        if self._initialized:
            return

        if not self.api_key or self.api_key == "sk-local-embeddings-dummy-key":
            raise EmbeddingError(
                "OpenAI API key required for remote summarizer. Set PDFKB_OPENAI_API_KEY or PDFKB_SUMMARIZER_API_KEY",
                self.model_name,
            )

        try:
            # Imported lazily so the module can be loaded without the openai package.
            from openai import AsyncOpenAI

            # Initialize OpenAI client with custom base URL if provided
            client_kwargs = {"api_key": self.api_key}
            if self.api_base:
                client_kwargs["base_url"] = self.api_base
                logger.info("Using custom API base: %s", self.api_base)

            self.client = AsyncOpenAI(**client_kwargs)
            logger.info("Remote summarizer service initialized with model: %s", self.model_name)
            self._initialized = True

        except ImportError as e:
            # Chain the cause so the original traceback is preserved.
            raise EmbeddingError(
                f"OpenAI package not installed. Install with: pip install openai: {e}",
                self.model_name,
            ) from e
        except Exception as e:
            raise EmbeddingError(
                f"Failed to initialize remote summarizer service: {e}", self.model_name, e
            ) from e

    def _create_summarization_prompt(self, content: str, filename: str = "") -> str:
        """Create a comprehensive prompt for document summarization.

        Args:
            content: Document content to summarize.
            filename: Optional filename for context.

        Returns:
            Formatted prompt for the LLM.
        """
        filename_context = f" The document filename is: {filename}." if filename else ""

        return (
            f"You are an expert document analyst. Your task is to analyze the provided document "
            f"and create a comprehensive summary with three components: a title, a short description, "
            f"and a long description.{filename_context}\n\n"
            f"Please analyze the following document content and provide:\n\n"
            f"1. **Title**: A clear, descriptive title that captures the main subject/purpose (max 80 characters)\n"
            f"2. **Short Description**: A concise 1-2 sentence summary highlighting "
            f"the key topic and purpose (max 200 characters)\n"
            f"3. **Long Description**: A detailed paragraph explaining the document's content, "
            f"key points, methodology, findings, or conclusions (max 500 characters)\n\n"
            f"**Important**: Return your response as a valid JSON object with exactly these keys: "
            f'"title", "short_description", "long_description". Do not include any other text outside the JSON.\n\n'
            f"Document content:\n{content}"
        )

    def _truncate_content(self, content: str, max_tokens: int = 30000) -> str:
        """Truncate content to fit within the API's context window.

        Args:
            content: Original document content.
            max_tokens: Maximum tokens to allow for content.

        Returns:
            The content unchanged if it is short enough; otherwise a truncated
            copy followed by a truncation marker.
        """
        # Rough estimation: 4 characters per token
        max_content_chars = max_tokens * 4

        if len(content) <= max_content_chars:
            return content

        truncated = content[:max_content_chars]

        # Prefer ending on a sentence boundary, but only if that boundary lies
        # in the last 20% of the kept text so we do not discard too much.
        last_period = truncated.rfind(".")
        if last_period > max_content_chars * 0.8:
            truncated = truncated[: last_period + 1]

        return truncated + "\n\n[Content truncated due to length...]"

    async def _request_completion(self, content: str, filename: str = "") -> str:
        """Send one summarization request and return the raw model reply.

        Unlike ``summarize_document`` this helper performs no error handling,
        so callers can tell a real API failure from a fallback summary.

        Args:
            content: The document content to summarize (truncated if too long).
            filename: Optional filename for context.

        Returns:
            The message content of the first choice, as returned by the API.
        """
        prompt = self._create_summarization_prompt(self._truncate_content(content), filename)

        response = await self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an expert document analyst. Always respond with valid JSON "
                        "containing title, short_description, and long_description keys."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=1024,
        )
        return response.choices[0].message.content

    async def summarize_document(self, content: str, filename: str = "") -> DocumentSummary:
        """Summarize a document using the remote LLM.

        API and parsing failures never propagate: an unparseable reply yields
        a statistics-based fallback summary, and an API error yields a generic
        "summary unavailable" summary.

        Args:
            content: The document content to summarize.
            filename: Optional filename for context.

        Returns:
            DocumentSummary with title, short description, and long description.

        Raises:
            ValueError: If ``content`` is empty or whitespace-only.
            EmbeddingError: If the service has to be initialized and that fails.
        """
        if not content or not content.strip():
            raise ValueError("Content cannot be empty")

        if not self._initialized:
            await self.initialize()

        try:
            response_content = await self._request_completion(content, filename)

            try:
                return self._parse_summary_response(response_content, filename)
            except Exception as e:
                logger.warning("Failed to parse API response, using fallback: %s", e)
                return self._create_fallback_summary(content, filename)

        except Exception as e:
            logger.error("Failed to summarize document with remote API: %s", e)
            # Return a basic fallback summary
            title = filename if filename else "Document"
            return DocumentSummary(
                title=title,
                short_description="Document summary unavailable due to API error",
                long_description=(
                    "This document could not be automatically summarized due to an API error. "
                    f"Original content length: {len(content)} characters."
                ),
            )

    @staticmethod
    def _extract_json_object(text: str) -> Dict:
        """Return the first JSON object embedded in ``text``.

        Decoding is attempted at every ``{`` in turn, so prose before or after
        the object (even prose containing braces) does not break parsing.

        Raises:
            ValueError: If no position in ``text`` starts a decodable JSON object.
        """
        decoder = json.JSONDecoder()
        for brace in re.finditer(r"\{", text):
            try:
                candidate, _ = decoder.raw_decode(text, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                return candidate
        raise ValueError("No JSON object found in response")

    def _parse_summary_response(self, response: str, filename: str = "") -> DocumentSummary:
        """Parse the API's JSON response into a DocumentSummary.

        Args:
            response: The API's response string.
            filename: Optional filename for fallback (currently unused here).

        Returns:
            Parsed DocumentSummary, with the title capped at 80 characters, the
            short description at 200 and the long description at 500.

        Raises:
            ValueError: If the response is empty, contains no JSON object, or a
                required field is missing, blank, or not a string.
        """
        try:
            # The API may return no content at all (None), so check before stripping.
            if not isinstance(response, str) or not response.strip():
                raise ValueError("Empty response from API")

            parsed = self._extract_json_object(response.strip())

            # Validate required fields
            fields = {}
            for key in ("title", "short_description", "long_description"):
                value = parsed.get(key)
                if not isinstance(value, str) or not value.strip():
                    raise ValueError("Missing required fields in response")
                fields[key] = value.strip()

            # Enforce the length limits requested in the prompt.
            return DocumentSummary(
                title=fields["title"][:80],
                short_description=fields["short_description"][:200],
                long_description=fields["long_description"][:500],
            )

        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logger.warning("Failed to parse JSON response: %s", e)
            raise

    def _create_fallback_summary(self, content: str, filename: str = "") -> DocumentSummary:
        """Create a basic fallback summary when API parsing fails.

        Args:
            content: Original document content.
            filename: Optional filename.

        Returns:
            Basic DocumentSummary built from the filename and simple statistics.
        """
        # Strip only a trailing ".pdf" (any letter case), not every occurrence.
        stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
        # A filename such as ".pdf" leaves nothing behind; use the generic title.
        title = stem.replace("_", " ").title() or "Document"

        # Create basic descriptions
        word_count = len(content.split())
        char_count = len(content)

        short_desc = f"Document with {word_count} words"
        long_desc = (
            f"This document contains {word_count} words and {char_count} characters. "
            f"Automatic summarization was not available, but the document appears to contain "
            f"structured content suitable for analysis."
        )

        return DocumentSummary(
            title=title,
            short_description=short_desc,
            long_description=long_desc,
        )

    async def test_connection(self) -> bool:
        """Test the remote summarizer service.

        The request is sent through ``_request_completion`` rather than
        ``summarize_document``, because the latter converts every API error
        into a fallback summary and would make this check always succeed.

        Returns:
            True if the API call succeeds and returns non-empty content,
            False otherwise. An unparseable reply is logged as a warning but
            does not fail the check, since ``summarize_document`` falls back
            to a basic summary in that case.
        """
        try:
            if not self._initialized:
                await self.initialize()

            # Test with simple content
            raw = await self._request_completion(
                "This is a test document for validating the summarization service.", "test.pdf"
            )
            if not raw or not raw.strip():
                logger.error("Remote summarizer service test failed: empty response from API")
                return False

            try:
                self._parse_summary_response(raw, "test.pdf")
            except Exception as e:
                logger.warning("Remote summarizer is reachable but returned an unparseable summary: %s", e)

            return True

        except Exception as e:
            logger.error("Remote summarizer service test failed: %s", e)
            return False

    def get_model_info(self) -> Dict:
        """Get information about the current summarizer model.

        Returns:
            Dictionary with model information.
        """
        return {
            "provider": "remote",
            "model": self.model_name,
            # With no custom base configured, report the public OpenAI endpoint.
            "api_base": self.api_base or "https://api.openai.com/v1",
            "description": "Remote LLM via OpenAI-compatible API",
            "max_pages": self.max_pages,
        }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.