Skip to main content
Glama
juanqui
by juanqui
models.py (9.2 kB)
"""Data models and schemas for the PDF Knowledgebase server."""

import hashlib
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from uuid import uuid4  # noqa: F401 -- no longer used below; kept so the module's importable names are unchanged


def _parse_iso_datetime(value: Any) -> Optional[datetime]:
    """Convert a serialized timestamp back into a ``datetime``.

    Args:
        value: ``None``/empty, an ISO-8601 string (a trailing ``Z`` is
            accepted as UTC), or an already-parsed ``datetime``.

    Returns:
        The parsed ``datetime``, or ``None`` when ``value`` is empty.
    """
    if not value:
        return None
    if isinstance(value, datetime):
        return value
    # datetime.fromisoformat() only accepts a "Z" suffix on newer Python
    # versions, so normalise it to an explicit UTC offset first.
    return datetime.fromisoformat(value.replace("Z", "+00:00"))


@dataclass
class Chunk:
    """Represents a text chunk from a document.

    When ``id`` is left empty, a deterministic ID is derived from
    ``document_id``, ``chunk_index``, ``text`` and ``page_number``.
    """

    id: str = ""
    document_id: str = ""
    text: str = ""
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    page_number: Optional[int] = None
    chunk_index: int = 0

    def __post_init__(self):
        """Set default metadata and generate deterministic ID."""
        # Generate deterministic ID based on content
        if not self.id:
            self.id = self._generate_content_id()

        # Stamp the creation time once; an existing value is preserved.
        if "created_at" not in self.metadata:
            self.metadata["created_at"] = datetime.now(timezone.utc).isoformat()

    def _generate_content_id(self) -> str:
        """Generate a deterministic ID based on chunk content.

        Returns:
            Deterministic hash-based ID for the chunk.
        """
        # Create a unique string from the chunk's key characteristics
        content_string = f"{self.document_id}|{self.chunk_index}|{self.text}|{self.page_number}"

        # Generate SHA-256 hash
        chunk_hash = hashlib.sha256(content_string.encode("utf-8")).hexdigest()

        # Return first 16 characters for readability (still extremely unlikely to collide)
        return f"chunk_{chunk_hash[:16]}"

    @property
    def has_embedding(self) -> bool:
        """Check if chunk has a non-empty embedding."""
        return self.embedding is not None and len(self.embedding) > 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert chunk to dictionary.

        Returns:
            A dict with one entry per field. ``metadata`` and ``embedding``
            are the chunk's own objects, not copies.
        """
        return {
            "id": self.id,
            "document_id": self.document_id,
            "text": self.text,
            "embedding": self.embedding,
            "metadata": self.metadata,
            "page_number": self.page_number,
            "chunk_index": self.chunk_index,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Chunk":
        """Create chunk from dictionary.

        Args:
            data: Mapping as produced by :meth:`to_dict`. Missing keys fall
                back to the field defaults.

        Returns:
            A new ``Chunk``. If ``data`` has no ``id``, the deterministic
            content-based ID is generated, so loading the same payload twice
            yields the same ID.
        """
        return cls(
            # Empty ID lets __post_init__ derive the deterministic one.
            id=data.get("id") or "",
            document_id=data.get("document_id", ""),
            text=data.get("text", ""),
            embedding=data.get("embedding"),
            # Copy so __post_init__ does not add "created_at" to the caller's
            # dict; "or {}" tolerates an explicit null in serialized data.
            metadata=dict(data.get("metadata") or {}),
            page_number=data.get("page_number"),
            chunk_index=data.get("chunk_index", 0),
        )


@dataclass
class Document:
    """Represents a processed PDF document.

    When ``id`` is left empty, a deterministic ID is derived from ``path``
    and ``checksum``.
    """

    id: str = ""
    path: str = ""
    title: Optional[str] = None
    chunks: List[Chunk] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    checksum: str = ""
    page_count: int = 0
    chunk_count: int = 0
    file_size: int = 0
    added_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

    def __post_init__(self):
        """Set default values and generate deterministic ID."""
        if self.added_at is None:
            self.added_at = datetime.now(timezone.utc)
        if self.updated_at is None:
            self.updated_at = self.added_at

        # Sync the count with the loaded chunks. When no chunks are loaded
        # (e.g. a document restored from to_dict(include_chunks=False)) keep
        # the stored count instead of resetting it to zero.
        if self.chunks:
            self.chunk_count = len(self.chunks)

        # Generate deterministic ID based on file path and checksum
        if not self.id:
            self.id = self._generate_document_id()

        # Set default metadata (existing values are preserved)
        self.metadata.setdefault("source", self.path)
        self.metadata.setdefault("added_at", self.added_at.isoformat())

    def _generate_document_id(self) -> str:
        """Generate a deterministic ID based on document characteristics.

        Returns:
            Deterministic hash-based ID for the document.
        """
        # Use file path and checksum for uniqueness
        content_string = f"{self.path}|{self.checksum}"

        # Generate SHA-256 hash
        doc_hash = hashlib.sha256(content_string.encode("utf-8")).hexdigest()

        # Return first 16 characters for readability
        return f"doc_{doc_hash[:16]}"

    @property
    def filename(self) -> str:
        """Get the filename (final path component) from the path."""
        return Path(self.path).name

    @property
    def has_chunks(self) -> bool:
        """Check if document has chunks loaded."""
        return len(self.chunks) > 0

    @property
    def has_embeddings(self) -> bool:
        """Check if all chunks have embeddings.

        Note:
            This is ``True`` for a document with no chunks, because ``all()``
            over an empty sequence is ``True``.
        """
        return all(chunk.has_embedding for chunk in self.chunks)

    def add_chunk(self, chunk: Chunk) -> None:
        """Add a chunk to the document.

        The chunk's ``document_id`` is pointed at this document, the chunk
        count is refreshed and ``updated_at`` is set to the current UTC time.
        The chunk's own ``id`` is left untouched.

        Args:
            chunk: The chunk to append.
        """
        chunk.document_id = self.id
        self.chunks.append(chunk)
        self.chunk_count = len(self.chunks)
        self.updated_at = datetime.now(timezone.utc)

    def to_dict(self, include_chunks: bool = True) -> Dict[str, Any]:
        """Convert document to dictionary.

        Args:
            include_chunks: Whether to include chunks in the output.

        Returns:
            A dict of the document's fields with timestamps rendered as
            ISO-8601 strings, plus a ``"chunks"`` list when requested.
        """
        result = {
            "id": self.id,
            "path": self.path,
            "title": self.title,
            "metadata": self.metadata,
            "checksum": self.checksum,
            "page_count": self.page_count,
            "chunk_count": self.chunk_count,
            "file_size": self.file_size,
            "added_at": self.added_at.isoformat() if self.added_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
        }

        if include_chunks:
            result["chunks"] = [chunk.to_dict() for chunk in self.chunks]

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Document":
        """Create document from dictionary.

        Args:
            data: Mapping as produced by :meth:`to_dict`. Missing keys fall
                back to the field defaults; ``"chunks"`` is optional.

        Returns:
            A new ``Document``. If ``data`` has no ``id``, the deterministic
            path/checksum-based ID is generated.
        """
        chunks = [Chunk.from_dict(chunk_data) for chunk_data in data.get("chunks") or []]

        return cls(
            # Empty ID lets __post_init__ derive the deterministic one.
            id=data.get("id") or "",
            path=data.get("path", ""),
            title=data.get("title"),
            chunks=chunks,
            # Copy so __post_init__ does not add defaults to the caller's dict.
            metadata=dict(data.get("metadata") or {}),
            checksum=data.get("checksum", ""),
            page_count=data.get("page_count", 0),
            chunk_count=data.get("chunk_count", 0),
            file_size=data.get("file_size", 0),
            added_at=_parse_iso_datetime(data.get("added_at")),
            updated_at=_parse_iso_datetime(data.get("updated_at")),
        )


@dataclass
class SearchResult:
    """Represents a search result."""

    chunk: Chunk
    score: float
    document: Document
    search_type: str = "hybrid"  # Which search contributed this result
    vector_score: Optional[float] = None  # Original vector similarity
    text_score: Optional[float] = None  # Original BM25 score

    def to_dict(self) -> Dict[str, Any]:
        """Convert search result to dictionary.

        Returns:
            A flat dict describing the matched chunk and its document. The
            ``"metadata"`` entry merges document and chunk metadata, with
            chunk values winning on key clashes.
        """
        return {
            "document_id": self.document.id,
            # Fall back to the file name when the document has no title.
            "document_title": self.document.title or self.document.filename,
            "document_path": self.document.path,
            "chunk_id": self.chunk.id,
            "chunk_text": self.chunk.text,
            "page_number": self.chunk.page_number,
            "chunk_index": self.chunk.chunk_index,
            "score": self.score,
            "metadata": {
                **self.document.metadata,
                **self.chunk.metadata,
            },
        }


@dataclass
class SearchQuery:
    """Represents a search query."""

    query: str
    limit: int = 5
    metadata_filter: Optional[Dict[str, Any]] = None
    min_score: float = 0.0
    search_type: str = "hybrid"  # "hybrid", "vector", "text"

    def __post_init__(self):
        """Validate query parameters.

        Raises:
            ValueError: If the query is blank, ``limit`` is not positive, or
                ``min_score`` is outside ``[0, 1]`` (NaN included).
        """
        if not self.query.strip():
            raise ValueError("Query cannot be empty")

        if self.limit <= 0:
            raise ValueError("Limit must be positive")

        # A chained comparison is False for NaN, so NaN is rejected as well.
        if not (0 <= self.min_score <= 1):
            raise ValueError("min_score must be between 0 and 1")


@dataclass
class ProcessingResult:
    """Represents the result of processing a document."""

    success: bool
    document: Optional[Document] = None
    error: Optional[str] = None
    processing_time: float = 0.0
    chunks_created: int = 0
    embeddings_generated: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert processing result to dictionary.

        Returns:
            A dict with the status and counters. ``"document"`` (serialized
            without its chunks) and ``"error"`` are only present when set.
        """
        result = {
            "success": self.success,
            "processing_time": self.processing_time,
            "chunks_created": self.chunks_created,
            "embeddings_generated": self.embeddings_generated,
        }

        if self.document is not None:
            result["document"] = self.document.to_dict(include_chunks=False)

        if self.error:
            result["error"] = self.error

        return result

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.