# schema.py (2.57 kB)
"""Pydantic models for document metadata and schema validation."""
import uuid
from datetime import datetime, timezone
from typing import List, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field
def _utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces ``datetime.utcnow()``, which is deprecated since Python 3.12.
    The tzinfo is stripped on purpose so the value has the same shape
    (naive, UTC wall-clock) that ``utcnow()`` produced.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentMetadata(BaseModel):
    """Rich metadata for documents stored in the knowledge base."""

    # Required fields (no default): namespace, content_type, source.
    # Everything else is optional or has a default.

    # Identity
    # `id` defaults to a freshly generated random UUID4 rendered as a string.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    namespace: str = Field(
        description="Hierarchical organization: 'documents/work/project-x', 'notes/personal'"
    )

    # Classification
    # `content_type` is restricted to the four listed literals.
    content_type: Literal["document", "note", "conversation", "snippet"]
    category: Optional[str] = Field(
        default=None,
        description="Broad category: 'work', 'personal', 'family'",
    )
    # default_factory gives each instance its own empty list.
    tags: List[str] = Field(default_factory=list)

    # Source tracking
    source: str = Field(
        description="File path, URL, or 'manual'"
    )
    source_hash: Optional[str] = Field(
        default=None,
        description="SHA256 hash for change detection",
    )
    source_modified_at: Optional[datetime] = None

    # Versioning & evolution
    schema_version: int = Field(
        default=1,
        description="Schema version for future migrations",
    )
    # Both timestamps default to "now" in UTC as naive datetimes. Each factory
    # is called separately, so the two defaults may differ by a few
    # microseconds on a freshly created instance.
    created_at: datetime = Field(default_factory=_utc_now)
    updated_at: datetime = Field(default_factory=_utc_now)

    # Chunking (for multi-chunk documents)
    document_id: Optional[str] = Field(
        default=None,
        description="Parent document ID if this is a chunk",
    )
    # NOTE(review): whether chunk_index is 0- or 1-based is not established
    # in this file - confirm against the chunking code.
    chunk_index: Optional[int] = None
    total_chunks: Optional[int] = None

    # Rich context
    title: Optional[str] = None
    author: Optional[str] = None
    description: Optional[str] = None

    # Audit trail
    ingestion_method: Literal["mcp_tool", "bulk_import", "api", "watch_folder"] = "mcp_tool"
    ingested_by: Optional[str] = Field(
        default=None,
        description="User or system identifier",
    )

    # Datetimes are rendered with `isoformat()` when encoding to JSON.
    # NOTE(review): `json_encoders` is documented as deprecated in Pydantic v2;
    # kept here so the JSON output is unchanged. Consider a field serializer
    # when migrating.
    model_config = ConfigDict(
        json_encoders={
            datetime: lambda v: v.isoformat()
        }
    )
class SearchResult(BaseModel):
    """Result from a semantic search query."""

    # Model-level configuration is declared first so it is visible before
    # the fields it applies to.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Identifier of the matched document (required).
    document_id: str
    # Identifier of the matched chunk; None when not supplied.
    chunk_id: Optional[str] = Field(default=None)
    # Score reported for this hit (required).
    # NOTE(review): scale and direction (higher-is-better?) are not
    # established in this file - confirm against the search backend.
    score: float
    # Text associated with the hit (required).
    text: str
    # Full metadata record attached to the hit (required).
    metadata: DocumentMetadata
class Document(BaseModel):
    """Full document with metadata."""

    # Document identifier (required).
    id: str
    # Complete text of the document (required).
    full_text: str
    # Metadata record for the document (required).
    metadata: DocumentMetadata
    # Chunk IDs; defaults to a new empty list per instance, and None is
    # also accepted because the annotation is Optional.
    chunks: Optional[List[str]] = Field(
        description="List of chunk IDs if document was chunked",
        default_factory=list,
    )