"""SQLAlchemy models for documents and chunks."""
from datetime import datetime
from typing import Any
from uuid import UUID
from pgvector.sqlalchemy import Vector
from sqlalchemy import DateTime, ForeignKey, Integer, String, Text, func
from sqlalchemy.dialects.postgresql import JSONB, UUID as PGUUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
class Base(DeclarativeBase):
    """Declarative base that the ORM models below inherit from."""
class Document(Base):
    """ORM model for a row of the ``documents`` table (an ingested file).

    A row stores descriptive fields for the file (name, source path, type,
    size, content hash), the collection it is filed under, a JSONB metadata
    payload and created/updated timestamps. Related :class:`Chunk` rows are
    available through :attr:`chunks`.
    """

    __tablename__ = "documents"

    # Primary key; the value is generated by the database (gen_random_uuid()).
    id: Mapped[UUID] = mapped_column(
        PGUUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )

    # Required descriptive fields.
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    file_type: Mapped[str] = mapped_column(String(20), nullable=False)

    # Optional size of the file (unit is not stated here; presumably bytes).
    file_size: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Python-side default of 0. Nothing in this class keeps it in sync with
    # ``chunks``; presumably the ingestion code maintains it -- TODO confirm.
    chunk_count: Mapped[int] = mapped_column(Integer, default=0)

    # Indexed so documents can be filtered by collection efficiently.
    collection: Mapped[str] = mapped_column(String(50), default="default", index=True)

    # Optional, up to 64 characters (wide enough for a SHA-256 hex digest;
    # the actual hashing scheme is not visible here -- confirm with caller).
    content_hash: Mapped[str | None] = mapped_column(String(64), nullable=True)

    # The column is called "metadata", but the Python attribute cannot be,
    # because ``metadata`` is reserved on declarative classes.
    doc_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, default=dict
    )

    # Set by the database when the row is inserted.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )
    # Set on insert by the database; ``onupdate`` makes SQLAlchemy include
    # now() in the UPDATE statements it emits for this row.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
    )

    # "all, delete-orphan": chunks follow the document's lifecycle and are
    # deleted when removed from this collection.
    # passive_deletes=True leaves chunks that are not loaded in the session to
    # the ON DELETE CASCADE declared on Chunk.document_id, instead of
    # SELECTing every chunk (embeddings included) only to delete it row by row.
    # NOTE(review): this relies on the deployed schema really carrying that
    # ON DELETE CASCADE constraint -- confirm migrations match the model.
    chunks: Mapped[list["Chunk"]] = relationship(
        "Chunk",
        back_populates="document",
        cascade="all, delete-orphan",
        passive_deletes=True,
    )

    def __repr__(self) -> str:
        """Return a short debug representation with id, name and file type."""
        return f"<Document(id={self.id}, name='{self.name}', type='{self.file_type}')>"
class Chunk(Base):
    """ORM model for a row of the ``chunks`` table (text plus embedding).

    Every chunk belongs to exactly one :class:`Document` through
    ``document_id`` and stores its text content, an optional 768-dimension
    embedding vector, positional/size fields and a JSONB metadata payload.
    """

    __tablename__ = "chunks"

    # Primary key; the value is generated by the database (gen_random_uuid()).
    id: Mapped[UUID] = mapped_column(
        PGUUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )

    # Owning document. ON DELETE CASCADE removes the chunk with its document.
    # Indexed explicitly: PostgreSQL does not create an index on the
    # referencing side of a foreign key, and both per-document lookups and
    # cascaded deletes would otherwise scan the whole table.
    # NOTE(review): existing databases need a migration to gain this index.
    document_id: Mapped[UUID] = mapped_column(
        PGUUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Text of the chunk (required).
    content: Mapped[str] = mapped_column(Text, nullable=False)

    # pgvector column fixed at 768 dimensions; nullable, so a chunk can be
    # stored before (or without) an embedding.
    embedding: Mapped[list[float] | None] = mapped_column(Vector(768), nullable=True)

    # Required integer index (presumably the chunk's position within its
    # document -- confirm against the ingestion code).
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)

    # Optional fields.
    page_number: Mapped[int | None] = mapped_column(Integer, nullable=True)
    token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # The column is called "metadata", but the Python attribute cannot be,
    # because ``metadata`` is reserved on declarative classes.
    doc_metadata: Mapped[dict[str, Any]] = mapped_column(
        "metadata", JSONB, default=dict
    )

    # Set by the database when the row is inserted.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )

    # Many-to-one side of Document.chunks.
    document: Mapped["Document"] = relationship("Document", back_populates="chunks")

    def __repr__(self) -> str:
        """Return a short debug representation with id, document id and index."""
        return f"<Chunk(id={self.id}, doc_id={self.document_id}, index={self.chunk_index})>"