Skip to main content
Glama
DocumentContent.py (3.98 kB)
# Copyright © 2025 Dr.-Ing. Paul Wilhelm <paul@wilhelm.dev>
# This file is part of Archive Agent. See LICENSE for details.

import json
from dataclasses import dataclass, field
from typing import Optional, List

from archive_agent.util.text_util import splitlines_exact


ReferenceList = List[int]


@dataclass
class DocumentContent:
    """
    Document content with pages (for page-based documents, e.g. `.pdf`)
    or lines (for line-based documents, e.g. `.txt`) info.

    Prefer the `from_lines()` / `from_text()` constructors: they populate the
    content and call `validate()`. The plain dataclass constructor leaves
    `text` / `lines` unset until one of them is assigned.
    """

    # Both `text` and `lines` are automatically synced whenever one of them is set.
    _text: str = field(init=False, repr=False)
    _lines: List[str] = field(init=False, repr=False)

    lines_per_line: Optional[ReferenceList] = None  # Line-based: Absolute line number for each line of `text`.
    pages_per_line: Optional[ReferenceList] = None  # Page-based: Absolute page number for each line of `text`.

    @property
    def text(self) -> str:
        """
        Content as a single string.
        """
        return self._text

    @text.setter
    def text(self, text: str) -> None:
        """
        Set the content from a string; `lines` is re-derived via `splitlines_exact()`.
        """
        self._text = text
        self._lines = splitlines_exact(text)

    @property
    def lines(self) -> List[str]:
        """
        Content as a list of lines.
        """
        return self._lines

    @lines.setter
    def lines(self, lines: List[str]) -> None:
        """
        Set the content from a list of lines; `text` becomes the lines joined with newlines.
        """
        # Store a copy: if the caller's list were kept and mutated later,
        # `_lines` would silently drift away from `_text`.
        self._lines = list(lines)
        self._text = "\n".join(self._lines)

    def validate(self) -> None:
        """
        Check that exactly one reference list is set and that it has one entry per text line.
        :raises ValueError: If neither or both of `lines_per_line` / `pages_per_line` are set,
            or if the length of the one that is set differs from the number of text lines.
        """
        if self.lines_per_line is None and self.pages_per_line is None:
            raise ValueError("Require exactly one of `pages_per_line` or `lines_per_line`")

        if self.lines_per_line is not None and self.pages_per_line is not None:
            raise ValueError("Only one of `pages_per_line` or `lines_per_line` can be set")

        # Line-based: Each (relative) text line must correspond to *some* (absolute) document line…
        if self.lines_per_line is not None and len(self.lines_per_line) != len(self.lines):
            raise ValueError(
                f"`lines_per_line` length must match text lines:\n"
                f"text={json.dumps(self.lines, indent=2, default=str)}\n"
                f"lines_per_line={json.dumps(self.lines_per_line, indent=2, default=str)}\n"
            )

        # Page-based: Each text line must have been extracted from *some* document page…
        if self.pages_per_line is not None and len(self.pages_per_line) != len(self.lines):
            raise ValueError(
                f"`pages_per_line` length must match text lines:\n"
                f"text={json.dumps(self.lines, indent=2, default=str)}\n"
                f"pages_per_line={json.dumps(self.pages_per_line, indent=2, default=str)}\n"
            )

    @classmethod
    def from_lines(
            cls,
            lines: List[str],
            *,
            lines_per_line: Optional[ReferenceList] = None,
            pages_per_line: Optional[ReferenceList] = None,
    ) -> "DocumentContent":
        """
        Create validated document content from a list of lines.
        :param lines: Text lines.
        :param lines_per_line: Line-based references (one per text line), or None.
        :param pages_per_line: Page-based references (one per text line), or None.
        :return: Document content.
        :raises ValueError: If `validate()` rejects the references.
        """
        obj = cls(lines_per_line=lines_per_line, pages_per_line=pages_per_line)
        obj.lines = lines  # Uses setter: updates _text too
        obj.validate()
        return obj

    @classmethod
    def from_text(
            cls,
            text: str,
            *,
            lines_per_line: Optional[ReferenceList] = None,
            pages_per_line: Optional[ReferenceList] = None,
    ) -> "DocumentContent":
        """
        Create validated document content from a single string.
        :param text: Text.
        :param lines_per_line: Line-based references (one per text line), or None.
        :param pages_per_line: Page-based references (one per text line), or None.
        :return: Document content.
        :raises ValueError: If `validate()` rejects the references.
        """
        obj = cls(lines_per_line=lines_per_line, pages_per_line=pages_per_line)
        obj.text = text  # Uses setter: updates _lines too
        obj.validate()
        return obj

    def strip_lines(self) -> None:
        """
        Remove whitespace on each line.
        """
        # Assigning through the `lines` setter keeps `text` in sync.
        self.lines = [line.strip() for line in self.lines]

    def get_per_line_references(self) -> ReferenceList:
        """
        Get per-line page or line references.
        :return: `pages_per_line` if set, otherwise `lines_per_line`.
        :raises AssertionError: If neither reference list is set.
        """
        is_page_based = self.pages_per_line is not None

        if is_page_based:
            per_line_references = self.pages_per_line
        else:
            per_line_references = self.lines_per_line

        # Explicit raise instead of `assert`: assertions are stripped under `python -O`,
        # which would let `None` leak out of a method typed to return a list.
        if per_line_references is None:
            raise AssertionError("Missing references (WTF, please report)")

        return per_line_references

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shredEngineer/Archive-Agent'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.