Archive Agent

pdf_pymupdf.py•3.21 KiB

# Copyright © 2025 Dr.-Ing. Paul Wilhelm <paul@wilhelm.dev>
# This file is part of Archive Agent. See LICENSE for details.

from typing import List, Iterator, Dict

# noinspection PyPackageRequirements
import fitz

from archive_agent.data.loader.PdfDocument import PdfDocument, PdfPage


class PyMuPdfDocument(PdfDocument):
    """
    PyMuPDF implementation of PDF document interface.

    The underlying PyMuPDF document is opened in the constructor. Call
    :meth:`close` (or use the instance as a context manager) to release it
    deterministically once all pages have been processed.
    """

    def __init__(self, file_path: str):
        """
        Initialize PyMuPDF document.
        :param file_path: Path to PDF file.
        """
        self._doc: fitz.Document = fitz.open(file_path)

    def __iter__(self) -> Iterator[PdfPage]:
        """
        Iterate over pages in the document.
        :return: Iterator of PDF pages.
        """
        for page in self._doc:
            yield PyMuPdfPage(page)

    def close(self) -> None:
        """
        Close the underlying PyMuPDF document.
        Safe to call more than once; later calls are no-ops.
        NOTE(review): pages obtained before closing presumably must not be
        used afterwards — confirm against callers.
        """
        # Guard on `is_closed` so that a repeated close never raises.
        if not self._doc.is_closed:
            self._doc.close()

    def __enter__(self) -> "PyMuPdfDocument":
        """
        Enter a `with` block.
        :return: This document.
        """
        return self

    def __exit__(self, *exc_info: object) -> None:
        """
        Leave a `with` block, closing the document. Exceptions are not suppressed.
        :param exc_info: Exception type, value and traceback (unused).
        """
        self.close()


class PyMuPdfPage(PdfPage):
    """
    PyMuPDF implementation of PDF page interface.
    """

    def __init__(self, page: fitz.Page):
        """
        Initialize PyMuPDF page.
        :param page: PyMuPDF page object.
        """
        self._page: fitz.Page = page

    def get_text(self) -> str:
        """
        Extract text content from the page.
        :return: Text content, with leading and trailing whitespace stripped.
        """
        return self._page.get_text("text").strip()  # type: ignore

    def get_image_bytes(self) -> List[bytes]:
        """
        Extract image bytes from the page.
        Only blocks of type 1 (image blocks) that carry a non-empty
        "image" entry are included.
        :return: List of image bytes.
        """
        blocks = self._page.get_text("dict")["blocks"]  # type: ignore
        return [
            block["image"]
            for block in blocks
            if block.get("type", "other") == 1 and block.get("image")
        ]

    def get_counts(self) -> Dict[str, int]:
        """
        Get counts of different block types for logging.
        :return: Dictionary with keys: text_blocks, image_blocks, vector_blocks, background_images.
        """
        blocks = self._page.get_text("dict")["blocks"]  # type: ignore
        image_objects = self._page.get_images(full=True)

        # Blocks without a "type" entry fall back to "other" and match no counter.
        block_types = [block.get("type", "other") for block in blocks]
        text_blocks = block_types.count(0)
        image_blocks = block_types.count(1)
        vector_blocks = block_types.count(2)

        # Image objects reported for the page that are not accounted for by
        # an image block.
        # NOTE(review): this difference is not clamped — confirm it can never
        # become negative for real-world documents.
        background_images = len(image_objects) - image_blocks

        return {
            "text_blocks": text_blocks,
            "image_blocks": image_blocks,
            "vector_blocks": vector_blocks,
            "background_images": background_images,
        }

    def get_pixmap(self, dpi: int) -> bytes:
        """
        Render page as pixmap and return bytes.
        :param dpi: DPI for rendering.
        :return: Pixmap bytes.
        """
        return self._page.get_pixmap(dpi=dpi).tobytes()  # type: ignore


def create_pdf_document(file_path: str) -> PdfDocument:
    """
    Factory function to create a PDF document instance.
    :param file_path: Path to PDF file.
    :return: PDF document instance.
    """
    return PyMuPdfDocument(file_path)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shredEngineer/Archive-Agent'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf_pymupdf.py•3.21 KiB