chunker_page.py
"""Page-based chunker that creates chunks on page boundaries.""" import re from typing import Any, Dict, List, Optional from pdfkb.chunker.chunker import Chunker from pdfkb.models import Chunk from pdfkb.parsers.parser import PageContent class PageChunker(Chunker): """Chunker that creates chunks based on page boundaries. This chunker is designed to work with page-aware parsers that output PageContent objects. Each page becomes a separate chunk, preserving the natural document structure. """ def __init__( self, min_chunk_size: Optional[int] = None, max_chunk_size: Optional[int] = None, merge_small: bool = True, global_min_chunk_size: int = 0, cache_dir: str = None, ): """Initialize the page chunker. Args: min_chunk_size: Minimum size for a chunk. Small pages may be merged. max_chunk_size: Maximum size for a chunk. Large pages may be split. merge_small: Whether to merge small consecutive pages. global_min_chunk_size: Global minimum chunk size (filters out unmergeable small chunks). cache_dir: Optional cache directory. """ super().__init__(cache_dir=cache_dir, min_chunk_size=global_min_chunk_size) self.page_min_chunk_size = min_chunk_size # Rename to avoid confusion self.max_chunk_size = max_chunk_size self.merge_small = merge_small def chunk(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> List[Chunk]: """Fallback method for non-page-aware content. This method is provided for compatibility but should not typically be used. PageChunker is designed to work with page-aware content via chunk_pages. Args: text: The text to chunk (entire document as string) metadata: Optional metadata for the document Returns: Single chunk containing the entire text """ chunks = [ Chunk( text=text, metadata={ **(metadata or {}), "chunk_strategy": "page", "chunk_index": 0, "total_chunks": 1, "warning": "PageChunker used without page-aware content", }, chunk_index=0, document_id="", ) ] # Apply global minimum chunk size filtering return self._filter_small_chunks(chunks) def chunk_pages(self, pages: List[PageContent], metadata: Optional[Dict[str, Any]] = None) -> List[Chunk]: """Chunk page-aware content by creating one chunk per page. Args: pages: List of PageContent objects from a page-aware parser metadata: Optional document-level metadata Returns: List of chunks, typically one per page (may merge small pages) """ if not pages: return [] chunks = [] current_chunk_text = "" current_chunk_pages = [] for page in pages: page_text = page.markdown_content # Remove page boundary marker from the beginning of the content # This handles markers like --[PAGE: 1]-- at the start of the page page_marker_pattern = r"^--\[PAGE:\s*\d+\]--\s*\n?" page_text = re.sub(page_marker_pattern, "", page_text, count=1) # Also check if page has a marker in metadata and clean it if page.metadata.get("has_page_marker") and page.metadata.get("page_marker"): marker = page.metadata["page_marker"] if page_text.startswith(marker): page_text = page_text[len(marker) :].lstrip() page_size = len(page_text) # Check if we should merge this page with the current chunk should_merge = False if self.merge_small and self.page_min_chunk_size and current_chunk_text: current_size = len(current_chunk_text) combined_size = current_size + page_size + 2 # +2 for "\n\n" # Only merge if: # 1. Current chunk is below minimum size AND # 2. The page itself is also small (below minimum) AND # 3. 
                # 3. Combined size won't exceed max (if set)
                if current_size < self.page_min_chunk_size and page_size < self.page_min_chunk_size:
                    if not self.max_chunk_size or combined_size <= self.max_chunk_size:
                        should_merge = True

            if should_merge:
                # Merge with current chunk
                if page_text.strip():  # Only add if there's actual content
                    current_chunk_text += "\n\n" + page_text
                    current_chunk_pages.append(page.page_number)
            else:
                # Finalize current chunk if it exists
                if current_chunk_text:
                    chunks.append(self._create_chunk(current_chunk_text, current_chunk_pages, len(chunks), metadata))

                # Start new chunk with current page
                current_chunk_text = page_text
                current_chunk_pages = [page.page_number]

        # Don't forget the last chunk
        if current_chunk_text:
            chunks.append(self._create_chunk(current_chunk_text, current_chunk_pages, len(chunks), metadata))

        # Handle large chunks if max_chunk_size is set
        if self.max_chunk_size:
            chunks = self._split_large_chunks(chunks)

        # Apply global minimum chunk size filtering
        chunks = self._filter_small_chunks(chunks)

        # Update total chunks count
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            chunk.metadata["total_chunks"] = total_chunks
            chunk.chunk_index = i

        return chunks

    def _create_chunk(
        self, text: str, page_numbers: List[int], chunk_index: int, metadata: Optional[Dict[str, Any]] = None
    ) -> Chunk:
        """Create a chunk with appropriate metadata.

        Args:
            text: The chunk text
            page_numbers: List of page numbers included in this chunk
            chunk_index: Index of this chunk
            metadata: Optional document-level metadata

        Returns:
            A Chunk object with page metadata
        """
        chunk_metadata = {
            **(metadata or {}),
            "chunk_strategy": "page",
            "chunk_index": chunk_index,
            "page_numbers": page_numbers,
            "page_count": len(page_numbers),
        }

        # Add specific page metadata
        if len(page_numbers) == 1:
            chunk_metadata["page_number"] = page_numbers[0]
            chunk_metadata["single_page"] = True
        else:
            chunk_metadata["page_range"] = f"{page_numbers[0]}-{page_numbers[-1]}"
            chunk_metadata["merged_pages"] = True

        return Chunk(
            text=text,
            metadata=chunk_metadata,
            chunk_index=chunk_index,
            document_id="",
        )

    def _split_large_chunks(self, chunks: List[Chunk]) -> List[Chunk]:
        """Split chunks that exceed max_chunk_size.

        Args:
            chunks: List of chunks to potentially split

        Returns:
            List of chunks with large ones split
        """
        if not self.max_chunk_size:
            return chunks

        result = []
        for chunk in chunks:
            if len(chunk.text) <= self.max_chunk_size:
                result.append(chunk)
            else:
                # Split by sentences or paragraphs
                parts = self._split_text(chunk.text, self.max_chunk_size)
                for i, part in enumerate(parts):
                    new_chunk = Chunk(
                        text=part,
                        metadata={
                            **chunk.metadata,
                            "split_from_large_page": True,
                            "split_part": i + 1,
                            "split_total": len(parts),
                        },
                        chunk_index=0,  # Will be updated later
                        document_id=chunk.document_id,
                    )
                    result.append(new_chunk)

        return result

    def _split_text(self, text: str, max_size: int) -> List[str]:
        """Split text into parts not exceeding max_size.

        Tries to split on paragraph boundaries first, then sentences.

        Args:
            text: Text to split
            max_size: Maximum size for each part

        Returns:
            List of text parts
        """
        # Try splitting by double newlines (paragraphs)
        paragraphs = text.split("\n\n")
        parts = []
        current_part = ""

        for para in paragraphs:
            if len(current_part) + len(para) + 2 <= max_size:
                if current_part:
                    current_part += "\n\n" + para
                else:
                    current_part = para
            else:
                if current_part:
                    parts.append(current_part)

                # If paragraph itself is too large, split it further
                if len(para) > max_size:
                    # Simple sentence split (crude but functional)
", ".|").split("|") para_part = "" for sent in sentences: if len(para_part) + len(sent) <= max_size: para_part += sent else: if para_part: parts.append(para_part) para_part = sent if para_part: parts.append(para_part) else: current_part = para if current_part: parts.append(current_part) return parts
