chunker.py
"""Abstract base class for text chunkers.""" import logging from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List from ..models import Chunk if TYPE_CHECKING: from ..parsers.parser import PageContent logger = logging.getLogger(__name__) @dataclass class ChunkResult: """Result of chunking a markdown document.""" chunks: List[Chunk] metadata: Dict[str, Any] class Chunker(ABC): """Abstract base class for text chunkers.""" def __init__(self, cache_dir: str = None, min_chunk_size: int = 0): """Initialize the chunker with optional cache directory and minimum chunk size. Args: cache_dir: Directory to cache chunked results. min_chunk_size: Minimum size for chunks (0 = disabled). """ self.cache_dir = cache_dir self.min_chunk_size = min_chunk_size @abstractmethod def chunk(self, markdown_content: str, metadata: Dict[str, Any]) -> List[Chunk]: """Chunk markdown content into smaller pieces. Args: markdown_content: Markdown text to chunk. metadata: Document metadata. Returns: List of Chunk objects. """ pass def chunk_pages(self, pages: List["PageContent"], metadata: Dict[str, Any]) -> List[Chunk]: """Chunk page-aware content into smaller pieces. Default implementation combines pages and uses regular chunking. Subclasses can override for page-aware chunking strategies. Args: pages: List of PageContent objects with per-page content. metadata: Document-level metadata. Returns: List of Chunk objects with page metadata preserved. """ # Default implementation: combine pages with page headers combined_parts = [] for page in pages: combined_parts.append(f"# Page {page.page_number}\n") combined_parts.append(page.markdown_content) combined_markdown = "\n\n".join(combined_parts) # Use regular chunking on combined content chunks = self.chunk(combined_markdown, metadata) # Try to enrich chunks with page information based on page headers for chunk in chunks: # Look for page header in chunk text import re page_match = re.search(r"# Page (\d+)", chunk.text) if page_match: chunk.metadata["page_number"] = int(page_match.group(1)) # Apply minimum chunk size filtering chunks = self._filter_small_chunks(chunks) return chunks def _filter_small_chunks(self, chunks: List[Chunk]) -> List[Chunk]: """Filter out chunks smaller than min_chunk_size. Args: chunks: List of chunks to filter. Returns: List of chunks that meet the minimum size requirement. """ if self.min_chunk_size <= 0: return chunks filtered_chunks = [] filtered_count = 0 for chunk in chunks: chunk_text = chunk.text.strip() if len(chunk_text) >= self.min_chunk_size: filtered_chunks.append(chunk) else: filtered_count += 1 if filtered_count > 0: logger.info(f"Filtered out {filtered_count} chunks smaller than {self.min_chunk_size} characters") return filtered_chunks
