Skip to main content
Glama
juanqui
by juanqui
chunker_unstructured.py3.42 kB
"""Chunker using the unstructured library for markdown text chunking.""" import logging from datetime import datetime, timezone from typing import Any, Dict, List from ..models import Chunk from .chunker import Chunker logger = logging.getLogger(__name__) class ChunkerUnstructured(Chunker): """Chunker using the unstructured library for markdown text chunking.""" def __init__(self, cache_dir: str = None, min_chunk_size: int = 0): """Initialize the unstructured chunker with zero configuration. Args: cache_dir: Directory to cache chunked results (not used in this implementation). min_chunk_size: Minimum size for chunks (0 = disabled). """ super().__init__(cache_dir=cache_dir, min_chunk_size=min_chunk_size) try: import unstructured from unstructured.partition.text import partition_text self.partition_text = partition_text # Handle cases where __version__ might not exist self.unstructured_version = getattr(unstructured, "__version__", "unknown") except ImportError: raise ImportError( "Unstructured library not available. Install with: pip install pdfkb-mcp[unstructured_chunker]" ) def chunk(self, markdown_content: str, metadata: Dict[str, Any]) -> List[Chunk]: """Chunk markdown content using unstructured library. Args: markdown_content: Markdown text to chunk. metadata: Document metadata. Returns: List of Chunk objects. 
""" try: if not markdown_content or not markdown_content.strip(): logger.warning("Empty markdown content provided to chunker") return [] # Partition text using unstructured with built-in chunking elements = self.partition_text( text=markdown_content, chunking_strategy="by_title", max_characters=1000, new_after_n_chars=800, combine_text_under_n_chars=150, ) # Convert elements to Chunk objects chunks = [] for i, element in enumerate(elements): chunk_text = str(element).strip() if not chunk_text: # Skip empty chunks continue # Create metadata for this chunk chunk_metadata = { "chunk_strategy": "unstructured_by_title", "element_type": element.__class__.__name__, "unstructured_version": self.unstructured_version, "max_characters": 1000, "created_at": datetime.now(timezone.utc).isoformat(), } # Add any provided metadata chunk_metadata.update(metadata) chunk = Chunk(text=chunk_text, chunk_index=i, metadata=chunk_metadata) chunks.append(chunk) # Apply minimum chunk size filtering chunks = self._filter_small_chunks(chunks) logger.info(f"Created {len(chunks)} chunks using unstructured library") return chunks except Exception as e: logger.error(f"Failed to chunk markdown content with unstructured: {e}") raise RuntimeError(f"Failed to chunk text: {e}") from e

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.