parser_unstructured.py
"""PDF parser using the Unstructured library.""" import logging from pathlib import Path from typing import Any, Dict, List from .parser import DocumentParser, PageContent, ParseResult logger = logging.getLogger(__name__) class UnstructuredPDFParser(DocumentParser): """PDF parser using the Unstructured library.""" def __init__(self, strategy: str = "fast", cache_dir: Path = None): """Initialize the Unstructured parser. Args: strategy: PDF processing strategy ("fast" or "hi_res"). cache_dir: Directory to cache parsed markdown files. """ super().__init__(cache_dir) self.strategy = strategy async def parse(self, file_path: Path) -> ParseResult: """Parse a PDF file using Unstructured library. Args: file_path: Path to the PDF file. Returns: ParseResult with markdown content and metadata. """ try: # Check cache first cache_path = None if self.cache_dir: cache_path = self._get_cache_path(file_path) if self._is_cache_valid(file_path, cache_path): logger.debug(f"Loading parsed content from cache: {cache_path}") markdown_content = self._load_from_cache(cache_path) metadata = self._load_metadata_from_cache(cache_path) if markdown_content is not None and metadata: # Create page-aware result # TODO: Implement proper page extraction for this parser pages = [PageContent(page_number=1, markdown_content=markdown_content, metadata={})] return ParseResult(pages=pages, metadata=metadata) from unstructured.partition.pdf import partition_pdf logger.debug(f"Partitioning PDF with Unstructured using '{self.strategy}' strategy: {file_path}") # Partition the PDF with enhanced options including strategy elements = partition_pdf( filename=str(file_path), strategy=self.strategy, # Use configured strategy ("fast" or "hi_res") extract_images_in_pdf=False, # Skip image extraction for performance infer_table_structure=True, # Extract table structure chunking_strategy="by_title", # Group by document structure max_characters=1000, # Default value, will be overridden by config new_after_n_chars=800, # Default value, will be overridden by config combine_text_under_n_chars=100, # Combine small elements ) # Convert elements to markdown markdown_content = self._elements_to_markdown(elements) # Extract metadata from elements metadata = self._extract_metadata_from_elements(elements) # Add processing information metadata["processing_timestamp"] = "N/A" # Will be set by PDFProcessor metadata["processor_version"] = "unstructured" metadata["source_filename"] = file_path.name metadata["source_directory"] = str(file_path.parent) # Save to cache if enabled if cache_path: logger.debug(f"Saving parsed content to cache: {cache_path}") self._save_to_cache(cache_path, markdown_content) self._save_metadata_to_cache(cache_path, metadata) logger.debug(f"Extracted {len(elements)} elements from PDF using Unstructured") # Create page-aware result # TODO: Implement proper page extraction for this parser pages = [PageContent(page_number=1, markdown_content=markdown_content, metadata={})] return ParseResult(pages=pages, metadata=metadata) except ImportError: raise ImportError("Unstructured library not available. Install with: pip install unstructured[pdf]") except Exception as e: raise RuntimeError(f"Failed to parse PDF with Unstructured: {e}") from e def _elements_to_markdown(self, elements: List[Any]) -> str: """Convert Unstructured elements to Markdown format. Args: elements: List of Unstructured elements. Returns: Markdown formatted text. 
""" markdown_lines = [] for element in elements: element_text = str(element).strip() if not element_text: continue element_type = type(element).__name__ # Format based on element type if element_type in ["Title", "Header"]: # Add as headers header_level = min(len(element_text.split()), 6) # Max 6 levels markdown_lines.append(f"{'#' * header_level} {element_text}") elif element_type == "Table": # Format tables markdown_lines.append(f"[TABLE]\n{element_text}\n[/TABLE]") elif element_type == "ListItem": # Format list items markdown_lines.append(f"- {element_text}") else: # Regular text markdown_lines.append(element_text) return "\n\n".join(markdown_lines) def _extract_metadata_from_elements(self, elements: List[Any]) -> Dict[str, Any]: """Extract metadata from Unstructured elements. Args: elements: List of Unstructured elements. Returns: Dictionary of extracted metadata. """ metadata = {} # Count pages and element types pages = set() element_types = {} for element in elements: # Track page numbers if hasattr(element, "metadata") and element.metadata: # Handle both dict and ElementMetadata object try: if hasattr(element.metadata, "page_number"): pages.add(element.metadata.page_number) elif isinstance(element.metadata, dict) and "page_number" in element.metadata: pages.add(element.metadata["page_number"]) except (AttributeError, KeyError, TypeError): pass # Extract document-level metadata from first element if not metadata: # Only extract once element_meta = element.metadata try: # Try attribute access first (ElementMetadata object) if hasattr(element_meta, "filename"): metadata["source_filename"] = element_meta.filename elif isinstance(element_meta, dict) and "filename" in element_meta: metadata["source_filename"] = element_meta["filename"] if hasattr(element_meta, "file_directory"): metadata["source_directory"] = element_meta.file_directory elif isinstance(element_meta, dict) and "file_directory" in element_meta: metadata["source_directory"] = element_meta["file_directory"] except (AttributeError, KeyError, TypeError): pass # Count element types element_type = str(type(element).__name__) element_types[element_type] = element_types.get(element_type, 0) + 1 metadata["page_count"] = len(pages) if pages else 1 metadata["element_types"] = element_types metadata["total_elements"] = len(elements) return metadata
