MCP PDF

Overview Schema Related Servers Score Discussions

table_extraction.py•7 KiB

""" Table Extraction Mixin - PDF table detection and extraction capabilities """ import time import logging from pathlib import Path from typing import Dict, Any, List, Optional # PDF processing libraries import camelot import tabula import pdfplumber import pandas as pd from .base import MCPMixin, mcp_tool from ..security import validate_pdf_path, parse_pages_parameter, sanitize_error_message logger = logging.getLogger(__name__) class TableExtractionMixin(MCPMixin): """ Handles all PDF table extraction operations with intelligent fallbacks. Tools provided: - extract_tables: Multi-method table extraction with automatic fallbacks """ def get_mixin_name(self) -> str: return "TableExtraction" def get_required_permissions(self) -> List[str]: return ["read_files", "table_processing"] def _setup(self): """Initialize table extraction specific configuration""" self.table_accuracy_threshold = 0.8 self.max_tables_per_page = 10 @mcp_tool( name="extract_tables", description="Extract tables from PDF with automatic method selection and intelligent fallbacks" ) async def extract_tables( self, pdf_path: str, pages: Optional[str] = None, method: str = "auto", table_format: str = "json" ) -> Dict[str, Any]: """ Extract tables from PDF using various methods with automatic fallbacks. Args: pdf_path: Path to PDF file or URL pages: Page specification (e.g., "1-5,10,15-20" or "all") method: Extraction method ("auto", "camelot", "tabula", "pdfplumber") table_format: Output format ("json", "csv", "markdown") Returns: Dictionary containing extracted tables and metadata """ start_time = time.time() try: # Validate inputs using centralized security functions path = await validate_pdf_path(pdf_path) parsed_pages = parse_pages_parameter(pages) all_tables = [] methods_tried = [] # Auto method: try methods in order until we find tables if method == "auto": for try_method in ["camelot", "pdfplumber", "tabula"]: methods_tried.append(try_method) if try_method == "camelot": tables = self._extract_tables_camelot(path, parsed_pages) elif try_method == "pdfplumber": tables = self._extract_tables_pdfplumber(path, parsed_pages) elif try_method == "tabula": tables = self._extract_tables_tabula(path, parsed_pages) if tables: method = try_method all_tables = tables break else: # Use specific method methods_tried.append(method) if method == "camelot": all_tables = self._extract_tables_camelot(path, parsed_pages) elif method == "pdfplumber": all_tables = self._extract_tables_pdfplumber(path, parsed_pages) elif method == "tabula": all_tables = self._extract_tables_tabula(path, parsed_pages) else: raise ValueError(f"Unknown table extraction method: {method}") # Format tables based on output format formatted_tables = [] for i, df in enumerate(all_tables): if table_format == "json": formatted_tables.append({ "table_index": i, "data": df.to_dict(orient="records"), "shape": {"rows": len(df), "columns": len(df.columns)} }) elif table_format == "csv": formatted_tables.append({ "table_index": i, "data": df.to_csv(index=False), "shape": {"rows": len(df), "columns": len(df.columns)} }) elif table_format == "markdown": formatted_tables.append({ "table_index": i, "data": df.to_markdown(index=False), "shape": {"rows": len(df), "columns": len(df.columns)} }) return { "success": True, "tables": formatted_tables, "total_tables": len(formatted_tables), "method_used": method, "methods_tried": methods_tried, "pages_searched": pages or "all", "processing_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"Table extraction failed: {error_msg}") return { "success": False, "error": error_msg, "methods_tried": methods_tried, "processing_time": round(time.time() - start_time, 2) } # Private helper methods (all synchronous for proper async pattern) def _extract_tables_camelot(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Camelot""" page_str = ','.join(map(str, [p+1 for p in pages])) if pages else 'all' # Try lattice mode first (for bordered tables) try: tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='lattice') if len(tables) > 0: return [table.df for table in tables] except Exception: pass # Fall back to stream mode (for borderless tables) try: tables = camelot.read_pdf(str(pdf_path), pages=page_str, flavor='stream') return [table.df for table in tables] except Exception: return [] def _extract_tables_tabula(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using Tabula""" page_list = [p+1 for p in pages] if pages else 'all' try: tables = tabula.read_pdf(str(pdf_path), pages=page_list, multiple_tables=True) return tables except Exception: return [] def _extract_tables_pdfplumber(self, pdf_path: Path, pages: Optional[List[int]] = None) -> List[pd.DataFrame]: """Extract tables using pdfplumber""" tables = [] with pdfplumber.open(str(pdf_path)) as pdf: page_range = pages if pages else range(len(pdf.pages)) for page_num in page_range: page = pdf.pages[page_num] page_tables = page.extract_tables() for table in page_tables: if table and len(table) > 1: # Skip empty tables df = pd.DataFrame(table[1:], columns=table[0]) tables.append(df) return tables

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rsp2k/mcp-pdf'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

table_extraction.py•7 KiB