MCP PDF

Overview Schema Related Servers Score Discussions

mcp-pdf
src
mcp_pdf
mixins_official

document_assembly.py•14.9 KiB

""" Document Assembly Mixin - PDF merging, splitting, and page manipulation Uses official fastmcp.contrib.mcp_mixin pattern """ import asyncio import time import json from pathlib import Path from typing import Dict, Any, Optional, List import logging # PDF processing libraries import fitz # PyMuPDF # Official FastMCP mixin from fastmcp.contrib.mcp_mixin import MCPMixin, mcp_tool from ..security import validate_pdf_path, validate_output_path, sanitize_error_message logger = logging.getLogger(__name__) class DocumentAssemblyMixin(MCPMixin): """ Handles PDF document assembly operations including merging, splitting, and reordering. Uses the official FastMCP mixin pattern. """ def __init__(self): super().__init__() self.max_file_size = 100 * 1024 * 1024 # 100MB @mcp_tool( name="merge_pdfs", description="Merge multiple PDFs into one document" ) async def merge_pdfs( self, pdf_paths: str, output_path: str ) -> Dict[str, Any]: """ Merge multiple PDF files into a single document. Args: pdf_paths: JSON string containing list of PDF file paths output_path: Path where merged PDF will be saved Returns: Dictionary containing merge results """ start_time = time.time() try: # Parse input paths try: paths_list = json.loads(pdf_paths) except json.JSONDecodeError as e: return { "success": False, "error": f"Invalid JSON in pdf_paths: {e}", "merge_time": round(time.time() - start_time, 2) } if not isinstance(paths_list, list) or len(paths_list) < 2: return { "success": False, "error": "At least 2 PDF paths required for merging", "merge_time": round(time.time() - start_time, 2) } # Validate output path output_pdf_path = validate_output_path(output_path) # Validate and open all input PDFs input_docs = [] file_info = [] for i, pdf_path in enumerate(paths_list): try: validated_path = await validate_pdf_path(pdf_path) doc = fitz.open(str(validated_path)) input_docs.append(doc) file_info.append({ "index": i + 1, "path": str(validated_path), "pages": len(doc), "size_bytes": 
validated_path.stat().st_size }) except Exception as e: # Close any already opened docs for opened_doc in input_docs: opened_doc.close() return { "success": False, "error": f"Failed to open PDF {i + 1}: {sanitize_error_message(str(e))}", "merge_time": round(time.time() - start_time, 2) } # Create merged document merged_doc = fitz.open() total_pages_merged = 0 for i, doc in enumerate(input_docs): try: merged_doc.insert_pdf(doc) total_pages_merged += len(doc) logger.info(f"Merged document {i + 1}: {len(doc)} pages") except Exception as e: logger.error(f"Failed to merge document {i + 1}: {e}") # Save merged document merged_doc.save(str(output_pdf_path)) output_size = output_pdf_path.stat().st_size # Close all documents merged_doc.close() for doc in input_docs: doc.close() return { "success": True, "merge_summary": { "input_files": len(paths_list), "total_pages_merged": total_pages_merged, "output_size_bytes": output_size, "output_size_mb": round(output_size / (1024 * 1024), 2) }, "input_files": file_info, "output_info": { "output_path": str(output_pdf_path), "total_pages": total_pages_merged }, "merge_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF merge failed: {error_msg}") return { "success": False, "error": error_msg, "merge_time": round(time.time() - start_time, 2) } @mcp_tool( name="split_pdf", description="Split PDF into separate documents" ) async def split_pdf( self, pdf_path: str, split_method: str = "pages" ) -> Dict[str, Any]: """ Split PDF document into separate files. 
Args: pdf_path: Path to PDF file to split split_method: Method to use ("pages", "bookmarks", "ranges") Returns: Dictionary containing split results """ start_time = time.time() try: # Validate input path input_pdf_path = await validate_pdf_path(pdf_path) doc = fitz.open(str(input_pdf_path)) total_pages = len(doc) if total_pages <= 1: doc.close() return { "success": False, "error": "PDF must have more than 1 page to split", "split_time": round(time.time() - start_time, 2) } split_files = [] base_path = input_pdf_path.parent base_name = input_pdf_path.stem if split_method == "pages": # Split into individual pages for page_num in range(total_pages): output_path = base_path / f"{base_name}_page_{page_num + 1}.pdf" page_doc = fitz.open() page_doc.insert_pdf(doc, from_page=page_num, to_page=page_num) page_doc.save(str(output_path)) page_doc.close() split_files.append({ "file_path": str(output_path), "pages": 1, "page_range": f"{page_num + 1}", "size_bytes": output_path.stat().st_size }) elif split_method == "bookmarks": # Split by bookmarks/table of contents toc = doc.get_toc() if not toc: doc.close() return { "success": False, "error": "No bookmarks found in PDF for bookmark-based splitting", "split_time": round(time.time() - start_time, 2) } # Create splits based on top-level bookmarks top_level_bookmarks = [item for item in toc if item[0] == 1] # Level 1 bookmarks for i, bookmark in enumerate(top_level_bookmarks): start_page = bookmark[2] - 1 # Convert to 0-based # Determine end page if i + 1 < len(top_level_bookmarks): end_page = top_level_bookmarks[i + 1][2] - 2 # Convert to 0-based, inclusive else: end_page = total_pages - 1 if start_page <= end_page: # Clean bookmark title for filename clean_title = "".join(c for c in bookmark[1] if c.isalnum() or c in (' ', '-', '_')).strip() clean_title = clean_title[:50] # Limit length output_path = base_path / f"{base_name}_{clean_title}.pdf" split_doc = fitz.open() split_doc.insert_pdf(doc, from_page=start_page, 
to_page=end_page) split_doc.save(str(output_path)) split_doc.close() split_files.append({ "file_path": str(output_path), "pages": end_page - start_page + 1, "page_range": f"{start_page + 1}-{end_page + 1}", "bookmark_title": bookmark[1], "size_bytes": output_path.stat().st_size }) elif split_method == "ranges": # Split into chunks of 10 pages each chunk_size = 10 chunks = (total_pages + chunk_size - 1) // chunk_size for chunk in range(chunks): start_page = chunk * chunk_size end_page = min(start_page + chunk_size - 1, total_pages - 1) output_path = base_path / f"{base_name}_pages_{start_page + 1}-{end_page + 1}.pdf" chunk_doc = fitz.open() chunk_doc.insert_pdf(doc, from_page=start_page, to_page=end_page) chunk_doc.save(str(output_path)) chunk_doc.close() split_files.append({ "file_path": str(output_path), "pages": end_page - start_page + 1, "page_range": f"{start_page + 1}-{end_page + 1}", "size_bytes": output_path.stat().st_size }) doc.close() total_output_size = sum(f["size_bytes"] for f in split_files) return { "success": True, "split_summary": { "split_method": split_method, "input_pages": total_pages, "output_files": len(split_files), "total_output_size_bytes": total_output_size, "total_output_size_mb": round(total_output_size / (1024 * 1024), 2) }, "split_files": split_files, "input_info": { "input_path": str(input_pdf_path), "total_pages": total_pages }, "split_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF split failed: {error_msg}") return { "success": False, "error": error_msg, "split_time": round(time.time() - start_time, 2) } @mcp_tool( name="reorder_pdf_pages", description="Reorder pages in PDF document" ) async def reorder_pdf_pages( self, pdf_path: str, page_order: str, output_path: str ) -> Dict[str, Any]: """ Reorder pages in a PDF document according to specified order. 
Args: pdf_path: Path to input PDF file page_order: JSON string with new page order (1-based page numbers) output_path: Path where reordered PDF will be saved Returns: Dictionary containing reorder results """ start_time = time.time() try: # Validate paths input_pdf_path = await validate_pdf_path(pdf_path) output_pdf_path = validate_output_path(output_path) # Parse page order try: order_list = json.loads(page_order) except json.JSONDecodeError as e: return { "success": False, "error": f"Invalid JSON in page_order: {e}", "reorder_time": round(time.time() - start_time, 2) } if not isinstance(order_list, list): return { "success": False, "error": "page_order must be a list of page numbers", "reorder_time": round(time.time() - start_time, 2) } # Open input document input_doc = fitz.open(str(input_pdf_path)) total_pages = len(input_doc) # Validate page numbers (convert to 0-based) valid_pages = [] invalid_pages = [] for page_num in order_list: try: page_index = int(page_num) - 1 # Convert to 0-based if 0 <= page_index < total_pages: valid_pages.append(page_index) else: invalid_pages.append(page_num) except (ValueError, TypeError): invalid_pages.append(page_num) if invalid_pages: input_doc.close() return { "success": False, "error": f"Invalid page numbers: {invalid_pages}. 
Pages must be between 1 and {total_pages}", "reorder_time": round(time.time() - start_time, 2) } # Create reordered document output_doc = fitz.open() for page_index in valid_pages: try: output_doc.insert_pdf(input_doc, from_page=page_index, to_page=page_index) except Exception as e: logger.warning(f"Failed to copy page {page_index + 1}: {e}") # Save reordered document output_doc.save(str(output_pdf_path)) output_size = output_pdf_path.stat().st_size input_doc.close() output_doc.close() return { "success": True, "reorder_summary": { "input_pages": total_pages, "output_pages": len(valid_pages), "pages_reordered": len(valid_pages), "output_size_bytes": output_size, "output_size_mb": round(output_size / (1024 * 1024), 2) }, "page_mapping": { "original_order": list(range(1, total_pages + 1)), "new_order": [p + 1 for p in valid_pages], "pages_duplicated": len(valid_pages) - len(set(valid_pages)), "pages_omitted": total_pages - len(set(valid_pages)) }, "output_info": { "output_path": str(output_pdf_path), "total_pages": len(valid_pages) }, "reorder_time": round(time.time() - start_time, 2) } except Exception as e: error_msg = sanitize_error_message(str(e)) logger.error(f"PDF page reorder failed: {error_msg}") return { "success": False, "error": error_msg, "reorder_time": round(time.time() - start_time, 2) }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rsp2k/mcp-pdf'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

document_assembly.py•14.9 KiB