pdf_auto_crop_page
Automatically crop PDF pages by detecting content boundaries to remove blank margins and optimize document layout for better viewing and printing.
Instructions
Automatically crop a PDF page to remove blank margins by detecting content boundaries.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| pdf_path | Yes | ||
| page_number | No | ||
| padding | No | | 10.0 |
Implementation Reference
# The core handler function for the 'pdf_auto_crop_page' tool. Decorated with
# @mcp.tool(), which registers it automatically with the FastMCP server.
# Detects content (text words, images, drawings), computes the bounding-box
# union, applies conservative asymmetric padding, and sets the cropbox if
# margins are significant. Handles a single page or all pages.
#
# Parameters:
#   pdf_path:    path of the PDF to read; the input file itself is not modified,
#                the result is saved under the name produced by
#                generate_output_filename(pdf_path, "auto_cropped").
#   page_number: index passed straight to doc[...]; None processes every page.
#   padding:     requested padding in points. The effective padding is never
#                below 20 points (see _padded_crop_rect).
# Returns a human-readable status string; errors are reported as strings
# starting with "Error" rather than raised.
#
# NOTE(review): the docstring is kept verbatim because FastMCP presumably
# publishes it as the tool description - confirm before expanding it.
# NOTE(review): rotated pages are not handled; set_cropbox expects unrotated
# coordinates - confirm whether rotation support is needed.
@mcp.tool()
async def pdf_auto_crop_page(
    pdf_path: str,
    page_number: Optional[int] = None,
    padding: float = 10.0
) -> str:
    """Automatically crop a PDF page to remove blank margins by detecting content boundaries."""
    if not os.path.exists(pdf_path):
        return f"Error: PDF file not found: {pdf_path}"

    if not validate_pdf_file(pdf_path):
        return f"Error: Invalid PDF file: {pdf_path}"

    try:
        # The context manager closes the document on every exit path,
        # including exceptions raised while processing or saving.
        with fitz.open(pdf_path) as doc:
            # Determine pages to process
            if page_number is not None:
                if not validate_page_number(doc, page_number):
                    return f"Error: Invalid page number {page_number}. Document has {len(doc)} pages."
                pages_to_process = [page_number]
            else:
                pages_to_process = range(len(doc))

            cropped_pages = 0

            for page_num in pages_to_process:
                page = doc[page_num]

                content_rects = _collect_content_rects(page)
                if not content_rects:
                    # No content found, skip this page
                    continue

                # Union of all content rectangles
                content_rect = fitz.Rect(content_rects[0])
                for rect in content_rects[1:]:
                    content_rect |= rect

                crop_rect = _padded_crop_rect(content_rect, page.rect, padding)
                if crop_rect is None:
                    # Margins too small, or cropping would not shrink the page
                    continue

                # Content coordinates are relative to page.rect, whose origin
                # is the top-left of the current CropBox, while set_cropbox()
                # expects coordinates in the MediaBox frame. Shift by the
                # existing CropBox position (a no-op when it is (0, 0)).
                origin = page.cropbox_position
                page.set_cropbox(
                    fitz.Rect(
                        crop_rect.x0 + origin.x,
                        crop_rect.y0 + origin.y,
                        crop_rect.x1 + origin.x,
                        crop_rect.y1 + origin.y,
                    )
                )
                cropped_pages += 1

            if cropped_pages == 0:
                return "No content found to crop on any pages."

            # Generate output filename
            output_path = generate_output_filename(pdf_path, "auto_cropped")

            # Save the modified PDF
            doc.save(output_path)

        page_info = f"page {page_number + 1}" if page_number is not None else f"{cropped_pages} pages"
        return f"Successfully auto-cropped {page_info}. Output saved to: {output_path}"

    except Exception as e:
        # Tool boundary: report any failure to the caller as a string.
        return f"Error auto-cropping PDF: {str(e)}"


def _collect_content_rects(page):
    """Return bounding boxes (fitz.Rect) of text, images and drawings lying fully on *page*."""
    candidates = []

    # Word-level text boxes give tighter bounds than block-level ones; each
    # "words" entry starts with (x0, y0, x1, y1).
    candidates.extend(word[:4] for word in page.get_text("words") if len(word) >= 4)

    # get_images() items start with (xref, smask, width, height) - not
    # coordinates - so the placement boxes come from get_image_info() instead.
    candidates.extend(info["bbox"] for info in page.get_image_info() if "bbox" in info)

    # Drawings (lines, shapes, paths) are best-effort: a failure here must not
    # prevent cropping based on text and images.
    try:
        candidates.extend(
            drawing["rect"] for drawing in page.get_drawings() if "rect" in drawing
        )
    except Exception:
        pass

    page_rect = page.rect
    valid_rects = []
    for candidate in candidates:
        try:
            rect = fitz.Rect(candidate)
        except (TypeError, ValueError):
            # Malformed coordinates: ignore this item.
            continue
        # Keep only non-degenerate rectangles that lie entirely on the page.
        if (
            rect.is_valid
            and rect.width > 0
            and rect.height > 0
            and rect.x0 >= 0
            and rect.y0 >= 0
            and rect.x1 <= page_rect.width
            and rect.y1 <= page_rect.height
        ):
            valid_rects.append(rect)
    return valid_rects


def _padded_crop_rect(content_rect, page_rect, padding):
    """Return the padded crop rectangle in page.rect coordinates, or None if no crop applies."""
    margin_threshold = 2.0  # points
    min_padding = 20.0  # points

    # PyMuPDF's origin is the top-left corner, so y0 is the top edge.
    left_margin = content_rect.x0
    top_margin = content_rect.y0
    right_margin = page_rect.width - content_rect.x1
    bottom_margin = page_rect.height - content_rect.y1

    # Crop only if at least one side has a margin larger than the threshold.
    if max(left_margin, top_margin, right_margin, bottom_margin) <= margin_threshold:
        return None

    # Conservative padding: never less than 20 points, even if the caller
    # asked for less.
    conservative_padding = max(padding, min_padding)

    # Asymmetric padding: half padding on the sides, full padding on top and
    # bottom, each capped at 80% of the margin available on that same side.
    crop_rect = fitz.Rect(
        content_rect.x0 - min(conservative_padding * 0.5, left_margin * 0.8),
        content_rect.y0 - min(conservative_padding, top_margin * 0.8),
        content_rect.x1 + min(conservative_padding * 0.5, right_margin * 0.8),
        content_rect.y1 + min(conservative_padding, bottom_margin * 0.8),
    )

    # Ensure the crop box is within page bounds
    crop_rect.intersect(page_rect)

    # Apply the crop only if it actually reduces the page size.
    if crop_rect.width < page_rect.width or crop_rect.height < page_rect.height:
        return crop_rect
    return None