#!/usr/bin/env python3
"""
MCP tools for PDF to Markdown conversion.
"""
import os
from pathlib import Path
from server import mcp
from utils.converters import pdf_to_markdown, resolve_path
@mcp.tool()
def convert_pdf_to_markdown(
    pdf_file_path: str,
    output_filename: str = None
) -> str:
    """
    Convert a PDF file to Markdown format.

    Args:
        pdf_file_path: Path to the PDF file to convert (passed through resolve_path)
        output_filename: Name of the output Markdown file. When omitted or empty,
            the input file's stem with a .md extension is used; a .md extension
            is appended when the given name lacks one.

    Returns:
        A message indicating success or failure of the conversion. On success the
        message includes the output file size and, when the output can be read as
        UTF-8, a preview of up to 500 characters of its content.
    """
    preview_chars = 500
    try:
        # Resolve input path (handles both relative and absolute paths)
        input_path = resolve_path(pdf_file_path)
        if not input_path.exists():
            return f"Error: Input file '{pdf_file_path}' does not exist at resolved path: {input_path}"
        if input_path.suffix.lower() != '.pdf':
            return f"Error: '{pdf_file_path}' is not a PDF file. Please provide a .pdf file."

        # Treat an empty name like a missing one; otherwise "" would become
        # the hidden file ".md".
        if not output_filename:
            output_filename = input_path.stem + '.md'
        elif not output_filename.lower().endswith('.md'):
            output_filename += '.md'

        # Resolve output path (handles both relative and absolute paths)
        output_path = resolve_path(output_filename)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Convert PDF to Markdown
        success, message = pdf_to_markdown(
            pdf_path=str(input_path),
            output_path=str(output_path)
        )
        if not success:
            return f"Conversion failed: {message}"

        file_size = output_path.stat().st_size if output_path.exists() else 0
        summary = (
            f"{message}\nInput: {pdf_file_path} (resolved to: {input_path})"
            f"\nFile size: {file_size:,} bytes"
        )

        # The preview is best-effort: read only one character more than the
        # preview length (enough to know whether to append "...") instead of
        # loading the whole document, and fall back to the plain summary when
        # the file cannot be read or decoded.
        try:
            with open(output_path, 'r', encoding='utf-8') as f:
                head = f.read(preview_chars + 1)
        except (OSError, UnicodeError):
            return summary

        preview = head[:preview_chars] + "..." if len(head) > preview_chars else head
        return f"{summary}\n\nContent preview:\n{preview}"
    except Exception as e:
        # Tool boundary: report any failure as text rather than raising.
        return f"Error during PDF to Markdown conversion: {str(e)}"
@mcp.tool()
def extract_text_from_pdf(
    working_dir: str,
    pdf_file_path: str,
    page_numbers: str = "all"
) -> str:
    """
    Extract text content from a PDF file without conversion to Markdown.

    Args:
        working_dir: Absolute path to the working directory for file operations
        pdf_file_path: Path to the PDF file relative to working_dir
        page_numbers: Page numbers to extract - "all" for all pages, or
            comma-separated 1-based numbers and ranges like "1,3,5" or "1-5".
            Pages outside the document are skipped; a malformed value yields
            an error message.

    Returns:
        The extracted text content from the PDF, one "=== Page N ===" section
        per page that produced text, or a message describing why nothing was
        extracted. Failures are reported in the returned string, not raised.
    """
    try:
        # Validate working directory
        working_path = Path(working_dir)
        if not working_path.is_absolute():
            return f"Error: working_dir must be an absolute path, got: {working_dir}"
        if not working_path.exists():
            return f"Error: working_dir does not exist: {working_dir}"
        if not working_path.is_dir():
            return f"Error: working_dir is not a directory: {working_dir}"

        # Create input path relative to working directory
        input_path = working_path / pdf_file_path
        if not input_path.exists():
            return f"Error: Input file '{pdf_file_path}' does not exist at: {input_path}"
        if input_path.suffix.lower() != '.pdf':
            return f"Error: '{pdf_file_path}' is not a PDF file. Please provide a .pdf file."

        # Reject a malformed page selection before opening the document so the
        # caller gets a specific message instead of a raw int() error.
        try:
            page_spans = _parse_page_spec(page_numbers)
        except ValueError as exc:
            return f"Error: invalid page_numbers '{page_numbers}': {exc}"

        # Imported late so input problems are reported even when the PDF
        # backend is unavailable.
        import pdfplumber

        extracted_text = []
        with pdfplumber.open(input_path) as pdf:
            total_pages = len(pdf.pages)

            if page_spans is None:
                pages_to_extract = list(range(total_pages))
            else:
                pages_to_extract = []
                for first, last in page_spans:
                    # Clamp to the document: a start below 1 must not turn into
                    # a negative index (which would select pages from the end),
                    # and an end past the last page is cut off.
                    pages_to_extract.extend(
                        range(max(first, 1) - 1, min(last, total_pages))
                    )

            # Extract text from specified pages
            for page_idx in pages_to_extract:
                text = pdf.pages[page_idx].extract_text()
                if text:
                    extracted_text.append(f"=== Page {page_idx + 1} ===\n{text}\n")

        if extracted_text:
            full_text = "\n".join(extracted_text)
            return f"Successfully extracted text from {len(pages_to_extract)} page(s) of '{pdf_file_path}' (at: {input_path}):\n\n{full_text}"
        return f"No text content found in the specified pages of '{pdf_file_path}' (at: {input_path})"
    except Exception as e:
        # Tool boundary: report any failure as text rather than raising.
        return f"Error during text extraction: {str(e)}"


def _parse_page_spec(page_numbers: str):
    """
    Parse a page selection string into 1-based inclusive (first, last) spans.

    Returns None when the selection is "all" (case-insensitive, surrounding
    whitespace ignored); otherwise a list of (first, last) integer tuples in
    the order given, where a single page N becomes (N, N). Empty items caused
    by stray commas are skipped.

    Raises:
        ValueError: if an item is not an integer or a "first-last" pair of
            integers, or if no page numbers are given at all.
    """
    spec = page_numbers.strip()
    if spec.lower() == "all":
        return None

    spans = []
    for part in spec.split(','):
        part = part.strip()
        if not part:
            continue
        # partition() splits on the first dash only, so inputs such as
        # "1-2-3" or "-3" fail the int() conversion below.
        first_text, dash, last_text = part.partition('-')
        try:
            first = int(first_text)
            last = int(last_text) if dash else first
        except ValueError:
            raise ValueError(
                f"'{part}' is not a page number or a range like '1-5'"
            ) from None
        spans.append((first, last))

    if not spans:
        raise ValueError("no page numbers given")
    return spans
@mcp.tool()
def get_pdf_info(
    working_dir: str,
    pdf_file_path: str
) -> str:
    """
    Summarize a PDF file: size, page count, metadata and a first-page sample.

    Args:
        working_dir: Absolute path to the working directory for file operations
        pdf_file_path: Path to the PDF file relative to working_dir

    Returns:
        A multi-line report with the file size, the number of pages, every
        metadata entry that has a non-empty value, the first page's width and
        height, and up to 200 characters of text from the first page. Any
        problem is reported as a string starting with "Error" instead of
        being raised.
    """
    try:
        import pdfplumber

        # The working directory must be an existing absolute directory.
        base_dir = Path(working_dir)
        if not base_dir.is_absolute():
            return f"Error: working_dir must be an absolute path, got: {working_dir}"
        if not base_dir.exists():
            return f"Error: working_dir does not exist: {working_dir}"
        if not base_dir.is_dir():
            return f"Error: working_dir is not a directory: {working_dir}"

        # Locate the PDF under the working directory.
        pdf_path = base_dir / pdf_file_path
        if not pdf_path.exists():
            return f"Error: Input file '{pdf_file_path}' does not exist at: {pdf_path}"
        if pdf_path.suffix.lower() != '.pdf':
            return f"Error: '{pdf_file_path}' is not a PDF file. Please provide a .pdf file."

        report = [
            f"PDF File: {pdf_file_path} (at: {pdf_path})",
            f"File Size: {pdf_path.stat().st_size:,} bytes",
        ]

        with pdfplumber.open(pdf_path) as document:
            pages = document.pages
            report.append(f"Number of Pages: {len(pages)}")

            # Only metadata entries with a truthy value are listed.
            meta = document.metadata
            if meta:
                report.append("\nMetadata:")
                report.extend(
                    f" {key}: {value}" for key, value in meta.items() if value
                )

            # The first page stands in for the document's dimensions and
            # supplies the text sample.
            if pages:
                page_one = pages[0]
                report.append("\nPage Dimensions (first page):")
                report.append(f" Width: {page_one.width:.1f} points")
                report.append(f" Height: {page_one.height:.1f} points")

                snippet = page_one.extract_text()
                if snippet:
                    if len(snippet) > 200:
                        preview = snippet[:200] + "..."
                    else:
                        preview = snippet
                    report.append(f"\nSample text from first page:\n{preview}")

        return "\n".join(report)
    except Exception as e:
        return f"Error getting PDF info: {str(e)}"