Skip to main content
Glama
marc-hanheide

PDF Redaction MCP Server

server.py (30.5 kB)
"""PDF Redaction MCP Server.

This MCP server provides tools for reading and redacting PDF files using
PyMuPDF (fitz).
"""

import fitz  # PyMuPDF
from pathlib import Path
from typing import Annotated

from fastmcp import FastMCP, Context
from fastmcp.exceptions import ToolError
from pydantic import Field

# Create the MCP server
mcp = FastMCP(name="PDF Redaction Server")

# Store for loaded PDFs (in-memory, keyed by resolved file path)
_loaded_pdfs: dict[str, fitz.Document] = {}

# Store for applied redactions (keyed by resolved file path, containing the
# list of texts for which redaction annotations were added)
_applied_redactions: dict[str, list[str]] = {}


def _get_loaded_document(path: Path) -> fitz.Document:
    """Return the loaded document for *path* or raise ToolError if absent."""
    try:
        return _loaded_pdfs[str(path)]
    except KeyError:
        raise ToolError(
            f"PDF not loaded. Please load it first using load_pdf: {path}"
        ) from None


def _check_fill_color(fill_color: tuple[float, float, float]) -> None:
    """Raise ToolError unless every RGB component lies in the 0-1 range."""
    if not all(0 <= c <= 1 for c in fill_color):
        raise ToolError("RGB color values must be between 0 and 1")


@mcp.tool
async def load_pdf(
    pdf_path: Annotated[str, Field(description="Path to the PDF file to load")],
    ctx: Context
) -> str:
    """Load a PDF file and make it available for redaction.

    This tool loads a PDF file into memory and extracts its text content for
    review. The PDF remains loaded for subsequent redaction operations.
    Loading a PDF that is already loaded re-reads it from disk and discards
    any redactions that were marked but not yet saved.

    Args:
        pdf_path: Path to the PDF file to load
        ctx: MCP context for logging

    Returns:
        The full text content of the PDF

    Raises:
        ToolError: If the file doesn't exist or cannot be opened
    """
    try:
        path = Path(pdf_path).resolve()
        path_str = str(path)
        await ctx.info(f"Loading PDF from: {path}")

        if not path.exists():
            raise ToolError(f"PDF file not found: {path}")
        if not path.is_file():
            raise ToolError(f"Path is not a file: {path}")

        # Open the PDF
        doc = fitz.open(path_str)

        # A fresh document carries none of the annotations added to a
        # previously loaded copy, so the old handle is closed and the
        # tracking list is reset; stale tracking would make redact_text skip
        # texts that are no longer marked in the new document.
        previous = _loaded_pdfs.pop(path_str, None)
        if previous is not None:
            previous.close()
            await ctx.warning(
                f"PDF was already loaded; reloaded from disk and discarded pending redactions: {path}"
            )

        # Store the document for later use
        _loaded_pdfs[path_str] = doc
        _applied_redactions[path_str] = []

        # Extract text from all pages
        text_content = [
            f"--- Page {page_num} ---\n{page.get_text()}"
            for page_num, page in enumerate(doc, start=1)
        ]
        full_text = "\n\n".join(text_content)

        await ctx.info(f"Successfully loaded PDF with {len(doc)} pages")
        return full_text
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to load PDF: {str(e)}")
        raise ToolError(f"Failed to load PDF: {str(e)}") from e


@mcp.tool
async def redact_text(
    pdf_path: Annotated[str, Field(description="Path to the loaded PDF file")],
    texts_to_redact: Annotated[list[str], Field(description="List of text strings to search for and redact")],
    fill_color: Annotated[tuple[float, float, float], Field(
        description="RGB color for redaction (values 0-1). Default is black (0,0,0)"
    )] = (0, 0, 0),
    ctx: Context = None
) -> str:
    """Redact specific texts in a loaded PDF.

    This tool searches for all instances of the specified texts in the PDF
    and adds redaction annotations over them. The redactions are not yet
    applied to the document - use save_redacted_pdf to apply and save.
    Only texts that haven't been previously redacted will be processed.

    Args:
        pdf_path: Path to the PDF file (must be already loaded)
        texts_to_redact: List of text strings to search for and redact
        fill_color: RGB color tuple (0-1 range) for the redaction box. Default is black.
        ctx: MCP context for logging

    Returns:
        Summary of redaction operations

    Raises:
        ToolError: If the PDF is not loaded or redaction fails
    """
    try:
        path = Path(pdf_path).resolve()
        path_str = str(path)
        await ctx.info(f"Redacting texts in: {path}")

        # Check if PDF is loaded
        doc = _get_loaded_document(path)

        # Initialize redaction tracking for this PDF if not exists
        tracked = _applied_redactions.setdefault(path_str, [])

        # Validate color values
        _check_fill_color(fill_color)

        # Filter out already redacted texts. dict.fromkeys drops duplicates
        # within this request (keeping order) so a repeated text is not
        # annotated and tracked twice.
        already_redacted = []
        new_texts = []
        for text in dict.fromkeys(texts_to_redact):
            if text in tracked:
                already_redacted.append(text)
            else:
                new_texts.append(text)

        if not new_texts:
            skipped_msg = f"All {len(texts_to_redact)} text(s) have already been redacted. No new redactions added."
            await ctx.info(skipped_msg)
            return skipped_msg

        total_redactions = 0
        text_summaries = []

        # Process each text to redact
        for text_to_redact in new_texts:
            text_redaction_count = 0
            pages_with_hits = 0

            # Search and redact on each page
            for page in doc:
                text_instances = page.search_for(text_to_redact)
                if not text_instances:
                    continue
                # Add redaction annotations for each instance
                for inst in text_instances:
                    page.add_redact_annot(inst, fill=fill_color)
                text_redaction_count += len(text_instances)
                pages_with_hits += 1

            total_redactions += text_redaction_count

            if text_redaction_count > 0:
                # Track this redaction
                tracked.append(text_to_redact)
                text_summaries.append(
                    f"  '{text_to_redact}': {text_redaction_count} instance(s) across {pages_with_hits} page(s)"
                )
            else:
                text_summaries.append(f"  '{text_to_redact}': No instances found")

        if total_redactions == 0:
            msg = "No instances of the provided texts found in the PDF"
            await ctx.warning(msg)
            return msg

        summary_parts = [f"Added {total_redactions} redaction(s) for {len(new_texts)} text(s):"]
        summary_parts.extend(text_summaries)

        if already_redacted:
            summary_parts.append(
                f"\nSkipped {len(already_redacted)} text(s) already redacted: {', '.join(repr(t) for t in already_redacted)}"
            )

        summary_parts.append(
            "\nNote: Redactions are marked but not yet applied. "
            "Use save_redacted_pdf to apply and save the changes."
        )
        summary = "\n".join(summary_parts)

        await ctx.info(f"Added {total_redactions} redaction annotations for {len(new_texts)} new text(s)")
        return summary
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to redact text: {str(e)}")
        raise ToolError(f"Failed to redact text: {str(e)}") from e


@mcp.tool
async def redact_area(
    pdf_path: Annotated[str, Field(description="Path to the loaded PDF file")],
    page_number: Annotated[int, Field(description="Page number (1-indexed)", ge=1)],
    x0: Annotated[float, Field(description="Left x coordinate")],
    y0: Annotated[float, Field(description="Top y coordinate")],
    x1: Annotated[float, Field(description="Right x coordinate")],
    y1: Annotated[float, Field(description="Bottom y coordinate")],
    fill_color: Annotated[tuple[float, float, float], Field(
        description="RGB color for redaction (values 0-1). Default is black (0,0,0)"
    )] = (0, 0, 0),
    ctx: Context = None
) -> str:
    """Redact a specific rectangular area on a PDF page.

    This tool adds a redaction annotation for a specific rectangular area
    defined by coordinates. The redactions are not yet applied to the
    document - use save_redacted_pdf to apply and save.

    Args:
        pdf_path: Path to the PDF file (must be already loaded)
        page_number: Page number to redact (1-indexed)
        x0: Left x coordinate of the rectangle
        y0: Top y coordinate of the rectangle
        x1: Right x coordinate of the rectangle
        y1: Bottom y coordinate of the rectangle
        fill_color: RGB color tuple (0-1 range) for the redaction box. Default is black.
        ctx: MCP context for logging

    Returns:
        Confirmation message

    Raises:
        ToolError: If the PDF is not loaded, page doesn't exist, the area is
            empty, or redaction fails
    """
    try:
        path = Path(pdf_path).resolve()
        await ctx.info(f"Redacting area on page {page_number} in: {path}")

        # Check if PDF is loaded
        doc = _get_loaded_document(path)

        # Validate page number
        if page_number < 1 or page_number > len(doc):
            raise ToolError(
                f"Invalid page number {page_number}. PDF has {len(doc)} pages."
            )

        # Validate color values
        _check_fill_color(fill_color)

        # Get the page (0-indexed internally)
        page = doc[page_number - 1]

        # Create rectangle; normalize() swaps reversed corners so callers may
        # pass the two points in either order. A zero-area box cannot cover
        # anything and is rejected explicitly.
        rect = fitz.Rect(x0, y0, x1, y1)
        rect.normalize()
        if rect.is_empty:
            raise ToolError(
                f"Redaction area ({x0}, {y0}, {x1}, {y1}) is empty; width and height must be non-zero."
            )
        page.add_redact_annot(rect, fill=fill_color)

        await ctx.info(f"Added area redaction on page {page_number}")
        return (
            f"Added redaction for area ({x0}, {y0}, {x1}, {y1}) on page {page_number}.\n"
            + "Note: Redaction is marked but not yet applied. "
            + "Use save_redacted_pdf to apply and save the changes."
        )
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to redact area: {str(e)}")
        raise ToolError(f"Failed to redact area: {str(e)}") from e


@mcp.tool
async def save_redacted_pdf(
    pdf_path: Annotated[str, Field(description="Path to the loaded PDF file")],
    output_path: Annotated[str | None, Field(
        description="Optional output path. If not provided, appends '_redacted' to the original filename"
    )] = None,
    ctx: Context = None
) -> str:
    """Apply all redactions and save the redacted PDF.

    This tool applies all pending redaction annotations to the PDF and saves
    it. By default, it saves to a new file with '_redacted' appended to the
    original filename.

    Args:
        pdf_path: Path to the PDF file (must be already loaded)
        output_path: Optional custom output path. If not provided, saves as '<original_name>_redacted.pdf'
        ctx: MCP context for logging

    Returns:
        Path to the saved redacted PDF

    Raises:
        ToolError: If the PDF is not loaded, the output path equals the
            source path, or save fails
    """
    try:
        path = Path(pdf_path).resolve()
        await ctx.info(f"Saving redacted PDF: {path}")

        # Check if PDF is loaded
        doc = _get_loaded_document(path)

        # Determine output path
        if output_path:
            out_path = Path(output_path).resolve()
        else:
            # Append '_redacted' to the original filename
            out_path = path.parent / f"{path.stem}_redacted{path.suffix}"

        # PyMuPDF only allows incremental saves onto the opened file, and an
        # incremental save would keep the original content inside the file,
        # so overwriting the source is refused outright.
        if out_path == path:
            raise ToolError(
                f"Output path must differ from the source PDF: {out_path}"
            )

        # Apply redactions page by page and count the pages that changed
        pages_redacted = 0
        for page in doc:
            # Note: apply_redactions returns True if any were applied
            if page.apply_redactions():
                pages_redacted += 1

        # Save the document
        doc.save(str(out_path))

        await ctx.info(f"Saved redacted PDF to: {out_path}")
        return (
            f"Successfully applied redactions and saved to: {out_path}\n"
            f"Redactions applied on {pages_redacted} page(s)."
        )
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to save redacted PDF: {str(e)}")
        raise ToolError(f"Failed to save redacted PDF: {str(e)}") from e


@mcp.tool
async def list_loaded_pdfs(ctx: Context = None) -> str:
    """List all currently loaded PDF files.

    Returns:
        List of loaded PDF file paths
    """
    if not _loaded_pdfs:
        return "No PDFs currently loaded."

    loaded_list = "\n".join(
        f"- {path} ({len(doc)} pages)" for path, doc in _loaded_pdfs.items()
    )
    await ctx.info(f"Currently {len(_loaded_pdfs)} PDF(s) loaded")
    return f"Currently loaded PDFs:\n{loaded_list}"


@mcp.tool
async def close_pdf(
    pdf_path: Annotated[str, Field(description="Path to the PDF file to close")],
    ctx: Context = None
) -> str:
    """Close a loaded PDF and free its resources.

    Args:
        pdf_path: Path to the PDF file to close
        ctx: MCP context for logging

    Returns:
        Confirmation message

    Raises:
        ToolError: If the PDF is not loaded
    """
    try:
        path = Path(pdf_path).resolve()
        path_str = str(path)

        if path_str not in _loaded_pdfs:
            raise ToolError(f"PDF not loaded: {path}")

        # Close the document
        _loaded_pdfs[path_str].close()
        del _loaded_pdfs[path_str]

        # Also clear redaction tracking for this PDF
        _applied_redactions.pop(path_str, None)

        await ctx.info(f"Closed PDF: {path}")
        return f"Successfully closed PDF: {path}"
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to close PDF: {str(e)}")
        raise ToolError(f"Failed to close PDF: {str(e)}") from e


@mcp.tool
async def list_applied_redactions(
    pdf_path: Annotated[str | None, Field(
        description="Optional path to a specific PDF file. If not provided, lists redactions for all loaded PDFs"
    )] = None,
    ctx: Context = None
) -> str:
    """List all redactions that have been applied to loaded PDF(s).

    This tool shows which texts have been marked for redaction in each PDF,
    helping to avoid duplicate redactions and track what has already been
    processed.

    Args:
        pdf_path: Optional path to a specific PDF. If not provided, lists all PDFs.
        ctx: MCP context for logging

    Returns:
        List of applied redactions for the specified PDF(s)

    Raises:
        ToolError: If a specific PDF path is provided but not loaded
    """
    try:
        if pdf_path:
            # List redactions for a specific PDF
            path = Path(pdf_path).resolve()
            path_str = str(path)

            if path_str not in _loaded_pdfs:
                raise ToolError(f"PDF not loaded: {path}")

            redactions = _applied_redactions.get(path_str, [])
            if not redactions:
                await ctx.info(f"No redactions applied to: {path}")
                return f"No redactions have been applied to: {path}"

            redaction_list = "\n".join(f"  - '{text}'" for text in redactions)
            result = (
                f"Applied redactions for: {path}\n"
                f"Total: {len(redactions)} text(s) redacted\n\n"
                f"{redaction_list}"
            )
            await ctx.info(f"Listed {len(redactions)} redaction(s) for {path}")
            return result

        # List redactions for all loaded PDFs
        if not _applied_redactions:
            await ctx.info("No redactions applied to any PDF")
            return "No redactions have been applied to any loaded PDFs."

        results = []
        total_count = 0
        for pdf_path_str, redactions in _applied_redactions.items():
            if not redactions:
                continue  # Only show PDFs with redactions
            redaction_list = "\n".join(f"    - '{text}'" for text in redactions)
            results.append(
                f"  {pdf_path_str}:\n"
                f"  Total: {len(redactions)} text(s)\n"
                f"{redaction_list}"
            )
            total_count += len(redactions)

        if not results:
            await ctx.info("No redactions applied to any PDF")
            return "No redactions have been applied to any loaded PDFs."

        result = (
            f"Applied redactions across all loaded PDFs:\n"
            f"Total PDFs with redactions: {len(results)}\n"
            f"Total texts redacted: {total_count}\n\n"
            + "\n\n".join(results)
        )
        await ctx.info(f"Listed redactions for {len(results)} PDF(s), {total_count} total texts")
        return result
    except ToolError:
        raise
    except Exception as e:
        await ctx.error(f"Failed to list redactions: {str(e)}")
        raise ToolError(f"Failed to list redactions: {str(e)}") from e


@mcp.prompt
async def redact_pii(pdf_path: Annotated[str, Field(description="Path to the PDF file to redact")]) -> str:
    """Generate a comprehensive prompt for redacting personally identifiable information (PII) from a PDF.

    This prompt guides the LLM to identify and redact all information that
    could identify individuals, including names, institutions, ethnicity,
    gender, and any other identifying information.

    Args:
        pdf_path: Path to the PDF file to be redacted

    Returns:
        The prompt text with instructions and context for PII redaction
    """
    # The redact_text examples pass a list because the tool's
    # texts_to_redact parameter is a list of strings.
    prompt_text = f"""You are a Privacy Protection Specialist tasked with redacting personally identifiable information (PII) from a PDF document.

**Document to Redact:** `{pdf_path}`

**Your Task:**
1. First, load and carefully review the entire PDF document using the `load_pdf` tool
2. Identify ALL information that could reveal the identity of any individual
3. Redact every instance of such information using the `redact_text` tool
4. Save the redacted PDF using the `save_redacted_pdf` tool

**Categories of Information to Redact:**

**1. Personal Identifiers:**
- Full names (first, middle, last names)
- Nicknames, aliases, or usernames
- Initials when they could identify someone
- Maiden names or previous names

**2. Educational/Institutional Affiliations:**
- University names (e.g., "Stanford University", "MIT")
- College names
- Department names that could identify an institution
- Degree programs or majors (when combined with other info)
- Student ID numbers
- Academic advisor names
- Thesis or dissertation titles that are unique

**3. Demographic Information:**
- Ethnicity or race (e.g., "Asian", "Hispanic", "Caucasian")
- Country of origin or nationality
- Native language (when identifying)
- Gender or sex (e.g., "male", "female", "man", "woman", "he", "she", pronouns)
- Age or date of birth
- Marital status

**4. Contact Information:**
- Email addresses
- Phone numbers
- Physical addresses (street, city, state, zip)
- Social media handles
- Website URLs that lead to personal profiles

**5. Professional Information:**
- Current employer names
- Job titles (when identifying)
- Work locations
- LinkedIn profile URLs
- ORCID or ResearcherID numbers

**6. Identifying Links and References:**
- Personal website URLs
- GitHub profile links
- Google Scholar profiles
- ResearchGate or Academia.edu profiles
- Personal blog links
- Any URL containing a person's name

**7. Unique Identifiers:**
- Social Security Numbers
- Passport numbers
- Driver's license numbers
- Credit card numbers
- National ID numbers
- Any reference numbers that could identify individuals

**8. Contextual Identifiers:**
- Specific dates that could identify someone clearly (birth dates)
- Photographs or image captions mentioning names
- Signatures or handwritten names
- Award names that identify recipients clearly
- Publication authors

**Important Guidelines:**

1. **Be Thorough:** Read the document carefully.

2. **Context Matters:** Consider whether information could be cross-referenced with public sources to identify someone. For example:
   - "PhD student in Computer Science at [REDACTED] working on neural networks" → also redact field specifics if too identifying
   - Unique research topics or rare specializations
   - Specific project names that appear in public databases

3. **Preserve Structure:** Try to maintain the document's overall structure and readability where possible.

4. **Generic Terms:** Replace with generic terms when appropriate:
   - Names → [NAME REDACTED]
   - Universities → [INSTITUTION REDACTED]
   - Countries → [LOCATION REDACTED]
   - Dates → [DATE REDACTED]

**Your Step-by-Step Process:**

1. Load the PDF: `load_pdf("{pdf_path}")`
2. Review the content and identify ALL PII according to the categories above
3. For each piece of identifying information found:
   - Use `redact_text` to redact that specific text
   - Example: `redact_text("{pdf_path}", ["John Smith"])`
4. After redacting all PII, save: `save_redacted_pdf("{pdf_path}")`
5. Provide a summary of:
   - Total number of items redacted
   - Categories of information redacted
   - Any challenges or uncertain cases

**Remember:** The goal is complete anonymization. No one should be able to identify any individual from the redacted document, even with access to external databases or public information. But retain as much information to facilitate an objective assessment of the document's content and purpose as possible.

Begin by loading and reviewing the PDF document."""
    return prompt_text


@mcp.prompt
async def redact_for_anonymous_recruitment(pdf_path: Annotated[str, Field(description="Path to the PhD application PDF to redact")]) -> str:
    """Generate a comprehensive prompt for redacting identifying information from PhD applications for anonymous recruitment.

    This prompt is specifically designed for the anonymous recruitment
    process to mitigate unconscious bias and ensure fairness and equality of
    opportunity. It redacts all identifying information while preserving
    content that demonstrates the applicant's skills, talent, passion, and
    fit for the position.

    Args:
        pdf_path: Path to the PhD application PDF to be redacted

    Returns:
        A detailed prompt with instructions for anonymous recruitment redaction
    """
    # The redact_text example passes a list because the tool's
    # texts_to_redact parameter is a list of strings.
    prompt_text = f"""You are an Anonymous Recruitment Specialist tasked with redacting identifying information from a PhD application to ensure a fair and unbiased shortlisting process.

**Document to Redact:** `{pdf_path}`

**Context:** This application is for a PhD position. The anonymous recruitment process is essential to mitigate unconscious bias related to gender, ethnicity, nationality, age, and other protected characteristics, ensuring all applicants have an equal and fair chance based solely on their skills, qualifications, passion, and fit for the position.

**Your Task:**
1. Load and carefully review the entire application using the `load_pdf` tool
2. Identify ALL information that could reveal the applicant's identity or protected characteristics
3. Redact every instance of such information using the `redact_text` tool
4. Save the redacted application using the `save_redacted_pdf` tool

**CRITICAL: Information That MUST Be Redacted:**

**1. Personal Identifiers:**
- Full names (first, middle, last, maiden names)
- Titles (Mr., Mrs., Ms., Dr., Prof., etc.)
- Nicknames, aliases, or usernames
- Initials that could identify the applicant
- Signatures or handwritten names
- Photographs or images of the applicant

**2. Protected Characteristics:**
- Age or date of birth (e.g., "25 years old", "born in 1998")
- Gender or sex (e.g., "male", "female", "woman", "man")
- Gender pronouns (he, she, his, her, him - use they/their/them instead if needed for clarity)
- Ethnicity or race (e.g., "Asian", "Black", "Caucasian", "Hispanic")
- Nationality or country of origin (e.g., "British", "Chinese", "Nigerian")
- Residency status (e.g., "UK resident", "international student", "home student")
- Native language or mother tongue (e.g., "English is my first language")
- Religion or philosophical beliefs
- Marital status (e.g., "married", "single", "parent")
- Disability or health conditions
- Sexual orientation
- Political opinions
- Phrases indicating protected characteristics (e.g., "as a mature student", "as a British applicant")

**3. Contact Information:**
- Email addresses
- Phone numbers (mobile and landline)
- Physical addresses (street, city, state, postal code)
- Social media handles or profiles
- Personal website URLs

**4. Educational Institutions (Names Only):**
- University names (e.g., "University of Lincoln")
- College or school names
- Department names when they identify an institution
- Location of institutions that could identify them
- Supervisor or advisor names
- Thesis or dissertation titles (full titles, but can mention topic area generally)
- Student ID numbers

**5. Employment Information:**
- Current or previous employer names
- Specific company names
- Work addresses or locations
- Colleague or supervisor names at workplaces
- Job titles that are overly specific or identifying

**6. Publication References (Identifying Details):**
- Article or paper titles (full titles)
- Author names on publications
- DOI numbers that link to named authors
- Conference presentation titles
- URLs or links to publications with author names
- Unique identifiers (ORCID, ResearcherID, Google Scholar profile)
- Co-author names

**EXCEPTION for Publications:** You MAY retain:
- Journal names (e.g., "Published in Nature Communications")
- Conference names (e.g., "Presented at ICML")
- Publication counts (e.g., "3 peer-reviewed publications")
- General topic descriptions (e.g., "Published work on machine learning for robotics")
- Impact factors or journal rankings if mentioned

**7. Awards and Recognition:**
- Specific award names that identify recipients (if searchable)
- Award ceremony locations or dates if identifying
- Names of award organizations if they could identify the applicant

**8. Dates (When Identifying):**
- Birth dates
- Specific graduation dates if they strongly imply age
- Dates of employment if they reveal age or identity

**9. Unique Identifiers:**
- National ID numbers
- Passport numbers
- Social Security Numbers
- Student ID numbers
- Application reference numbers (unless required for administrative purposes)

**CRITICAL: What to PRESERVE:**

The goal is to retain ALL information that demonstrates:

- **Academic qualifications and achievements** (without institution names)
  - Degree types and classifications (e.g., "BSc in Computer Science with First Class Honours")
  - Grades, GPA, marks (e.g., "Achieved 85% average")
  - Academic skills and knowledge areas
  - Research methodologies and techniques

- **Research experience and interests**
  - Research topics and areas of expertise
  - Methodologies and technical skills
  - Research questions and hypotheses
  - Theoretical frameworks
  - Data analysis capabilities

- **Technical skills and competencies**
  - Programming languages
  - Software and tools
  - Laboratory techniques
  - Analytical methods

- **Professional experience and achievements** (without employer names)
  - Types of roles and responsibilities (e.g., "Research assistant role")
  - Project descriptions and outcomes
  - Skills developed
  - Impact and contributions

- **Motivation and passion**
  - Interest in the research area
  - Career goals and aspirations
  - Reasons for applying
  - Alignment with the PhD position

- **Publications and outputs** (as described above)
  - Number and types of publications
  - Journal/conference names
  - General research areas

- **Academic achievements**
  - Scholarships (type, not name if identifying)
  - Awards (if not uniquely identifying)
  - Academic honors and recognitions

**Your Step-by-Step Process:**

1. Load the application: `load_pdf("{pdf_path}")`
2. Read through the ENTIRE document carefully
3. For each piece of identifying information found (according to categories above):
   - Use `redact_text("{pdf_path}", ["exact text to redact"])` for each item
   - Redact ALL instances across ALL pages
4. After redacting all identifying information, save: `save_redacted_pdf("{pdf_path}")`
5. Provide a comprehensive summary including:
   - Total number of redactions made
   - Categories of information redacted (with counts)
   - Key qualifications and strengths that were preserved
   - Any challenges or edge cases encountered

**Important Reminders:**
- Applications containing identifying data that cannot be fully redacted should be flagged
- Be thorough but efficient - every instance must be caught
- Maintain document readability and coherence
- Preserve the applicant's voice and enthusiasm where possible
- Focus on merit-based content: skills, achievements, qualifications, passion, and fit

**The Goal:** Create a fully anonymized application that allows reviewers to assess the applicant's suitability for the PhD position based purely on their academic merit, research potential, technical capabilities, and motivation - without any knowledge of their identity or protected characteristics.

Begin by loading and reviewing the application document."""
    return prompt_text


def main():
    """Main function to run the MCP server."""
    mcp.run()


# Entry point for running the server
if __name__ == "__main__":
    main()

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/marc-hanheide/redact_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.