search_text
Locate specific text patterns within PDF files using regular expressions. Specify search range, case sensitivity, and retrieve matches with page numbers and context for precise results.
Instructions
Search for text pattern in a PDF file
Args:
pdf_path: Path to the PDF file
pattern: Regular expression pattern to search for
case_sensitive: Whether to perform case-sensitive matching
start_page: Page number to start search (0-indexed). If None, starts from first page.
end_page: Page number to end search (0-indexed, inclusive). If None, searches all pages.
Returns:
List of matches with page number, match text, and context
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
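| case_sensitive | No | Whether to perform case-sensitive matching | False |
| end_page | No | Page number to end search (0-indexed, inclusive). If None, searches all pages. | None |
| pattern | Yes | Regular expression pattern to search for | |
| pdf_path | Yes | Path to the PDF file | |
| start_page | No | Page number to start search (0-indexed). If None, starts from first page. | None |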
Implementation Reference
# mcp_pdf_forms/server.py:306-386 (handler)
# The @mcp.tool()-decorated function that implements the search_text tool. This
# serves as the handler executing the PDF text search logic using PyMuPDF (fitz)
# and regular expressions, and provides matches with context. The decorator
# handles registration and schema inference from type annotations.
@mcp.tool()
def search_text(pdf_path: str, pattern: str, case_sensitive: bool = False,
                start_page: Optional[int] = None, end_page: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Search for text pattern in a PDF file

    Args:
        pdf_path: Path to the PDF file
        pattern: Regular expression pattern to search for
        case_sensitive: Whether to perform case-sensitive matching
        start_page: Page number to start search (0-indexed). If None, starts from first page.
        end_page: Page number to end search (0-indexed, inclusive). If None, searches all pages.

    Returns:
        List of matches with page number, match text, and context. Each entry
        is a dict with the keys "page" (0-indexed page number), "match" (the
        matched text), "context" (the match with up to 50 characters of
        surrounding page text on each side, wrapped in "...") and "position"
        (a dict with the "start" and "end" character offsets of the match
        within the extracted page text).

    Raises:
        Exception: If the file cannot be opened, a page number is out of
            range, the pattern is not a valid regular expression, or text
            extraction fails. The original error is attached as the cause.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)

            # Validate page parameters
            if start_page is not None and (start_page < 0 or start_page >= total_pages):
                raise ValueError(f"Start page {start_page} is out of range (0-{total_pages-1})")

            if end_page is not None and (end_page < 0 or end_page >= total_pages):
                raise ValueError(f"End page {end_page} is out of range (0-{total_pages-1})")

            # Set defaults if parameters are None
            if start_page is None:
                start_page = 0
            if end_page is None:
                end_page = total_pages - 1

            # A reversed range is treated as the same range in ascending order
            if start_page > end_page:
                start_page, end_page = end_page, start_page

            # Compile regex pattern
            flags = 0 if case_sensitive else re.IGNORECASE
            regex = re.compile(pattern, flags)

            # List to store matches
            matches = []

            # With no pages the default end_page above is -1, so there is
            # no valid page index to visit; report no matches instead.
            if total_pages == 0:
                return matches

            # Character context window
            context_size = 50

            # Search pages
            for page_num in range(start_page, end_page + 1):
                text = doc[page_num].get_text()

                # Find all matches in the page text
                for match in regex.finditer(text):
                    start_pos = match.start()
                    end_pos = match.end()
                    match_text = match.group()

                    # Clamp the context window to the bounds of the page text
                    context_start = max(0, start_pos - context_size)
                    context_end = min(len(text), end_pos + context_size)

                    # Get text before and after match
                    before = text[context_start:start_pos]
                    after = text[end_pos:context_end]

                    # Add match information to results
                    matches.append({
                        "page": page_num,
                        "match": match_text,
                        "context": f"...{before}{match_text}{after}...",
                        "position": {
                            "start": start_pos,
                            "end": end_pos,
                        },
                    })

            return matches
        finally:
            # Close the document on every exit path, including validation
            # and regex errors, so the file handle is never leaked.
            doc.close()
    except Exception as e:
        # Keep the generic exception type callers already see, but chain the
        # original error so its traceback is preserved.
        raise Exception(f"Error searching text: {str(e)}") from e