
find-related-pdfs

Locate a target PDF and analyze its content, using repeating substring patterns or full-text analysis, to identify related documents within a specified directory.

Instructions

Find a PDF and then search for related PDFs based on its content, including common substring patterns

Input Schema

Name                     Required  Description                                                        Default
base_path                Yes       Base directory to search in                                        -
min_pattern_occurrences  No        Minimum times a pattern must appear to be considered significant   2
pattern_matching_only    No        Only search for repeating substring patterns                       False
target_filename          Yes       Name of the initial PDF to analyze                                 -

Implementation Reference

  • Core implementation of the 'find-related-pdfs' tool handler: locates the target PDF, extracts its text, derives search terms or filename-like patterns from the content (honoring pattern_matching_only and min_pattern_occurrences), walks base_path for PDFs whose filenames match, groups the results by matching term, and returns formatted text output.
    # Excerpt from the server's call_tool dispatcher; assumes os, re,
    # mcp.types as types, and a PDF reader class (e.g. pypdf's PdfReader)
    # are imported at module level.
    elif name == "find-related-pdfs":
        base_path = arguments.get("base_path")
        target_filename = arguments.get("target_filename")
        if not base_path or not target_filename:
            raise ValueError("Missing required arguments")
        try:
            # First, find the target PDF
            target_pdf_path = None
            for root, _, files in os.walk(base_path):
                for filename in files:
                    if target_filename.lower() in filename.lower() and filename.lower().endswith('.pdf'):
                        target_pdf_path = os.path.join(root, filename)
                        break
                if target_pdf_path:
                    break

            if not target_pdf_path:
                return [types.TextContent(
                    type="text",
                    text=f"Could not find target PDF: {target_filename}"
                )]

            # Extract text from the target PDF
            reader = PdfReader(target_pdf_path)
            extracted_text = ""
            for page in reader.pages:
                extracted_text += page.extract_text() + "\n"

            # Process the extracted text
            search_terms = set()
            if arguments.get("pattern_matching_only", False):
                # Find common patterns that look like related filenames:
                # 2-3 letter prefixes followed by numbers and an optional suffix
                pattern_regex = re.compile(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', re.IGNORECASE)
                potential_parts = pattern_regex.findall(extracted_text)

                # Count occurrences of each prefix
                prefix_counts = {}
                prefix_patterns = {}
                for part in potential_parts:
                    # Extract prefix (2-3 letters at start)
                    prefix_match = re.match(r'([A-Z]{2,3})', part, re.IGNORECASE)
                    if prefix_match:
                        prefix = prefix_match.group(1).upper()
                        if prefix not in prefix_counts:
                            prefix_counts[prefix] = 1
                            prefix_patterns[prefix] = set()
                        else:
                            prefix_counts[prefix] += 1
                        prefix_patterns[prefix].add(part)

                # Only keep prefixes that appear frequently enough
                min_occurrences = arguments.get("min_pattern_occurrences", 2)
                common_prefixes = {prefix for prefix, count in prefix_counts.items()
                                   if count >= min_occurrences}

                # Add patterns for common prefixes
                for prefix in common_prefixes:
                    search_terms.update(prefix_patterns[prefix])
                    # Also add the prefix itself to catch related files
                    search_terms.add(prefix)
            else:
                # Original word-based logic
                words = re.findall(r'\b\w+\b', extracted_text)
                search_terms.update(words)

                # Add pattern matching on top of word-based search
                pattern_regex = re.compile(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', re.IGNORECASE)
                potential_parts = pattern_regex.findall(extracted_text)
                search_terms.update(potential_parts)

            # Remove common words and very short terms
            common_words = {'THE', 'AND', 'OR', 'IN', 'ON', 'AT', 'TO', 'FOR', 'WITH', 'BY'}
            search_terms = {term for term in search_terms
                            if len(term) > 2 and term.upper() not in common_words}

            # Search for PDFs matching any of the search terms
            found_files = set()
            for root, _, files in os.walk(base_path):
                for filename in files:
                    if filename.lower().endswith('.pdf'):
                        file_lower = filename.lower()
                        # Extract potential matches from the filename
                        file_parts = re.findall(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)',
                                                filename, re.IGNORECASE)
                        for term in search_terms:
                            term_lower = term.lower()
                            if (term_lower in file_lower or
                                    any(part.lower() == term_lower for part in file_parts)):
                                full_path = os.path.join(root, filename)
                                found_files.add((full_path, term))
                                break

            # Format the results
            if found_files:
                result_text = f"Extracted {len(search_terms)} search terms from {os.path.basename(target_pdf_path)}.\n"
                result_text += "Search terms found: " + ", ".join(sorted(search_terms)) + "\n\n"
                result_text += "Found related PDFs:\n"

                # Group files by matching term
                term_groups = {}
                for file_path, term in found_files:
                    if term not in term_groups:
                        term_groups[term] = []
                    term_groups[term].append(file_path)

                # Output grouped results
                for term in sorted(term_groups.keys()):
                    result_text += f"\nFiles matching '{term}':\n"
                    for file_path in sorted(term_groups[term]):
                        result_text += f"- {file_path}\n"

                return [types.TextContent(
                    type="text",
                    text=result_text
                )]
            else:
                return [types.TextContent(
                    type="text",
                    text=f"No related PDFs found for terms extracted from {os.path.basename(target_pdf_path)}"
                )]
        except Exception as e:
            return [types.TextContent(
                type="text",
                text=f"Error processing PDFs: {str(e)}"
            )]
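  • Illustrative example (not from the repository): both branches of the handler lean on the same filename-pattern heuristic, a 2-3 letter prefix, 3-7 digits, and an optional -/_ suffix. The sketch below runs that exact regex over a made-up sample string to show what it extracts.
    import re

    # The handler's filename-pattern heuristic: 2-3 letters, 3-7 digits,
    # and an optional -/_ suffix. The sample text is hypothetical.
    pattern_regex = re.compile(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', re.IGNORECASE)

    sample = "See drawings AB1234 and AB1235-REV2, plus spec XYZ99001."
    print(pattern_regex.findall(sample))
    # -> ['AB1234', 'AB1235-REV2', 'XYZ99001']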
  • Input schema defining parameters for the tool: base_path (required), target_filename (required), pattern_matching_only (optional boolean), min_pattern_occurrences (optional integer).
    inputSchema={
        "type": "object",
        "properties": {
            "base_path": {
                "type": "string",
                "description": "Base directory to search in"
            },
            "target_filename": {
                "type": "string",
                "description": "Name of the initial PDF to analyze"
            },
            "pattern_matching_only": {
                "type": "boolean",
                "description": "Only search for repeating substring patterns",
                "default": False
            },
            "min_pattern_occurrences": {
                "type": "integer",
                "description": "Minimum times a pattern must appear to be considered significant",
                "default": 2
            }
        },
        "required": ["base_path", "target_filename"]
    }
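  • Illustrative example (not from the repository): an argument payload that satisfies this schema, checked here with the third-party jsonschema package purely for demonstration; the server itself performs no such validation, and the path and filename are hypothetical.
    from jsonschema import validate  # third-party; for illustration only

    input_schema = {
        "type": "object",
        "properties": {
            "base_path": {"type": "string"},
            "target_filename": {"type": "string"},
            "pattern_matching_only": {"type": "boolean", "default": False},
            "min_pattern_occurrences": {"type": "integer", "default": 2},
        },
        "required": ["base_path", "target_filename"],
    }

    arguments = {
        "base_path": "/data/projects",         # required; hypothetical path
        "target_filename": "AB1234_spec.pdf",  # required; matched case-insensitively
        "pattern_matching_only": True,         # optional, defaults to False
        "min_pattern_occurrences": 3,          # optional, defaults to 2
    }

    validate(instance=arguments, schema=input_schema)  # raises ValidationError if invalid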
  • Registration of the 'find-related-pdfs' tool in the server's list_tools method, including name, description, and full input schema.
    types.Tool(
        name="find-related-pdfs",
        description="Find a PDF and then search for related PDFs based on its content, including common substring patterns",
        inputSchema={
            "type": "object",
            "properties": {
                "base_path": {
                    "type": "string",
                    "description": "Base directory to search in"
                },
                "target_filename": {
                    "type": "string",
                    "description": "Name of the initial PDF to analyze"
                },
                "pattern_matching_only": {
                    "type": "boolean",
                    "description": "Only search for repeating substring patterns",
                    "default": False
                },
                "min_pattern_occurrences": {
                    "type": "integer",
                    "description": "Minimum times a pattern must appear to be considered significant",
                    "default": 2
                }
            },
            "required": ["base_path", "target_filename"]
        }
    )
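  • Illustrative example (not from the repository): a sketch of invoking the registered tool from a client over stdio with the official MCP Python SDK. The launch command, directory, and filename are assumptions; adjust them to your setup.
    import asyncio

    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    async def main() -> None:
        # How the server is launched is an assumption; adjust to your setup.
        params = StdioServerParameters(command="uv", args=["run", "mcp-pdf-tools"])
        async with stdio_client(params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                result = await session.call_tool(
                    "find-related-pdfs",
                    arguments={
                        "base_path": "/data/projects",
                        "target_filename": "AB1234_spec.pdf",
                    },
                )
                # The handler returns a list of TextContent blocks.
                print(result.content[0].text)

    asyncio.run(main())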


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hanweg/mcp-pdf-tools'
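
The same request in Python with the requests library; the endpoint is taken from the curl example above, and nothing is assumed about the response beyond it being JSON.

import requests

resp = requests.get("https://glama.ai/api/mcp/v1/servers/hanweg/mcp-pdf-tools")
resp.raise_for_status()
print(resp.json())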

If you have feedback or need assistance with the MCP directory API, please join our Discord server.