Skip to main content
Glama

find-related-pdfs

Locate a PDF and identify related documents by analyzing content patterns and common substrings within a specified directory.

Instructions

Find a PDF and then search for related PDFs based on its content, including common substring patterns

Input Schema

Table | JSON Schema

| Name | Required | Description | Default |
| ---- | -------- | ----------- | ------- |
| base_path | Yes | Base directory to search in | — |
| target_filename | Yes | Name of the initial PDF to analyze | — |
| pattern_matching_only | No | Only search for repeating substring patterns | false |
| min_pattern_occurrences | No | Minimum times a pattern must appear to be considered significant | 2 |

Implementation Reference

  • Handler for 'find-related-pdfs' tool: finds target PDF, extracts text, derives search terms/patterns from content, searches for matching PDFs by filename patterns or substrings.
    elif name == "find-related-pdfs":
        base_path = arguments.get("base_path")
        target_filename = arguments.get("target_filename")
        
        if not base_path or not target_filename:
            raise ValueError("Missing required arguments")
    
        try:
            # First, find the target PDF
            target_pdf_path = None
            for root, _, files in os.walk(base_path):
                for filename in files:
                    if target_filename.lower() in filename.lower() and filename.lower().endswith('.pdf'):
                        target_pdf_path = os.path.join(root, filename)
                        break
                if target_pdf_path:
                    break
    
            if not target_pdf_path:
                return [types.TextContent(
                    type="text",
                    text=f"Could not find target PDF: {target_filename}"
                )]
    
            # Extract text from the target PDF
            reader = PdfReader(target_pdf_path)
            extracted_text = ""
            for page in reader.pages:
                extracted_text += page.extract_text() + "\n"
    
            # Process the extracted text
            search_terms = set()
            
            if arguments.get("pattern_matching_only", False):
                # Find common patterns that look like related filenames
                # Look for 2-3 letter prefixes followed by numbers and optional suffix
                pattern_regex = re.compile(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', re.IGNORECASE)
                potential_parts = pattern_regex.findall(extracted_text)
                
                # Count occurrences of each prefix
                prefix_counts = {}
                prefix_patterns = {}
                for part in potential_parts:
                    # Extract prefix (2-3 letters at start)
                    prefix_match = re.match(r'([A-Z]{2,3})', part, re.IGNORECASE)
                    if prefix_match:
                        prefix = prefix_match.group(1).upper()
                        if prefix not in prefix_counts:
                            prefix_counts[prefix] = 1
                            prefix_patterns[prefix] = set()
                        else:
                            prefix_counts[prefix] += 1
                        prefix_patterns[prefix].add(part)
                
                # Only keep prefixes that appear frequently enough
                min_occurrences = arguments.get("min_pattern_occurrences", 2)
                common_prefixes = {prefix for prefix, count in prefix_counts.items() 
                                if count >= min_occurrences}
                
                # Add patterns for common prefixes
                for prefix in common_prefixes:
                    search_terms.update(prefix_patterns[prefix])
                    # Also add the prefix itself to catch related files
                    search_terms.add(prefix)
            else:
                # Original word-based logic
                words = re.findall(r'\b\w+\b', extracted_text)
                search_terms.update(words)
                
                # Add pattern matching on top of word-based search
                pattern_regex = re.compile(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', re.IGNORECASE)
                potential_parts = pattern_regex.findall(extracted_text)
                search_terms.update(potential_parts)
    
            # Remove common words and very short terms
            common_words = {'THE', 'AND', 'OR', 'IN', 'ON', 'AT', 'TO', 'FOR', 'WITH', 'BY'}
            search_terms = {term for term in search_terms if 
                        len(term) > 2 and 
                        term.upper() not in common_words}
    
            # Search for PDFs matching any of the search terms
            found_files = set()
            for root, _, files in os.walk(base_path):
                for filename in files:
                    if filename.lower().endswith('.pdf'):
                        file_lower = filename.lower()
                        
                        # Extract potential matches from filename
                        file_parts = re.findall(r'([A-Z]{2,3}\d{3,7}(?:[-_][A-Z0-9]+)?)', filename, re.IGNORECASE)
                        
                        for term in search_terms:
                            term_lower = term.lower()
                            if (term_lower in file_lower or 
                                any(part.lower() == term_lower for part in file_parts)):
                                full_path = os.path.join(root, filename)
                                found_files.add((full_path, term))
                                break
    
            # Format the results
            if found_files:
                result_text = f"Extracted {len(search_terms)} search terms from {os.path.basename(target_pdf_path)}.\n"
                result_text += "Search terms found: " + ", ".join(sorted(search_terms)) + "\n\n"
                result_text += "Found related PDFs:\n"
                
                # Group files by matching term
                term_groups = {}
                for file_path, term in found_files:
                    if term not in term_groups:
                        term_groups[term] = []
                    term_groups[term].append(file_path)
    
                # Output grouped results
                for term in sorted(term_groups.keys()):
                    result_text += f"\nFiles matching '{term}':\n"
                    for file_path in sorted(term_groups[term]):
                        result_text += f"- {file_path}\n"
    
                return [types.TextContent(
                    type="text",
                    text=result_text
                )]
            else:
                return [types.TextContent(
                    type="text",
                    text=f"No related PDFs found for terms extracted from {os.path.basename(target_pdf_path)}"
                )]
    
        except Exception as e:
            return [types.TextContent(
                type="text",
                text=f"Error processing PDFs: {str(e)}"
            )]
  • Registration of the 'find-related-pdfs' tool in list_tools(), including name, description, and input schema definition.
    # Registration of the 'find-related-pdfs' tool: name, description, and
    # JSON input schema. The schema mirrors the handler's arguments:
    # base_path and target_filename are required; pattern_matching_only and
    # min_pattern_occurrences are optional tuning knobs.
    types.Tool(
        name="find-related-pdfs",
        description="Find a PDF and then search for related PDFs based on its content, including common substring patterns",
        inputSchema={
            "type": "object",
            "properties": {
                # Root directory that the recursive search walks.
                "base_path": {
                    "type": "string",
                    "description": "Base directory to search in"
                },
                # Matched case-insensitively as a substring of PDF filenames.
                "target_filename": {
                    "type": "string",
                    "description": "Name of the initial PDF to analyze"
                },
                # When true, only repeating substring patterns are used as
                # search terms (plain-word matching is skipped).
                "pattern_matching_only": {
                    "type": "boolean", 
                    "description": "Only search for repeating substring patterns",
                    "default": False
                },
                # Threshold below which a pattern prefix is discarded.
                "min_pattern_occurrences": {
                    "type": "integer",
                    "description": "Minimum times a pattern must appear to be considered significant",
                    "default": 2
                }
            },
            "required": ["base_path", "target_filename"]
        }
    )

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hanweg/mcp-pdf-tools'

If you have feedback or need assistance with the MCP directory API, please join our Discord server