Semantic Search MCP Server

Overview Schema Related Servers Score Discussions

path_validator.py•15.4 KiB

"""
Path validation for LLM-generated paths.

Prevents hallucinated paths by validating against actual filesystem.
Provides helpful hints for LLM to fix incorrect paths.
"""

import fnmatch
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple


@dataclass
class PathValidationResult:
    """Rich result from path validation with hints for LLM."""

    path: str  # Original path, exactly as supplied by the caller
    is_valid: bool  # Whether the path resolves to an existing file
    fixed_path: Optional[str] = None  # Corrected path (if auto-fixable)
    is_directory: bool = False  # True if path is a directory (not a file)
    similar_paths: List[str] = field(default_factory=list)  # Similar existing paths
    files_in_dir: List[str] = field(default_factory=list)  # If directory, files inside
    error: Optional[str] = None  # Error description

    def get_hint_for_llm(self) -> str:
        """
        Generate a helpful hint string for LLM to fix the path.

        Returns:
            A single-line, space-joined message. Valid results describe the
            (possibly auto-corrected) path; directory results list up to five
            contained files; other invalid results list up to five similar
            paths or state that none were found.
        """
        if self.is_valid:
            if self.fixed_path and self.fixed_path != self.path:
                return f"Path '{self.path}' auto-corrected to '{self.fixed_path}'"
            return f"Path '{self.path}' is valid"

        hints = []

        if self.is_directory:
            # A directory exists; claiming it "does not exist" would send the
            # LLM looking for a different location instead of a file inside.
            shown = self.fixed_path or self.path
            hints.append(f"'{shown}' is a DIRECTORY, not a file.")
            if self.files_in_dir:
                hints.append(
                    f"Files in this directory: {', '.join(self.files_in_dir[:5])}"
                )
                if len(self.files_in_dir) > 5:
                    hints.append(f"... and {len(self.files_in_dir) - 5} more files")
            else:
                hints.append("No files found directly inside this directory.")
        else:
            hints.append(f"Path '{self.path}' does not exist.")

        if self.similar_paths:
            hints.append(f"Similar existing paths: {', '.join(self.similar_paths[:5])}")
        elif not self.is_directory and not self.files_in_dir:
            hints.append("No similar paths found in repository.")

        return " ".join(hints)


class PathValidator:
    """Validate and fix paths from LLM."""

    # Directories to skip when building cache. Entries containing glob
    # characters (*, ?, [) are matched as fnmatch patterns; all others are
    # compared by exact name.
    SKIP_DIRS = {
        '.git', 'node_modules', '__pycache__', 'venv', '.venv',
        'env', '.env', '.idea', '.vscode', '.mypy_cache', '__pypackages__',
        '.pytest_cache', '.tox', 'dist', 'build', 'htmlcov',
        '.coverage', 'eggs', '*.egg-info',
    }

    def __init__(self, repo_path: str, max_files: int = 50000):
        """
        Initialize validator.

        Args:
            repo_path: Path to repository root
            max_files: Maximum files to cache (for very large repos)
        """
        self.repo_path = Path(repo_path).resolve()
        self.max_files = max_files
        # Caches are built lazily on first use by _build_cache().
        self._file_cache: Optional[Set[str]] = None
        self._dir_cache: Optional[Set[str]] = None
        self._lower_to_actual: Optional[Dict[str, str]] = None

    def _is_skipped_dir(self, name: str) -> bool:
        """Return True if directory *name* must be excluded from the cache."""
        if name.startswith('.') or name in self.SKIP_DIRS:
            return True
        # Glob-style entries such as '*.egg-info' can never match by equality.
        return any(
            fnmatch.fnmatchcase(name, pattern)
            for pattern in self.SKIP_DIRS
            if any(ch in pattern for ch in '*?[')
        )

    def _build_cache(self) -> None:
        """
        Build cache of all files and directories.

        Populates the file set, the directory set and the lowercase -> actual
        path map with repo-relative, forward-slash paths. Does nothing if the
        cache already exists. Stops once ``max_files`` files were recorded.
        """
        if self._file_cache is not None:
            return

        self._file_cache = set()
        self._dir_cache = set()
        self._lower_to_actual = {}

        file_count = 0
        for root, dirs, files in os.walk(self.repo_path):
            # Prune in place so os.walk does not descend into ignored dirs;
            # sorting makes the traversal (and thus the max_files cut-off)
            # independent of filesystem enumeration order.
            dirs[:] = sorted(d for d in dirs if not self._is_skipped_dir(d))

            rel_root = os.path.relpath(root, self.repo_path)
            if rel_root != '.':
                normalized = rel_root.replace('\\', '/')
                self._dir_cache.add(normalized)
                self._lower_to_actual[normalized.lower()] = normalized

            for f in sorted(files):
                if file_count >= self.max_files:
                    return

                rel_path = os.path.join(rel_root, f).replace('\\', '/')
                if rel_path.startswith('./'):
                    rel_path = rel_path[2:]

                self._file_cache.add(rel_path)
                self._lower_to_actual[rel_path.lower()] = rel_path
                file_count += 1

    def normalize_path(self, path: str) -> str:
        """
        Normalize a path string.

        Handles:
        - Surrounding whitespace
        - Backslashes
        - Double slashes
        - Leading ./ and leading / (in any combination)
        - Trailing slashes (always removed for cache lookup)

        Returns:
            The repo-relative, forward-slash form, or "" for empty input.
        """
        if not path:
            return ""

        # Strip and unify separators first so the prefix checks below see the
        # canonical form (e.g. ' .\\src' must become 'src').
        path = path.strip().replace('\\', '/')

        # Remove double slashes
        while '//' in path:
            path = path.replace('//', '/')

        # Remove every leading './' or '/' (make relative)
        while True:
            if path.startswith('./'):
                path = path[2:]
            elif path.startswith('/'):
                path = path[1:]
            else:
                break

        # Remove trailing slash (for cache lookup consistency)
        return path.rstrip('/').strip()

    def exists(self, path: str) -> bool:
        """Check if path exists in repo (as a cached file or directory)."""
        self._build_cache()
        normalized = self.normalize_path(path)
        return normalized in self._file_cache or normalized in self._dir_cache

    def validate_path(self, path: str) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate a path and try to fix if invalid.

        Returns:
            Tuple of (is_valid, fixed_path, error_message)
            - is_valid: True if path exists (after potential fix)
            - fixed_path: The validated/fixed path, or None if completely invalid
            - error_message: Description of the issue, or None if valid
        """
        self._build_cache()

        normalized = self.normalize_path(path)
        if not normalized:
            return (False, None, "Empty path")

        # Check exact match
        if normalized in self._file_cache or normalized in self._dir_cache:
            return (True, normalized, None)

        # Try case-insensitive match
        actual = self._lower_to_actual.get(normalized.lower())
        if actual is not None:
            return (True, actual, f"Fixed case: {path} -> {actual}")

        # Try to find by filename only
        filename = normalized.split('/')[-1]
        matches = self._find_by_filename(filename)
        if len(matches) == 1:
            return (True, matches[0], f"Found at different location: {matches[0]}")
        elif len(matches) > 1:
            return (
                False,
                matches[0],
                f"Ambiguous: found {len(matches)} files named '{filename}'",
            )

        # Try fuzzy match (find similar paths)
        similar = self._find_similar(normalized)
        if similar:
            return (False, similar[0], f"Path not found. Similar: {similar[0]}")

        return (False, None, f"Path does not exist: {path}")

    def _find_by_filename(self, filename: str) -> List[str]:
        """
        Find all cached file paths with the given filename (case-insensitive).

        The result is sorted so callers that pick the first match behave the
        same on every run (the cache is an unordered set).
        """
        filename_lower = filename.lower()
        return sorted(
            cached
            for cached in self._file_cache
            if cached.split('/')[-1].lower() == filename_lower
        )

    def _find_similar(self, path: str, max_results: int = 3) -> List[str]:
        """
        Find similar paths using simple heuristics.

        Candidates are ranked: same filename elsewhere, then filename
        containment (only for query filenames of 5+ characters), then files
        under a same-named parent directory whose name is similar. Ties are
        broken alphabetically so the output is deterministic.
        """
        # Extract components
        parts = path.split('/')
        filename_lower = parts[-1].lower()
        parent_lower = parts[-2].lower() if len(parts) > 1 else ""

        ranked = []  # (priority, path) pairs; lower priority sorts first
        for cached in self._file_cache:
            cached_parts = cached.split('/')
            cached_filename = cached_parts[-1].lower()

            # Exact filename in different location
            if cached_filename == filename_lower:
                ranked.append((0, cached))
                continue

            # Partial match (contains)
            if len(filename_lower) >= 5 and (
                filename_lower in cached_filename
                or cached_filename in filename_lower
            ):
                ranked.append((1, cached))
                continue

            # Same parent directory
            if (
                parent_lower
                and len(cached_parts) > 1
                and cached_parts[-2].lower() == parent_lower
                and self._string_similarity(cached_filename, filename_lower) > 0.6
            ):
                ranked.append((2, cached))

        ranked.sort()
        return [candidate for _, candidate in ranked[:max_results]]

    @staticmethod
    def _string_similarity(a: str, b: str) -> float:
        """Simple string similarity (Jaccard on character sets), in [0.0, 1.0]."""
        if not a or not b:
            return 0.0

        set_a = set(a.lower())
        set_b = set(b.lower())

        intersection = len(set_a & set_b)
        union = len(set_a | set_b)

        return intersection / union if union > 0 else 0.0

    def is_directory(self, path: str) -> bool:
        """
        Check if path is a directory.

        Cached knowledge wins; for unknown paths a trailing slash on the raw
        input or a last component without a '.' is taken to mean "directory".
        """
        self._build_cache()
        normalized = self.normalize_path(path)

        # Check cache
        if normalized in self._dir_cache:
            return True

        # In file cache = not a directory
        if normalized in self._file_cache:
            return False

        # Explicit directory indicator. This must inspect the raw input:
        # normalize_path() always strips trailing slashes.
        if normalized and path.strip().replace('\\', '/').endswith('/'):
            return True

        # No file extension usually means directory
        last_part = normalized.split('/')[-1] if normalized else ""
        if last_part and '.' not in last_part:
            return True

        return False

    def validate_likely_files(
        self,
        paths: List[str],
    ) -> Tuple[List[str], List[str], List[str]]:
        """
        Validate a list of likely_files from LLM.

        Returns:
            Tuple of (valid_paths, fixed_paths, invalid_paths). Valid and
            fixed entries hold the normalized/corrected path; invalid entries
            hold the original input.
        """
        valid = []
        fixed = []
        invalid = []

        for path in paths:
            is_valid, fixed_path, error = self.validate_path(path)

            if is_valid and fixed_path:
                if error:  # Was fixed
                    fixed.append(fixed_path)
                else:
                    valid.append(fixed_path)
            else:
                invalid.append(path)

        return valid, fixed, invalid

    def get_files_in_directory(self, dir_path: str, max_files: int = 20) -> List[str]:
        """
        Get files in a directory path.

        Only direct children are returned, sorted and capped at ``max_files``.
        An empty path, "." or "/" lists the repository root.
        """
        self._build_cache()
        normalized = self.normalize_path(dir_path)
        # The root has no prefix; using '/' here would match nothing because
        # cached paths are relative.
        prefix = "" if normalized in ("", ".") else normalized + '/'

        files = [
            cached
            for cached in self._file_cache
            if cached.startswith(prefix) and '/' not in cached[len(prefix):]
        ]
        return sorted(files)[:max_files]

    def validate_path_rich(self, path: str) -> PathValidationResult:
        """
        Validate path and return rich result with hints for LLM.

        This is the preferred method - provides suggestions instead of just
        pass/fail. A path is only reported as a directory when it (or its
        case-corrected form) is a cached directory; a missing path without a
        file extension goes through the same fix-ups as any other path.
        """
        self._build_cache()
        normalized = self.normalize_path(path)

        if not normalized:
            return PathValidationResult(
                path=path,
                is_valid=False,
                error="Empty path"
            )

        # Exact match - valid
        if normalized in self._file_cache:
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=normalized,
                is_directory=False,
            )

        # It's a directory - provide files inside
        if normalized in self._dir_cache:
            files = self.get_files_in_directory(normalized, max_files=10)
            return PathValidationResult(
                path=path,
                is_valid=False,  # Invalid for read_file (it's a dir!)
                is_directory=True,
                files_in_dir=files,
                error=f"'{normalized}' is a directory, not a file"
            )

        # Try case-insensitive match
        actual = self._lower_to_actual.get(normalized.lower())
        if actual is not None:
            if actual in self._dir_cache:
                files = self.get_files_in_directory(actual, max_files=10)
                return PathValidationResult(
                    path=path,
                    is_valid=False,
                    fixed_path=actual,
                    is_directory=True,
                    files_in_dir=files,
                    error=f"'{actual}' is a directory (case-fixed from '{path}')"
                )
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=actual,
                is_directory=False,
                error=f"Case-fixed: {path} -> {actual}"
            )

        # Try to find by filename only
        filename = normalized.split('/')[-1]
        matches = self._find_by_filename(filename)

        if len(matches) == 1:
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=matches[0],
                is_directory=False,
                similar_paths=matches,
                error=f"Found at different location: {matches[0]}"
            )
        elif len(matches) > 1:
            return PathValidationResult(
                path=path,
                is_valid=False,
                similar_paths=matches[:5],
                error=f"Ambiguous: found {len(matches)} files named '{filename}'"
            )

        # Find similar paths as hints
        similar = self._find_similar(normalized, max_results=5)

        return PathValidationResult(
            path=path,
            is_valid=False,
            similar_paths=similar,
            error=f"Path does not exist: {path}"
        )

    def get_hints_for_paths(self, paths: List[str]) -> List[PathValidationResult]:
        """
        Validate multiple paths and return rich results.

        Useful for getting hints for all paths in a search plan.
        """
        return [self.validate_path_rich(p) for p in paths]

    def get_llm_hints_summary(self, invalid_results: List[PathValidationResult]) -> str:
        """
        Generate a summary of hints for LLM to fix multiple paths.

        Args:
            invalid_results: List of PathValidationResult with is_valid=False

        Returns:
            Formatted string for including in LLM prompt ("" if the list is
            empty)
        """
        if not invalid_results:
            return ""

        lines = ["The following paths need correction:"]

        for result in invalid_results:
            lines.append(f"\n- {result.get_hint_for_llm()}")

        lines.append("\nPlease provide corrected paths based on the hints above.")

        return "\n".join(lines)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mix0z/Semantic-Search-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

path_validator.py•15.4 KiB