"""
Path validation for LLM-generated paths.
Prevents hallucinated paths by validating against actual filesystem.
Provides helpful hints for LLM to fix incorrect paths.
"""
import fnmatch
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, List, Tuple, Set
@dataclass
class PathValidationResult:
    """Rich result from path validation with hints for LLM.

    Carries the outcome of validating one path plus whatever extra context
    (corrected path, directory listing, look-alike paths) can help the LLM
    produce a correct path on its next attempt.
    """

    path: str  # Original path, exactly as supplied
    is_valid: bool  # Whether the path was accepted (possibly after a fix)
    fixed_path: Optional[str] = None  # Corrected path (if auto-fixable)
    is_directory: bool = False  # True if path is a directory (not a file)
    similar_paths: List[str] = field(default_factory=list)  # Similar existing paths
    files_in_dir: List[str] = field(default_factory=list)  # If directory, files inside
    error: Optional[str] = None  # Error description

    def get_hint_for_llm(self) -> str:
        """Generate a helpful hint string for LLM to fix the path.

        Returns:
            A single-line, space-joined message. For valid results it states
            that the path is valid or what it was auto-corrected to. For
            invalid results it says either that the path is a directory
            (listing up to five of its files) or that it does not exist,
            followed by up to five similar paths when any are known.
        """
        if self.is_valid:
            if self.fixed_path and self.fixed_path != self.path:
                return f"Path '{self.path}' auto-corrected to '{self.fixed_path}'"
            return f"Path '{self.path}' is valid"

        max_shown = 5
        hints: List[str] = []

        if self.is_directory:
            # A directory result must not also be described as "does not
            # exist": the two statements contradict each other.
            hints.append(f"'{self.path}' is a DIRECTORY, not a file.")
            if self.fixed_path and self.fixed_path != self.path:
                hints.append(f"Its actual path is '{self.fixed_path}'.")
            if self.files_in_dir:
                hints.append(
                    f"Files in this directory: {', '.join(self.files_in_dir[:max_shown])}"
                )
                hidden = len(self.files_in_dir) - max_shown
                if hidden > 0:
                    hints.append(f"... and {hidden} more files")
            else:
                hints.append("No files found directly inside this directory.")
        else:
            hints.append(f"Path '{self.path}' does not exist.")

        if self.similar_paths:
            hints.append(
                f"Similar existing paths: {', '.join(self.similar_paths[:max_shown])}"
            )
        elif not self.is_directory and not self.files_in_dir:
            hints.append("No similar paths found in repository.")

        return " ".join(hints)
class PathValidator:
    """Validate and fix paths from LLM.

    The repository tree is walked once, lazily, on first use. All lookups
    afterwards run against in-memory sets of repo-relative, forward-slash
    paths. Hidden directories (names starting with '.') and the names or
    glob patterns in SKIP_DIRS are not indexed, so paths inside them are
    reported as non-existent.
    """

    # Directories to skip when building cache. Entries containing glob
    # metacharacters (e.g. '*.egg-info') are matched as fnmatch patterns.
    SKIP_DIRS = {
        '.git', 'node_modules', '__pycache__', 'venv', '.venv',
        'env', '.env', '.idea', '.vscode', '.mypy_cache',
        '__pypackages__', '.pytest_cache', '.tox', 'dist', 'build',
        'htmlcov', '.coverage', 'eggs', '*.egg-info',
    }

    # Shortest name length for which a substring relation counts as "similar";
    # below this, almost every filename would match.
    _MIN_PARTIAL_LEN = 5

    def __init__(self, repo_path: str, max_files: int = 50000):
        """
        Initialize validator.

        Args:
            repo_path: Path to repository root
            max_files: Maximum files to cache (for very large repos)
        """
        self.repo_path = Path(repo_path).resolve()
        self.max_files = max_files
        # All three caches stay None until _build_cache() has run.
        self._file_cache: Optional[Set[str]] = None
        self._dir_cache: Optional[Set[str]] = None
        self._lower_to_actual: Optional[dict] = None

    def _build_cache(self) -> None:
        """Build cache of all files and directories (no-op once built).

        Populates the file set, the directory set and a lowercase -> actual
        path map. Indexing stops once max_files files have been recorded.
        """
        if self._file_cache is not None:
            return

        # Build into locals and publish at the end, so an exception raised
        # mid-walk cannot leave a half-filled cache that looks complete.
        file_cache: Set[str] = set()
        dir_cache: Set[str] = set()
        lower_to_actual: dict = {}

        glob_patterns = [
            pat for pat in self.SKIP_DIRS if any(ch in pat for ch in '*?[')
        ]

        def should_skip(dirname: str) -> bool:
            if dirname.startswith('.') or dirname in self.SKIP_DIRS:
                return True
            return any(fnmatch.fnmatchcase(dirname, pat) for pat in glob_patterns)

        file_count = 0
        limit_reached = False
        for root, dirs, files in os.walk(self.repo_path):
            # Prune in place so os.walk does not descend into skipped dirs;
            # sorting keeps traversal (and max_files truncation) deterministic.
            dirs[:] = sorted(d for d in dirs if not should_skip(d))

            rel_root = os.path.relpath(root, self.repo_path).replace('\\', '/')
            if rel_root == '.':
                prefix = ''
            else:
                dir_cache.add(rel_root)
                lower_to_actual[rel_root.lower()] = rel_root
                prefix = rel_root + '/'

            for name in sorted(files):
                if file_count >= self.max_files:
                    limit_reached = True
                    break
                rel_path = (prefix + name).replace('\\', '/')
                file_cache.add(rel_path)
                lower_to_actual[rel_path.lower()] = rel_path
                file_count += 1
            if limit_reached:
                break

        self._file_cache = file_cache
        self._dir_cache = dir_cache
        self._lower_to_actual = lower_to_actual

    def normalize_path(self, path: str) -> str:
        """
        Normalize a path string to the repo-relative form used by the cache.

        Handles:
        - Surrounding whitespace
        - Backslashes (converted to forward slashes)
        - Leading ./ and leading /
        - Double slashes and '.' segments
        - Trailing slashes (always removed for cache lookup)

        '..' segments are left untouched.

        Returns:
            The normalized path, or "" for empty or whitespace-only input.
        """
        if not path:
            return ""
        # Convert separators before anything else so '.\\src' is treated
        # the same as './src'.
        unified = path.strip().replace('\\', '/')
        # Dropping empty and '.' segments removes leading '/', leading './',
        # doubled slashes and trailing slashes in one pass.
        segments = [seg for seg in unified.split('/') if seg and seg != '.']
        return '/'.join(segments)

    def exists(self, path: str) -> bool:
        """Check if path exists in repo (as an indexed file or directory)."""
        self._build_cache()
        normalized = self.normalize_path(path)
        return normalized in self._file_cache or normalized in self._dir_cache

    def validate_path(self, path: str) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Validate a path and try to fix if invalid.

        Resolution order: exact match, case-insensitive match, unique
        filename match anywhere in the repo, then similar-path suggestion.

        Returns:
            Tuple of (is_valid, fixed_path, error_message)
            - is_valid: True if path exists (after potential fix)
            - fixed_path: The validated/fixed path (or best suggestion),
              or None if nothing comparable was found
            - error_message: Description of the issue or of the fix applied,
              or None if the path was valid as given
        """
        self._build_cache()
        normalized = self.normalize_path(path)
        if not normalized:
            return (False, None, "Empty path")

        # Check exact match
        if normalized in self._file_cache or normalized in self._dir_cache:
            return (True, normalized, None)

        # Try case-insensitive match
        actual = self._lower_to_actual.get(normalized.lower())
        if actual is not None:
            return (True, actual, f"Fixed case: {path} -> {actual}")

        # Try to find by filename only
        filename = normalized.split('/')[-1]
        matches = self._find_by_filename(filename)
        if len(matches) == 1:
            return (True, matches[0], f"Found at different location: {matches[0]}")
        if len(matches) > 1:
            return (False, matches[0], f"Ambiguous: found {len(matches)} files named '{filename}'")

        # Try fuzzy match (find similar paths)
        similar = self._find_similar(normalized)
        if similar:
            return (False, similar[0], f"Path not found. Similar: {similar[0]}")
        return (False, None, f"Path does not exist: {path}")

    def _find_by_filename(self, filename: str) -> List[str]:
        """Find all cached file paths with the given filename (case-insensitive).

        Results are sorted so that callers picking the first entry behave
        the same on every run.
        """
        self._build_cache()
        wanted = filename.lower()
        return sorted(
            cached for cached in self._file_cache
            if cached.rsplit('/', 1)[-1].lower() == wanted
        )

    def _find_similar(self, path: str, max_results: int = 3) -> List[str]:
        """Find similar paths using simple heuristics.

        Candidates are ranked in three tiers: same filename elsewhere, then
        filenames where one contains the other, then files in a same-named
        parent directory whose name shares most characters with the target.
        Each tier is sorted, so output is deterministic.

        Args:
            path: Normalized path that was not found.
            max_results: Maximum number of suggestions to return.
        """
        self._build_cache()
        parts = path.split('/')
        filename_lower = parts[-1].lower()
        parent_lower = parts[-2].lower() if len(parts) > 1 else ""

        exact: List[str] = []
        partial: List[str] = []
        siblings: List[str] = []

        for cached in self._file_cache:
            cached_parts = cached.split('/')
            cached_filename = cached_parts[-1].lower()

            # Exact filename in different location
            if cached_filename == filename_lower:
                exact.append(cached)
                continue

            # Partial match (contains). Both names must be reasonably long,
            # otherwise a one-letter filename would match everything.
            shorter = min(len(filename_lower), len(cached_filename))
            if shorter >= self._MIN_PARTIAL_LEN and (
                filename_lower in cached_filename or cached_filename in filename_lower
            ):
                partial.append(cached)
                continue

            # Same parent directory
            if (
                parent_lower
                and len(cached_parts) > 1
                and cached_parts[-2].lower() == parent_lower
                and self._string_similarity(cached_filename, filename_lower) > 0.6
            ):
                siblings.append(cached)

        ranked = sorted(exact) + sorted(partial) + sorted(siblings)
        return ranked[:max_results]

    @staticmethod
    def _string_similarity(a: str, b: str) -> float:
        """Simple string similarity (Jaccard on character sets).

        Returns a value in [0.0, 1.0]; 0.0 if either string is empty.
        """
        if not a or not b:
            return 0.0
        chars_a = set(a.lower())
        chars_b = set(b.lower())
        union = chars_a | chars_b
        return len(chars_a & chars_b) / len(union) if union else 0.0

    def is_directory(self, path: str) -> bool:
        """Check if path is a directory.

        Indexed paths are answered from the cache. For unknown paths the
        answer is a guess: True when the path was written with a trailing
        slash, or when its last component has no '.' in it.
        """
        self._build_cache()
        normalized = self.normalize_path(path)

        # Check cache first: real filesystem knowledge beats any heuristic.
        if normalized in self._dir_cache:
            return True
        if normalized in self._file_cache:
            return False

        # Explicit directory indicator. This must be read from the raw
        # input, because normalization strips trailing slashes.
        if path and path.strip().replace('\\', '/').endswith('/'):
            return True

        # No file extension usually means directory
        last_part = normalized.split('/')[-1] if normalized else ""
        return bool(last_part) and '.' not in last_part

    def validate_likely_files(
        self,
        paths: List[str],
    ) -> Tuple[List[str], List[str], List[str]]:
        """
        Validate a list of likely_files from LLM.

        Returns:
            Tuple of (valid_paths, fixed_paths, invalid_paths)
            - valid_paths: normalized paths that existed as given
            - fixed_paths: corrected paths for inputs that were auto-fixed
            - invalid_paths: original inputs that could not be resolved
        """
        valid: List[str] = []
        fixed: List[str] = []
        invalid: List[str] = []
        for path in paths:
            is_valid, fixed_path, error = self.validate_path(path)
            if not (is_valid and fixed_path):
                invalid.append(path)
            elif error:
                # A message on a valid result means the path was fixed.
                fixed.append(fixed_path)
            else:
                valid.append(fixed_path)
        return valid, fixed, invalid

    def get_files_in_directory(self, dir_path: str, max_files: int = 20) -> List[str]:
        """Get files that are direct children of a directory path.

        Args:
            dir_path: Directory path; anything that normalizes to ""
                (e.g. "", ".", "/") means the repository root.
            max_files: Maximum number of paths to return.

        Returns:
            Sorted repo-relative file paths, at most max_files of them.
        """
        self._build_cache()
        normalized = self.normalize_path(dir_path)
        # An empty prefix selects root-level files; '/' would match nothing.
        prefix = normalized + '/' if normalized else ''
        children = [
            cached for cached in self._file_cache
            if cached.startswith(prefix) and '/' not in cached[len(prefix):]
        ]
        return sorted(children)[:max_files]

    def validate_path_rich(self, path: str) -> "PathValidationResult":
        """
        Validate path and return rich result with hints for LLM.

        This is the preferred method - provides suggestions instead of just pass/fail.

        A path is only reported as a directory when it is an indexed
        directory (exactly or after case-fixing). Unknown extensionless
        paths go through the normal case-fix, filename and similarity
        lookups instead of being assumed to be directories.
        """
        self._build_cache()
        normalized = self.normalize_path(path)
        if not normalized:
            return PathValidationResult(
                path=path,
                is_valid=False,
                error="Empty path",
            )

        # Exact match - valid
        if normalized in self._file_cache:
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=normalized,
                is_directory=False,
            )

        # It's a directory - provide files inside
        if normalized in self._dir_cache:
            return PathValidationResult(
                path=path,
                is_valid=False,  # Invalid for read_file (it's a dir!)
                is_directory=True,
                files_in_dir=self.get_files_in_directory(normalized, max_files=10),
                error=f"'{normalized}' is a directory, not a file",
            )

        # Try case-insensitive match
        actual = self._lower_to_actual.get(normalized.lower())
        if actual is not None:
            if actual in self._dir_cache:
                return PathValidationResult(
                    path=path,
                    is_valid=False,
                    fixed_path=actual,
                    is_directory=True,
                    files_in_dir=self.get_files_in_directory(actual, max_files=10),
                    error=f"'{actual}' is a directory (case-fixed from '{path}')",
                )
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=actual,
                is_directory=False,
                error=f"Case-fixed: {path} -> {actual}",
            )

        # Try to find by filename only
        filename = normalized.split('/')[-1]
        matches = self._find_by_filename(filename)
        if len(matches) == 1:
            return PathValidationResult(
                path=path,
                is_valid=True,
                fixed_path=matches[0],
                is_directory=False,
                similar_paths=matches,
                error=f"Found at different location: {matches[0]}",
            )
        if len(matches) > 1:
            return PathValidationResult(
                path=path,
                is_valid=False,
                similar_paths=matches[:5],
                error=f"Ambiguous: found {len(matches)} files named '{filename}'",
            )

        # Find similar paths as hints
        return PathValidationResult(
            path=path,
            is_valid=False,
            similar_paths=self._find_similar(normalized, max_results=5),
            error=f"Path does not exist: {path}",
        )

    def get_hints_for_paths(self, paths: List[str]) -> List["PathValidationResult"]:
        """
        Validate multiple paths and return rich results.

        Useful for getting hints for all paths in a search plan.
        Results are returned in the same order as the input paths.
        """
        return [self.validate_path_rich(p) for p in paths]

    def get_llm_hints_summary(self, invalid_results: List["PathValidationResult"]) -> str:
        """
        Generate a summary of hints for LLM to fix multiple paths.

        Args:
            invalid_results: List of PathValidationResult with is_valid=False

        Returns:
            Formatted string for including in LLM prompt, or "" when the
            list is empty
        """
        if not invalid_results:
            return ""
        lines = ["The following paths need correction:"]
        lines.extend(f"\n- {result.get_hint_for_llm()}" for result in invalid_results)
        lines.append("\nPlease provide corrected paths based on the hints above.")
        return "\n".join(lines)