"""
Pattern validation for grep patterns.
Prevents too-broad patterns that match everything.
"""
import re
from typing import Optional, Dict, Any, List, Literal
from dataclasses import dataclass
@dataclass
class PatternValidation:
    """Outcome of checking a single grep pattern.

    Only ``pattern`` and ``is_valid`` are required; every other field has
    a default so a result can be built with just those two values.
    """

    # The pattern this result describes.
    pattern: str
    # False when the pattern was rejected.
    is_valid: bool
    # Human-readable reason for a rejection or warning, if there is one.
    issue: Optional[str] = None
    # A more specific replacement pattern, when one is known.
    suggested_pattern: Optional[str] = None
    # Coarse rating of how narrowly the pattern matches.
    specificity_score: Literal["high", "medium", "low", "too_broad"] = "medium"
    # Pattern to actually use
    transformed_pattern: Optional[str] = None
class PatternValidator:
    """Validate grep patterns and propose narrower alternatives.

    Every method is a classmethod or staticmethod; the class only carries
    the lookup tables defined below.
    """

    # Regexes applied to the (stripped) input pattern itself. A hit means the
    # input would match practically every line.
    FORBIDDEN_PATTERNS = [
        r"^\.\*$",  # Just .*
        r"^\.$",  # Just .
        r"^\.{1,2}\*?$",  # . or .. or .* or ..*
        r"^\.\+$",  # Just .+
        r"^\s*$",  # Empty or whitespace
        r"^\[.*\]\*$",  # [anything]*
        r"^\\w\+$",  # \w+
        r"^\\s\*$",  # \s*
    ]

    # Common terms that are too generic without context.
    # ONLY include real language keywords that will match thousands of lines.
    TOO_COMMON_TERMS = {
        # Python keywords (very short, common)
        "def", "if", "else", "for", "in", "is", "or", "and", "not",
        "try", "as", "del",
        # Go keywords (very short)
        "go", "var", "map", "nil",
        # Java/TS keywords (very short); "var" is already listed above
        "new", "let",
        # VERY generic single-word terms that match everything
        "get", "set", "run", "id", "key", "err", "ctx", "req", "res",
    }

    # Minimum pattern length for non-regex searches
    MIN_LITERAL_LENGTH = 4

    # Terms that should have context added, keyed by term and then by
    # lower-case language name ("default" is the fallback entry).
    NEEDS_CONTEXT_SUGGESTIONS = {
        "def": {
            "python": r"def\s+[a-z_][a-z0-9_]*\s*\(",
            "default": r"def\s+\w+\s*\(",
        },
        "class": {
            "python": r"class\s+[A-Z][a-zA-Z0-9_]*",
            "default": r"class\s+[A-Z]\w*",
        },
        "func": {
            "go": r"func\s+(?:\([^)]+\)\s+)?[A-Z][a-zA-Z0-9]*",
            "default": r"func\s+\w+",
        },
        "function": {
            "javascript": r"(?:async\s+)?function\s+[a-zA-Z_][a-zA-Z0-9_]*",
            "typescript": r"(?:async\s+)?function\s+[a-zA-Z_][a-zA-Z0-9_]*",
            "default": r"function\s+\w+",
        },
        "type": {
            "go": r"type\s+[A-Z][a-zA-Z0-9_]*\s+(?:struct|interface)",
            "typescript": r"type\s+[A-Z][a-zA-Z0-9_]*\s*=",
            "default": r"type\s+\w+",
        },
        "struct": {
            "go": r"type\s+\w+\s+struct\s*\{",
            "default": r"struct\s+\w+",
        },
        "interface": {
            "go": r"type\s+\w+\s+interface\s*\{",
            "typescript": r"interface\s+[A-Z][a-zA-Z0-9_]*",
            "java": r"interface\s+[A-Z][a-zA-Z0-9_]*",
            "default": r"interface\s+\w+",
        },
        "import": {
            "python": r"^from\s+\w+|^import\s+\w+",
            "go": r'^import\s+(?:\(|")',
            "default": r"import\s+\w+",
        },
        "return": {
            "default": r"return\s+[^;}\n]+",
        },
    }

    @classmethod
    def validate_pattern(
        cls,
        pattern: str,
        language: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
    ) -> "PatternValidation":
        """Validate a grep pattern and suggest improvements.

        Checks run in this order and the first one that applies decides
        the result: empty input, forbidden match-everything patterns,
        too-common terms, too-short literals, broad-pattern warnings,
        and finally a specificity estimate.

        Args:
            pattern: The grep pattern to validate. Surrounding whitespace
                is stripped before checking, and the stripped text is what
                the result's ``pattern`` field holds.
            language: Optional language hint (python, go, java, etc.) used
                to pick a language-specific suggestion.
            context: Optional dict; its ``"concepts"`` entry is used to
                build ``transformed_pattern`` for too-common terms.

        Returns:
            PatternValidation with ``is_valid=False`` when the pattern is
            empty, forbidden, too common or too short. Broad-looking
            patterns are still valid but carry an ``issue`` and a "low"
            score. ``suggested_pattern`` is filled in whenever the term has
            an entry in ``NEEDS_CONTEXT_SUGGESTIONS``, including on valid
            results.
        """
        if not pattern:
            return PatternValidation(
                pattern=pattern,
                is_valid=False,
                issue="Empty pattern",
                specificity_score="too_broad",
            )

        pattern = pattern.strip()

        # Check forbidden patterns (regex match against the pattern text)
        for forbidden in cls.FORBIDDEN_PATTERNS:
            if re.match(forbidden, pattern, re.IGNORECASE):
                return PatternValidation(
                    pattern=pattern,
                    is_valid=False,
                    issue="Pattern matches everything - too broad",
                    specificity_score="too_broad",
                )

        pattern_lower = pattern.lower()

        # Looked up once and attached to every result below, so terms such
        # as "class" or "import" (which are not in TOO_COMMON_TERMS) still
        # surface their narrower suggestion.
        suggested = cls._get_suggestion(pattern_lower, language)

        # Check too common terms (exact match, case-insensitive)
        if pattern_lower in cls.TOO_COMMON_TERMS:
            transformed = cls._transform_with_context(pattern, language, context)
            return PatternValidation(
                pattern=pattern,
                is_valid=False,
                issue=f"'{pattern}' is too common, will match thousands of lines",
                suggested_pattern=suggested,
                transformed_pattern=transformed,
                specificity_score="too_broad",
            )

        # Check minimum length for literal patterns (no regex chars)
        if not cls._is_regex(pattern) and len(pattern) < cls.MIN_LITERAL_LENGTH:
            return PatternValidation(
                pattern=pattern,
                is_valid=False,
                issue=f"Pattern too short ({len(pattern)} chars), likely too broad",
                suggested_pattern=suggested,
                specificity_score="low",
            )

        # Check for patterns that might be too broad
        broad_indicators = cls._check_broad_indicators(pattern)
        if broad_indicators:
            return PatternValidation(
                pattern=pattern,
                is_valid=True,  # Allow but warn
                issue=f"Pattern may be too broad: {broad_indicators}",
                suggested_pattern=suggested,
                specificity_score="low",
            )

        return PatternValidation(
            pattern=pattern,
            is_valid=True,
            suggested_pattern=suggested,
            specificity_score=cls._estimate_specificity(pattern),
        )

    @classmethod
    def _get_suggestion(cls, term: str, language: Optional[str]) -> Optional[str]:
        """Return the suggested narrower pattern for ``term``, if any.

        Args:
            term: Lower-case term to look up in NEEDS_CONTEXT_SUGGESTIONS.
            language: Optional language name; matched case-insensitively.

        Returns:
            The language-specific suggestion when one exists, otherwise
            the term's "default" suggestion, or None for unknown terms.
        """
        suggestions = cls.NEEDS_CONTEXT_SUGGESTIONS.get(term)
        if suggestions is None:
            return None
        if language:
            language_key = language.lower()
            if language_key in suggestions:
                return suggestions[language_key]
        return suggestions.get("default")

    @classmethod
    def _transform_with_context(
        cls,
        pattern: str,
        language: Optional[str],
        context: Optional[Dict[str, Any]],
    ) -> Optional[str]:
        """Combine a broad pattern with a concept taken from ``context``.

        The longest string in ``context["concepts"]`` is used as the
        concept. It is regex-escaped before being embedded, so
        metacharacters in it (``c++``, ``foo(bar``) are matched literally.

        Args:
            pattern: The broad pattern; inserted into the result as-is.
            language: Accepted for signature symmetry; not used here.
            context: Optional dict whose ``"concepts"`` entry is an
                iterable of strings, or a single string.

        Returns:
            An alternation matching the pattern and the concept on the
            same line in either order, or None when there is no context,
            no string concept, or the longest concept is under 4 chars.
        """
        if not context:
            return None

        concepts = context.get("concepts") or []
        if isinstance(concepts, str):
            # A bare string is one concept, not an iterable of characters.
            concepts = [concepts]
        candidates = [c for c in concepts if isinstance(c, str)]
        if not candidates:
            return None

        # Take most specific concept (longest one)
        concept = max(candidates, key=len)
        if len(concept) < 4:
            return None
        concept = re.escape(concept)

        if pattern.lower() in ("class", "def", "func", "function", "type", "struct"):
            # Definition + name containing concept
            return f"{pattern}.*{concept}|{concept}.*{pattern}"
        # General combination
        return f"{pattern}[^\\n]*{concept}|{concept}[^\\n]*{pattern}"

    @staticmethod
    def _is_regex(pattern: str) -> bool:
        """Return True if ``pattern`` contains any regex special character."""
        special_chars = r".*+?^${}[]|()\\"
        return any(c in pattern for c in special_chars)

    @staticmethod
    def _check_broad_indicators(pattern: str) -> Optional[str]:
        """Describe signs that ``pattern`` may match too much.

        Returns:
            A comma-separated list of the indicators found, or None when
            there are none.
        """
        issues = []

        # Starts with .* (matches any prefix)
        if pattern.startswith(".*"):
            issues.append("starts with .*")

        # Ends with .* (matches any suffix); a bare ".*" is not counted here
        if pattern.endswith(".*") and len(pattern) > 2:
            issues.append("ends with .*")

        # More than two .* in pattern
        if pattern.count(".*") > 2:
            issues.append("too many wildcards")

        # Only letters/underscores and fewer than 6 characters
        if re.match(r"^[a-z_]+$", pattern, re.IGNORECASE) and len(pattern) < 6:
            issues.append("short literal word")

        return ", ".join(issues) if issues else None

    @staticmethod
    def _estimate_specificity(pattern: str) -> Literal["high", "medium", "low"]:
        """Estimate how specific a pattern is from simple surface features.

        Points: length >= 15 gives 3, >= 10 gives 2, >= 6 gives 1; one
        point each for containing an uppercase letter, an underscore, and
        a space or lower-to-upper camelCase transition. A total of 4 or
        more is "high", 2 or 3 is "medium", anything less is "low".
        """
        length = len(pattern)
        if length >= 15:
            score = 3
        elif length >= 10:
            score = 2
        elif length >= 6:
            score = 1
        else:
            score = 0

        # Has uppercase (likely class/type name) = more specific
        if any(c.isupper() for c in pattern):
            score += 1
        # Has underscore (likely function/variable name) = more specific
        if "_" in pattern:
            score += 1
        # Multi-word patterns (spaces or camelCase transitions)
        if " " in pattern or re.search(r"[a-z][A-Z]", pattern):
            score += 1

        if score >= 4:
            return "high"
        if score >= 2:
            return "medium"
        return "low"

    @classmethod
    def get_safe_pattern(
        cls,
        pattern: str,
        language: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
        fallback_to_original: bool = True,
    ) -> str:
        """Get a safe pattern to use, with automatic transformation if needed.

        Args:
            pattern: Original pattern.
            language: Optional language hint.
            context: Optional context dict.
            fallback_to_original: If True, return the original pattern when
                validation fails and no replacement is available.

        Returns:
            The original pattern when it validates. Otherwise, in order of
            preference: the context-transformed pattern, the suggested
            pattern, the original (if ``fallback_to_original``), or the
            original with an unlikely suffix appended so it matches little.
        """
        validation = cls.validate_pattern(pattern, language, context)
        if validation.is_valid:
            return pattern

        # Try transformed pattern first
        if validation.transformed_pattern:
            return validation.transformed_pattern

        # Try suggested pattern
        if validation.suggested_pattern:
            return validation.suggested_pattern

        # Fallback
        if fallback_to_original:
            return pattern

        # Return something that won't match much
        return f"{pattern}_UNLIKELY_SUFFIX_12345"

    @classmethod
    def validate_batch(
        cls,
        patterns: List[str],
        language: Optional[str] = None,
        context: Optional[Dict[str, Any]] = None,
    ) -> "Dict[str, PatternValidation]":
        """Validate multiple patterns at once.

        Args:
            patterns: Patterns to validate.
            language: Optional language hint applied to every pattern.
            context: Optional context dict applied to every pattern.

        Returns:
            Dict mapping each input pattern (as given) to its validation
            result; duplicate patterns collapse to a single entry.
        """
        return {
            pattern: cls.validate_pattern(pattern, language, context)
            for pattern in patterns
        }