"""Injection detector for prompt injection attempts in user speech.
Detects common OWASP-validated prompt injection patterns while
minimizing false positives on legitimate conversation.
"""
import re
from typing import List, Optional, Tuple
class InjectionDetector:
    """Detect prompt injection attempts in user speech.

    Matching is phrase-based (regular expressions over multi-word phrases
    or whole words) rather than substring-based on single short tokens, to
    keep false positives on ordinary conversation low. When an injection is
    found, the name of the matching pattern is returned so callers can log
    or debug it.

    Example:
        detector = InjectionDetector()
        if not detector.is_safe(user_text):
            pattern = detector.detect(user_text)
            print(f"Blocked injection: {pattern}")
    """

    # Warning message to speak when an injection is detected.
    WARNING_MESSAGE: str = "I detected a potentially harmful prompt and blocked it."

    # Injection patterns, checked in order; the first match wins.
    # Each tuple is (pattern_name, compiled_regex). All patterns are
    # case-insensitive and are applied with ``search`` (match anywhere).
    PATTERNS: List[Tuple[str, re.Pattern]] = [
        ("ignore_instructions", re.compile(
            r'ignore\s+(all\s+)?previous\s+instructions?',
            re.IGNORECASE,
        )),
        ("developer_mode", re.compile(
            r'you\s+are\s+now\s+(in\s+)?developer\s+mode',
            re.IGNORECASE,
        )),
        ("system_override", re.compile(
            r'system\s+override',
            re.IGNORECASE,
        )),
        ("reveal_prompt", re.compile(
            r'reveal\s+(your\s+)?(system\s+)?prompt',
            re.IGNORECASE,
        )),
        ("disregard", re.compile(
            r'disregard\s+(all\s+)?(previous\s+)?instructions?',
            re.IGNORECASE,
        )),
        ("bypass", re.compile(
            r'bypass\s+(safety|security|restrictions?)',
            re.IGNORECASE,
        )),
        # "DAN" must be a standalone word: without the \b anchors the
        # case-insensitive match fires on everyday words that merely
        # contain "dan" ("dance", "danger", "standard", "abundant").
        # NOTE: a standalone "Dan" (e.g. the given name) still matches
        # because matching is case-insensitive.
        # "jailbreak" is intentionally left unanchored so inflected forms
        # such as "jailbreaking" are still caught.
        ("jailbreak", re.compile(
            r'(\bDAN\b|do\s+anything\s+now|jailbreak)',
            re.IGNORECASE,
        )),
        ("new_instructions", re.compile(
            r'(forget|ignore)\s+(everything|all)\s+(and|then)\s+(follow|do)',
            re.IGNORECASE,
        )),
    ]

    def detect(self, text: Optional[str]) -> Optional[str]:
        """Detect prompt injection patterns in text.

        Args:
            text: The user input text to check. ``None``, empty, and
                whitespace-only input are treated as safe.

        Returns:
            The name of the first pattern in ``PATTERNS`` that matches,
            or None if no pattern matches.
        """
        # Nothing to scan: None, "" and whitespace-only input are safe.
        if not text or not text.strip():
            return None

        # PATTERNS is ordered; report the first pattern that matches.
        for pattern_name, pattern in self.PATTERNS:
            if pattern.search(text):
                return pattern_name
        return None

    def is_safe(self, text: Optional[str]) -> bool:
        """Check if text is safe (no injection detected).

        Args:
            text: The user input text to check.

        Returns:
            True if no pattern matches (including for None/empty input),
            False if an injection pattern is detected.
        """
        return self.detect(text) is None