# security.py
"""
Security validation and sanitization utilities with sandbox support.
"""
import re
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from config import BLOCKED_DOMAINS, BLOCKED_EXTENSIONS
class SecurityValidator:
    """Comprehensive security validation for web fetching.

    Bundles URL vetting, file-name sanitization with a sandbox containment
    check, HTML scrubbing and a keyword heuristic for prompt injection.
    Only the path helpers use instance state (``working_dir``); the other
    checks are static methods.
    """

    # Patterns that might indicate injection attempts.
    # NOTE(review): not referenced by any method of this class; presumably
    # consumed by callers elsewhere -- kept unchanged.
    INJECTION_PATTERNS = [
        r'<script[^>]*>.*?</script>',
        r'javascript:',
        r'on\w+\s*=',
        r'eval\s*\(',
        r'expression\s*\(',
        r'vbscript:',
        r'data:text/html',
    ]

    # Path traversal patterns (matched as lowercase substrings).
    PATH_TRAVERSAL = [
        '..',
        '%2e%2e',
        '..%2f',
        '%2e%2e%2f',
        '..\\',
        '%5c%2e%2e',
        # Mixed literal/percent-encoded dot pairs, which the entries above
        # do not cover.
        '.%2e',
        '%2e.',
    ]

    # Attributes whose value is a URL and may therefore carry a script scheme.
    _URL_ATTRIBUTES = frozenset(
        {'href', 'src', 'xlink:href', 'poster', 'background', 'data', 'cite'}
    )

    # URL prefixes treated as executable content (mirrors INJECTION_PATTERNS).
    _DANGEROUS_URL_PREFIXES = ('javascript:', 'vbscript:', 'data:text/html')

    # Whitespace and control characters, removed before inspecting a URL
    # scheme so that values such as "java\tscript:" are still recognized.
    _URL_NOISE_RE = re.compile(r'[\s\x00-\x1f]+')

    # Prompt-injection phrases, matched against lowercased text.  Words may
    # be separated by any run of whitespace.  "sudo" must not be embedded in
    # a longer word, so "pseudocode" and "sudoku" are not flagged.
    _PROMPT_INJECTION_RE = re.compile(
        r'ignore\s+previous'
        r'|ignore\s+all\s+previous'
        r'|disregard\s+previous'
        r'|forget\s+previous'
        r'|new\s+instructions'
        r'|system\s+prompt'
        r'|you\s+are\s+now'
        r'|your\s+new\s+role'
        r'|(?<![a-z])sudo(?![a-z])'
        r'|admin\s+mode'
        r'|developer\s+mode'
    )

    def __init__(self, working_dir: Path):
        """Initialize with sandboxed working directory.

        Args:
            working_dir: Root of the sandbox.  It is resolved to an absolute
                path once, here; a plain string is accepted as well.
        """
        self.working_dir = Path(working_dir).resolve()

    @staticmethod
    def is_safe_url(url: str) -> tuple[bool, str]:
        """Validate URL safety.

        Checks, in order: non-empty string, length (max 2048), http/https
        scheme, presence of a hostname, the ``BLOCKED_DOMAINS`` list,
        path-traversal markers, and ``BLOCKED_EXTENSIONS``.

        Args:
            url: The URL to check.

        Returns:
            ``(True, "")`` when every check passes, otherwise ``(False,
            reason)``.  Never raises: any exception during validation is
            reported as a failed check.
        """
        # The broad handler is deliberate: the validator fails closed.
        try:
            if not url or not isinstance(url, str):
                return False, "Invalid URL format"

            # Check length
            if len(url) > 2048:
                return False, "URL too long"

            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in ('http', 'https'):
                return False, f"Unsupported scheme: {parsed.scheme}"

            # Check hostname
            hostname = parsed.hostname
            if not hostname:
                return False, "Missing hostname"

            # Block local addresses
            for blocked in BLOCKED_DOMAINS:
                if hostname.startswith(blocked) or hostname == blocked.rstrip('.'):
                    return False, "Access to localhost/private networks blocked"

            # Check for path traversal.  Both the parsed path and the raw URL
            # are inspected because urlparse may drop characters (tabs,
            # newlines) that would otherwise split a marker in the raw text.
            path = parsed.path.lower()
            url_lower = url.lower()
            if any(marker in path or marker in url_lower
                   for marker in SecurityValidator.PATH_TRAVERSAL):
                return False, "Path traversal attempt detected"

            # Check file extension
            for ext in BLOCKED_EXTENSIONS:
                if path.endswith(ext):
                    return False, f"Blocked file type: {ext}"

            return True, ""
        except Exception as e:
            return False, f"URL validation error: {e}"

    def sanitize_path(self, path: str) -> Path:
        """Reduce an arbitrary path string to a single safe file name.

        Strips ``~`` and ``..``, keeps only the final path component, and
        replaces every character other than ASCII letters, digits, ``.``,
        ``_`` and ``-`` with ``_``.

        Args:
            path: The requested path or file name.

        Returns:
            A relative ``Path`` holding just the sanitized file name (not an
            absolute path).  If nothing remains after sanitization the result
            is ``Path('.')``.
        """
        # Strip '~' BEFORE '..': in the opposite order an input such as
        # '.~.' collapses into '..' after the traversal markers were
        # already removed.
        cleaned = path.replace('~', '').replace('..', '')

        # Get just the filename
        filename = Path(cleaned).name

        # Replace special characters except alphanumeric, dash, underscore, dot
        filename = re.sub(r'[^a-zA-Z0-9._-]', '_', filename)
        return Path(filename)

    def get_safe_path(self, relative_path: str,
                      base_dir: "Path | None" = None) -> tuple[Path, bool]:
        """Get a safe absolute path within the working directory.

        Args:
            relative_path: The requested path (can be relative or absolute);
                only its sanitized file name is used.
            base_dir: Base directory to use (defaults to working_dir).

        Returns:
            Tuple of (safe_path, is_safe) where safe_path is the resolved path
            and is_safe indicates if the path is within the sandbox.
        """
        if base_dir is None:
            base_dir = self.working_dir

        # Sanitize, then resolve so symlinks are followed before the check.
        target_path = (base_dir / self.sanitize_path(relative_path)).resolve()

        # Sandbox check: the resolved target must live under working_dir.
        return target_path, target_path.is_relative_to(self.working_dir)

    def get_relative_path(self, absolute_path: Path) -> str:
        """Get relative path from working directory.

        Useful for returning paths that other agents can use.

        Args:
            absolute_path: Path to express relative to the working directory.

        Returns:
            The path relative to ``working_dir`` as a string, or the input
            path unchanged (as a string) when it lies outside of it.
        """
        try:
            return str(absolute_path.relative_to(self.working_dir))
        except ValueError:
            # Path is outside working directory
            return str(absolute_path)

    @staticmethod
    def _has_dangerous_url(attr_name: str, value) -> bool:
        """Return True if a URL attribute carries a script-capable scheme."""
        if attr_name not in SecurityValidator._URL_ATTRIBUTES:
            return False
        if not isinstance(value, str):
            # Multi-valued attributes are lists; they are not URLs.
            return False
        normalized = SecurityValidator._URL_NOISE_RE.sub('', value).lower()
        return normalized.startswith(SecurityValidator._DANGEROUS_URL_PREFIXES)

    @staticmethod
    def sanitize_html_content(html: str) -> str:
        """Remove potentially dangerous HTML content.

        Drops script/style/iframe/object/embed/applet/meta/link/base
        elements, every attribute whose name starts with ``on``, the
        ``action``/``formaction`` attributes, and URL attributes whose value
        uses a ``javascript:``, ``vbscript:`` or ``data:text/html`` scheme.

        Args:
            html: Raw HTML markup.

        Returns:
            The sanitized markup serialized back to a string.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Remove dangerous elements
        for tag in soup(['script', 'style', 'iframe', 'object', 'embed',
                         'applet', 'meta', 'link', 'base']):
            tag.decompose()

        # Remove event handlers, form targets and script-scheme URLs.
        for tag in soup.find_all(True):
            # Iterate over a snapshot: attributes are deleted while looping.
            for attr in list(tag.attrs):
                if attr.startswith('on') or attr in ('formaction', 'action'):
                    del tag.attrs[attr]
                elif SecurityValidator._has_dangerous_url(attr, tag.attrs[attr]):
                    del tag.attrs[attr]

        return str(soup)

    @staticmethod
    def is_prompt_injection(text: str) -> bool:
        """Detect potential prompt injection attempts.

        A case-insensitive keyword heuristic.  Phrase words may be separated
        by any whitespace, and ``sudo`` only counts as a standalone word.

        Args:
            text: Text to inspect; empty or ``None`` is never flagged.

        Returns:
            True if any known injection phrase occurs in ``text``.
        """
        if not text:
            return False
        return SecurityValidator._PROMPT_INJECTION_RE.search(text.lower()) is not None