Skip to main content
Glama
file_validator.py14.4 kB
""" Robust markdown file validation for sync operations. Prevents crashes from: - Weird filenames (unicode, special chars, too long) - Zero-size files - Unreadable contents (encoding issues, binary) - Broken frontmatter (malformed YAML) - Corrupted files - Permission issues Usage: validator = FileValidator() result = validator.validate_file(file_path) if result.is_valid: # Process file safely content = result.content else: # Log error and skip logger.warning("invalid_file", errors=result.errors) """ import os from dataclasses import dataclass, field from pathlib import Path from typing import Optional, List, Dict, Any import logging try: import yaml HAS_YAML = True except ImportError: HAS_YAML = False logger = logging.getLogger(__name__) @dataclass class ValidationResult: """Result of file validation.""" is_valid: bool file_path: str errors: List[str] = field(default_factory=list) warnings: List[str] = field(default_factory=list) content: Optional[str] = None frontmatter: Optional[Dict[str, Any]] = None encoding: str = "utf-8" size_bytes: int = 0 def add_error(self, error: str): """Add validation error.""" self.errors.append(error) self.is_valid = False def add_warning(self, warning: str): """Add validation warning (doesn't invalidate).""" self.warnings.append(warning) class FileValidator: """ Robust file validator for markdown files. Validates: - Filename safety - File readability - Content encoding - Frontmatter syntax - File size constraints """ # Maximum file size (10 MB default) MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum filename length (Windows limit is 260, but be conservative) MAX_FILENAME_LENGTH = 200 # Minimum file size (0 bytes is suspicious) MIN_FILE_SIZE = 0 # Allow empty files but warn # Reserved Windows filenames WINDOWS_RESERVED = { "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", } # Dangerous filename characters (beyond OS restrictions) DANGEROUS_CHARS = set('<>:"|?*\x00') # Encodings to try in order ENCODINGS = ["utf-8", "utf-8-sig", "latin-1", "cp1252", "iso-8859-1"] def __init__( self, max_file_size: int = MAX_FILE_SIZE, allow_empty: bool = True, strict_frontmatter: bool = False, ): """ Initialize validator. Args: max_file_size: Maximum file size in bytes allow_empty: Allow empty files (will warn but not error) strict_frontmatter: Require valid frontmatter """ self.max_file_size = max_file_size self.allow_empty = allow_empty self.strict_frontmatter = strict_frontmatter def validate_file(self, file_path: str | Path) -> ValidationResult: """ Validate a markdown file completely. Args: file_path: Path to file to validate Returns: ValidationResult with details """ if file_path is None: return ValidationResult(is_valid=False, file_path="", errors=["File path cannot be None"]) file_path = Path(file_path) result = ValidationResult(is_valid=True, file_path=str(file_path)) # Check file exists if not file_path.exists(): result.add_error(f"File does not exist: {file_path}") return result # Check it's a file (not directory) if not file_path.is_file(): result.add_error(f"Not a file: {file_path}") return result # Validate filename self._validate_filename(file_path, result) # Validate file accessibility self._validate_accessibility(file_path, result) # Validate file size self._validate_size(file_path, result) # If basic checks failed, don't try to read content if not result.is_valid: return result # Try to read content self._validate_content(file_path, result) # If content read successfully, validate frontmatter if result.content is not None: self._validate_frontmatter(result) return result def _validate_filename(self, file_path: Path, result: ValidationResult): """Validate filename is safe.""" filename = file_path.name # Check length if len(filename) > self.MAX_FILENAME_LENGTH: result.add_error( f"Filename too long ({len(filename)} > {self.MAX_FILENAME_LENGTH}): {filename}" ) # Check for dangerous characters dangerous = self.DANGEROUS_CHARS & set(filename) if dangerous: result.add_error( f"Dangerous characters in filename: {dangerous} in {filename}" ) # Check for Windows reserved names name_without_ext = file_path.stem.upper() if name_without_ext in self.WINDOWS_RESERVED: result.add_error(f"Reserved Windows filename: {name_without_ext}") # Check for control characters if any(ord(c) < 32 for c in filename): result.add_error(f"Control characters in filename: {filename}") # Warn about unicode characters (not an error, but worth noting) if not filename.isascii(): result.add_warning(f"Non-ASCII characters in filename: {filename}") # Warn about spaces or special chars that might cause issues if " " in filename: result.add_warning( f"Spaces in filename (consider using underscores): {filename}" ) # Check extension if file_path.suffix.lower() not in [".md", ".markdown"]: result.add_warning( f"Unexpected extension: {file_path.suffix} (expected .md or .markdown)" ) def _validate_accessibility(self, file_path: Path, result: ValidationResult): """Check if file can be accessed.""" try: # Try to stat the file stats = file_path.stat() result.size_bytes = stats.st_size # Check read permissions if not os.access(file_path, os.R_OK): result.add_error(f"No read permission: {file_path}") except PermissionError as e: result.add_error(f"Permission denied: {file_path}: {e}") except OSError as e: result.add_error(f"Cannot access file: {file_path}: {e}") def _validate_size(self, file_path: Path, result: ValidationResult): """Validate file size is reasonable.""" size = result.size_bytes # Check if empty if size == 0: if self.allow_empty: result.add_warning(f"Empty file (0 bytes): {file_path}") else: result.add_error(f"Empty file not allowed: {file_path}") # Check if too large if size > self.max_file_size: result.add_error( f"File too large ({size} > {self.max_file_size}): {file_path}" ) # Warn if suspiciously large for markdown if size > 1024 * 1024: # > 1 MB result.add_warning( f"Large markdown file ({size / 1024 / 1024:.2f} MB): {file_path}" ) def _validate_content(self, file_path: Path, result: ValidationResult): """Try to read file content with multiple encodings.""" content = None last_error = None for encoding in self.ENCODINGS: try: with open(file_path, "r", encoding=encoding, errors="strict") as f: content = f.read() result.encoding = encoding break except UnicodeDecodeError as e: last_error = e continue except Exception as e: result.add_error( f"Failed to read file: {file_path}: {type(e).__name__}: {e}" ) return if content is None: # Try binary read to check if it's actually binary try: with open(file_path, "rb") as f: raw = f.read(1024) # Read first 1KB # Check for null bytes (binary indicator) if b"\x00" in raw: result.add_error( f"Binary file detected (contains null bytes): {file_path}" ) else: result.add_error( f"Encoding error (tried {', '.join(self.ENCODINGS)}): {file_path}: {last_error}" ) except Exception as e: result.add_error(f"Cannot read file at all: {file_path}: {e}") return # Successfully read content result.content = content # Warn if content has weird line endings if "\r\n" in content and "\n" in content.replace("\r\n", ""): result.add_warning(f"Mixed line endings detected: {file_path}") # Warn if very long lines (might be minified/corrupted) lines = content.split("\n") max_line_len = max(len(line) for line in lines) if lines else 0 if max_line_len > 10000: result.add_warning( f"Very long line detected ({max_line_len} chars): {file_path}" ) def _validate_frontmatter(self, result: ValidationResult): """Validate YAML frontmatter if present.""" if not result.content: return # Check for frontmatter markers if not result.content.startswith("---"): # No frontmatter is fine return # Extract frontmatter lines = result.content.split("\n") if len(lines) < 3: # Too short to have valid frontmatter if self.strict_frontmatter: result.add_error("Invalid frontmatter: too short") return # Find closing marker end_idx = None for i, line in enumerate(lines[1:], start=1): if line.strip() == "---" or line.strip() == "...": end_idx = i break if end_idx is None: if self.strict_frontmatter: result.add_error("Invalid frontmatter: no closing marker") else: result.add_warning("Frontmatter appears incomplete (no closing ---)") return # Extract frontmatter content frontmatter_lines = lines[1:end_idx] frontmatter_text = "\n".join(frontmatter_lines) # Try to parse as YAML if HAS_YAML: try: parsed = yaml.safe_load(frontmatter_text) result.frontmatter = parsed if isinstance(parsed, dict) else {} except yaml.YAMLError as e: if self.strict_frontmatter: result.add_error(f"Invalid YAML in frontmatter: {e}") else: result.add_warning(f"Malformed frontmatter YAML: {e}") else: result.add_warning("Cannot validate frontmatter (PyYAML not installed)") def validate_batch( self, file_paths: List[str | Path], on_error: str = "continue" ) -> Dict[str, ValidationResult]: """ Validate multiple files. Args: file_paths: List of files to validate on_error: 'continue' to keep going, 'stop' to halt on first error Returns: Dict mapping file paths to validation results """ results = {} for file_path in file_paths: result = self.validate_file(file_path) results[str(file_path)] = result if on_error == "stop" and not result.is_valid: break return results def get_summary(self, results: Dict[str, ValidationResult]) -> str: """Generate human-readable summary of validation results.""" total = len(results) valid = sum(1 for r in results.values() if r.is_valid) invalid = total - valid total_errors = sum(len(r.errors) for r in results.values()) total_warnings = sum(len(r.warnings) for r in results.values()) summary = f""" # File Validation Summary **Total Files:** {total} **Valid:** {valid} ({valid / total * 100:.1f}%) **Invalid:** {invalid} ({invalid / total * 100:.1f}%) **Errors:** {total_errors} **Warnings:** {total_warnings} ## Invalid Files """ for path, result in results.items(): if not result.is_valid: summary += f"\n### {Path(path).name}\n" for error in result.errors: summary += f"- ❌ {error}\n" for warning in result.warnings: summary += f"- ⚠️ {warning}\n" if invalid == 0: summary += "\n✅ No invalid files!\n" return summary # Convenience function for quick validation def validate_markdown_file(file_path: str | Path) -> ValidationResult: """Quick validation of a single markdown file.""" validator = FileValidator() return validator.validate_file(file_path) # Example usage if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python file_validator.py <file_or_directory>") sys.exit(1) path = Path(sys.argv[1]) validator = FileValidator() if path.is_file(): result = validator.validate_file(path) print(f"\n{'✅ VALID' if result.is_valid else '❌ INVALID'}: {path}") for error in result.errors: print(f" ❌ {error}") for warning in result.warnings: print(f" ⚠️ {warning}") elif path.is_dir(): files = list(path.rglob("*.md")) print(f"\nValidating {len(files)} markdown files...") results = validator.validate_batch(files) print(validator.get_summary(results)) else: print(f"Error: {path} is not a file or directory") sys.exit(1)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sandraschi/notepadpp-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server