encoding.py
import logging
from collections import Counter
from pathlib import Path

from charset_normalizer import from_bytes

logger = logging.getLogger(__name__)

# File extensions to sample for encoding detection
TEXT_FILE_EXTENSIONS = frozenset(
    {
        ".py", ".pyi",  # Python
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",  # JavaScript/TypeScript
        ".java", ".kt", ".kts",  # JVM
        ".c", ".cpp", ".cc", ".cxx", ".h", ".hpp",  # C/C++
        ".go",  # Go
        ".rs",  # Rust
        ".rb",  # Ruby
        ".php",  # PHP
        ".cs",  # C#
        ".swift",  # Swift
        ".scala",  # Scala
        ".lua",  # Lua
        ".sh", ".bash", ".zsh",  # Shell
        ".sql",  # SQL
        ".html", ".htm", ".xml", ".xhtml",  # Markup
        ".css", ".scss", ".sass", ".less",  # Styles
        ".json", ".yaml", ".yml", ".toml",  # Config
        ".md", ".txt", ".rst",  # Text
    }
)

# Encodings that are essentially compatible with UTF-8 (ASCII subset)
UTF8_COMPATIBLE = frozenset({"utf-8", "ascii", "us-ascii"})


def detect_project_encoding(
    base_dir: Path,
    sample_limit: int = 30,
) -> str | None:
    """Detect the dominant non-UTF-8 encoding in a project.

    Scans a sample of text files in the project directory to determine
    whether a regional encoding (e.g., GBK, Big5, Shift_JIS) is
    predominantly used.

    Args:
        base_dir: Project root directory to scan.
        sample_limit: Maximum number of files to sample.

    Returns:
        The detected encoding name (lowercase) if a non-UTF-8 encoding
        is dominant, or None if the project appears to use UTF-8.
    """
    encoding_counts: Counter[str] = Counter()
    files_sampled = 0

    # Walk the tree, sampling only known text file extensions
    for file_path in base_dir.rglob("*"):
        if files_sampled >= sample_limit:
            break

        # Skip symlinks to prevent path traversal attacks and infinite loops
        if file_path.is_symlink():
            continue
        if not file_path.is_file():
            continue

        # Filter on path parts relative to base_dir so a hidden ancestor of
        # the project root (or base_dir == Path(".")) doesn't skip every file
        rel_parts = file_path.relative_to(base_dir).parts
        if any(part.startswith(".") for part in rel_parts):
            continue
        if any(
            part in {"node_modules", "__pycache__", "venv", ".venv", "dist", "build"}
            for part in rel_parts
        ):
            continue

        # Only sample known text file extensions
        if file_path.suffix.lower() not in TEXT_FILE_EXTENSIONS:
            continue

        try:
            # Read at most the first 8 KB (enough for charset detection)
            with file_path.open("rb") as fh:
                raw = fh.read(8192)
            if not raw:
                continue

            result = from_bytes(raw).best()
            if result and result.encoding:
                enc = result.encoding.lower()
                encoding_counts[enc] += 1
                files_sampled += 1
                logger.debug("Detected encoding %s for %s", enc, file_path)
        except OSError as exc:  # PermissionError is a subclass of OSError
            logger.debug("Skipping %s: %s", file_path, exc)
            continue

    if not encoding_counts:
        logger.info("No files sampled for encoding detection")
        return None

    logger.info(
        "Encoding detection sampled %d files: %s",
        files_sampled,
        dict(encoding_counts.most_common(5)),
    )

    # Return the most common non-UTF-8 encoding if it covers at least
    # 30% of the sampled files
    for enc, count in encoding_counts.most_common():
        if enc not in UTF8_COMPATIBLE:
            ratio = count / files_sampled
            if ratio >= 0.3:
                logger.info(
                    "Detected project encoding: %s (%.1f%% of sampled files)",
                    enc,
                    ratio * 100,
                )
                return enc

    logger.info("Project appears to use UTF-8 (no dominant regional encoding)")
    return None
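The module's only entry point is detect_project_encoding. Below is a minimal usage sketch assuming the module is importable as encoding; the read_project_file helper is hypothetical, introduced purely for illustration.

# Usage sketch. The `read_project_file` helper and the `encoding` import path
# are illustrative assumptions, not part of the module above.
from pathlib import Path

from encoding import detect_project_encoding


def read_project_file(base_dir: Path, relative_path: str) -> str:
    """Read a project file as UTF-8, falling back to the detected encoding."""
    file_path = base_dir / relative_path
    try:
        return file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Fall back to the dominant regional encoding (e.g., GBK), if any
        fallback = detect_project_encoding(base_dir) or "utf-8"
        return file_path.read_text(encoding=fallback, errors="replace")

Detecting only after a UTF-8 failure keeps the happy path from walking the whole tree; a real caller would likely cache the detected encoding rather than re-scan per file.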
