file_io.py
import logging
import os
import uuid
from pathlib import Path

from charset_normalizer import from_bytes

from .exceptions import EncodingDetectionError

logger = logging.getLogger(__name__)

# Module-level state for project encoding (set by server at startup)
_project_encoding: str | None = None

# Encodings that are effectively UTF-8 for our purposes
_UTF8_COMPATIBLE = frozenset({"utf-8", "utf-8-sig", "ascii", "us-ascii"})


def set_project_encoding(encoding: str | None) -> None:
    """Set the project-level default encoding.

    Called by the server during initialization after encoding detection.

    Args:
        encoding: The detected or configured project encoding (e.g., "gbk").
    """
    global _project_encoding
    _project_encoding = encoding.lower() if encoding else None
    if _project_encoding:
        logger.info("Project encoding set to: %s", _project_encoding)


def get_project_encoding() -> str | None:
    """Get the current project-level encoding."""
    return _project_encoding


def _looks_like_binary(data: bytes, sample_size: int = 8192) -> bool:
    """Check if data looks like binary (non-text) content.

    Uses null-byte detection and the ratio of non-printable characters.

    Args:
        data: Raw bytes to check.
        sample_size: Number of bytes to sample.

    Returns:
        True if data appears to be binary.
    """
    if not data:
        return False

    sample = data[:sample_size]

    # Null bytes are a strong indicator of binary
    if b"\x00" in sample:
        return True

    # Count non-text control bytes. Allowed: tab through carriage return
    # (0x09-0x0D), ESC (0x1B, for ANSI sequences), and every byte >= 0x20
    # (printable ASCII plus high bytes used by legacy multi-byte encodings).
    non_text_count = 0
    for byte in sample:
        if byte < 0x09 or (0x0E <= byte < 0x20 and byte != 0x1B):
            non_text_count += 1

    # If more than 30% non-text bytes, likely binary
    if len(sample) > 0 and non_text_count / len(sample) > 0.30:
        return True

    return False


def _detect_declared_encoding(path: Path, raw: bytes) -> str | None:
    """Detect a file-declared encoding (when available).

    For Python files, prefer the PEP 263 coding cookie / BOM via
    tokenize.detect_encoding. Returns None when no declaration is found or
    detection fails.
    """
    if path.suffix.lower() not in {".py", ".pyi"}:
        return None

    # tokenize.detect_encoding is the canonical implementation for Python
    # source encoding. It handles the UTF-8 BOM and coding cookies on the
    # first two lines.
    try:
        import io
        import tokenize

        encoding, _ = tokenize.detect_encoding(io.BytesIO(raw).readline)
        return encoding.lower() if encoding else None
    except Exception:
        return None


def decode_text_with_fallback(
    raw: bytes,
    *,
    path: Path | None = None,
    preferred_encoding: str | None = None,
    min_coherence: float = 0.5,
) -> tuple[str, str]:
    """Decode bytes into text with best-effort encoding detection.

    Strategy (ordered):
    1) Reject likely-binary files
    2) Honor declared encoding for Python source (PEP 263)
    3) UTF-8 (common default)
    4) charset_normalizer detection (handles GBK/Big5/etc. more reliably
       than naive fallbacks)
    5) Preferred encoding as last resort (if set)

    Returns:
        (decoded_text, encoding_name_lowercase)

    Raises:
        EncodingDetectionError: When decoding cannot be performed confidently.
    """
    if _looks_like_binary(raw):
        raise EncodingDetectionError(str(path) if path else "<bytes>")

    preferred = preferred_encoding.lower() if preferred_encoding else None
    declared = _detect_declared_encoding(path, raw) if path else None

    if declared and declared not in _UTF8_COMPATIBLE:
        try:
            return raw.decode(declared), declared
        except (UnicodeDecodeError, LookupError):
            # Fall through to other strategies
            pass

    # Fast path: UTF-8
    try:
        text = raw.decode("utf-8")
        # If the project encoding is configured and the file is ASCII-only,
        # preserve the project encoding to keep newly written non-ASCII
        # content consistent.
        if preferred and preferred not in _UTF8_COMPATIBLE and text.isascii():
            return text, preferred
        return text, "utf-8"
    except UnicodeDecodeError:
        pass

    def encoding_family(enc: str) -> str:
        e = enc.lower()
        if e.startswith("gb") or e in {"hz-gb-2312", "hz"}:
            return "gb"
        if "big5" in e or e in {"cp950"}:
            return "big5"
        return "other"

    preferred_text: str | None = None
    if preferred and preferred not in _UTF8_COMPATIBLE:
        try:
            preferred_text = raw.decode(preferred)
        except (UnicodeDecodeError, LookupError):
            preferred_text = None

    # Robust path: charset_normalizer (helps distinguish GBK vs Big5 without
    # naive fallbacks)
    result = from_bytes(raw)
    best = result.best()
    if best is not None and best.encoding:
        best_enc = best.encoding.lower()
        best_ok = best.coherence >= min_coherence

        if preferred_text is not None:
            # Type narrowing: preferred_text is not None implies preferred is
            # not None (see the `if preferred and preferred not in
            # _UTF8_COMPATIBLE` guard above).
            assert preferred is not None  # nosec B101 - type narrowing only

            # When a project encoding is explicitly configured/detected,
            # prefer it unless charset_normalizer is strongly confident the
            # file belongs to a different encoding family.
            if encoding_family(preferred) == encoding_family(best_enc):
                return preferred_text, preferred
            override_ok = best.coherence >= max(min_coherence, 0.5)
            if best_ok and override_ok:
                return str(best), best_enc
            return preferred_text, preferred

        if best_ok:
            return str(best), best_enc

    # Last resort: preferred encoding (if configured/detected)
    if preferred_text is not None:
        assert preferred is not None  # nosec B101 - type narrowing only
        return preferred_text, preferred

    raise EncodingDetectionError(str(path) if path else "<bytes>")


def decode_text_best_effort(
    raw: bytes,
    *,
    path: Path | None = None,
    preferred_encoding: str | None = None,
    errors: str = "replace",
) -> str | None:
    """Decode bytes into text for display/search/sync.

    Returns None for likely-binary content. Never raises due to decoding.
    """
    if _looks_like_binary(raw):
        return None
    try:
        text, _ = decode_text_with_fallback(
            raw,
            path=path,
            preferred_encoding=preferred_encoding,
            min_coherence=0.2,
        )
        return text
    except EncodingDetectionError:
        # As a last resort, return a lossy UTF-8 decode for robustness.
        return raw.decode("utf-8", errors=errors)


def read_text_with_fallback(path: Path) -> tuple[str, str]:
    """Read a text file with automatic encoding detection.

    Strategy:
    1) Reject likely-binary files (fast heuristic)
    2) Honor Python source declarations (PEP 263)
    3) Try UTF-8
    4) Use charset_normalizer detection for legacy encodings (GBK/Big5/etc.)
    5) Prefer the configured project encoding when appropriate

    Args:
        path: File path.

    Returns:
        (content, encoding) tuple.

    Raises:
        EncodingDetectionError: If encoding cannot be detected or the file
            is not text.
    """
    raw = path.read_bytes()
    return decode_text_with_fallback(
        raw,
        path=path,
        preferred_encoding=_project_encoding,
        # Be tolerant here: binary files are filtered earlier, and short
        # source files can yield very low coherence scores even when the
        # encoding is correct.
        min_coherence=0.0,
    )


def atomic_write(path: Path, content: str, encoding: str) -> None:
    """Atomically write to a file (using temp file + os.replace).

    Atomic writes prevent file corruption if the process is interrupted
    mid-write. Unique temp file names avoid collisions during concurrent
    writes.

    Args:
        path: Target file path.
        content: Content to write.
        encoding: Encoding to write with.

    Raises:
        OSError: Raised when the write fails.
    """
    # Use uuid to generate a unique temp file name, avoiding concurrent-write
    # collisions
    unique_suffix = f".{uuid.uuid4().hex[:8]}.tmp"
    temp_path = path.with_suffix(path.suffix + unique_suffix)
    try:
        # Open with newline="" to preserve original line endings on all
        # platforms. Without this, Windows would convert \n to \r\n in
        # text mode.
        with temp_path.open("w", encoding=encoding, newline="") as f:
            f.write(content)
        # os.replace is atomic on POSIX systems
        os.replace(temp_path, path)
    except Exception:
        # Clean up the temp file
        temp_path.unlink(missing_ok=True)
        raise

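# Usage sketch for decode_text_with_fallback (illustrative only; the sample
# text below is an assumption chosen for demonstration): GBK-encoded bytes
# fail the UTF-8 fast path and are resolved by charset_normalizer and/or the
# preferred encoding.
#
#     raw = "你好，世界".encode("gbk")
#     text, enc = decode_text_with_fallback(raw, preferred_encoding="gbk")
#     # Expected: text == "你好，世界", enc == "gbk"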
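# Usage sketch for read_text_with_fallback (hypothetical file path): once a
# project encoding is configured, reads report the detected encoding so later
# writes can preserve it. set_project_encoding is normally called by the
# server at startup, not by callers.
#
#     set_project_encoding("gbk")
#     content, enc = read_text_with_fallback(Path("legacy_module.py"))
#     # enc is typically "gbk" for GBK content and "utf-8" for UTF-8 content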
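# Usage sketch for atomic_write (placeholder file name and edit): a
# read-edit-write round trip that keeps the file in its original encoding.
#
#     path = Path("config.py")
#     content, enc = read_text_with_fallback(path)
#     atomic_write(path, content.replace("old_name", "new_name"), enc)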