ContextForge MCP Gateway

Official

Overview Schema Related Servers Score Discussions

normalize_special_characters.py•24.1 KiB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""normalize_characters: cleanup AI generated artifacts from code.

Copyright 2025 Mihai Criveti
SPDX-License-Identifier: Apache-2.0
Authors: Mihai Criveti

A **single-file** command-line utility that normalises so-called *"smart"*
punctuation, exotic Unicode glyphs, zero-width characters, and AI-generated
artefacts to plain ASCII. The intended use-case is cleaning up code blocks
from ChatGPT, pasting from the web, or tidying a repository before committing.

## Key features

- **No third-party dependencies** - standard library only.
- **One portable file** that you can vendor in any project.
- **Globs, directories or explicit files** are accepted as positional
  arguments, just like *black* or *ruff*.
- **Dry-run, diff, backup and warnings** switches help you adopt it safely.
- **Built-in configuration** - mappings, removals, warnings and ignore globs
  are all Python literals in this file, making the tool self-documenting.
- **Comprehensive ignore patterns** for modern development environments.
- **File type whitelist** - only processes specified file types.

Usage examples::

    # See which files would change and view a coloured unified diff
    python3 normalize_characters.py "**/*.py" --dry-run --diff

    # Clean the entire project tree, keeping *.bak* backups of changed files
    python3 normalize_characters.py . --backup-ext .bak

    # Normalise Markdown docs verbosely; ignore the vendor directory
    python3 normalize_characters.py "docs/**/*.md" -v -i "vendor/**/*"

    # Process only Python files in src/ directory
    python3 normalize_characters.py "src/**/*.py" --verbose

Exit codes:

* **0** - success, no changes were necessary.
* **1** - at least one file was modified (or would be, in *--dry-run*).

The script is intentionally opinionated but easy to fork - simply adjust
``DEFAULT_MAPPING``, ``DEFAULT_REGEX_REMOVE``, etc. to taste.
"""

# Future
from __future__ import annotations

# Standard
import argparse
import difflib
import fnmatch
import logging
from pathlib import Path
import re
import sys
from typing import Dict, Iterable, List, Optional, Pattern, Sequence

__all__ = [
    "main",
    "apply_char_map",
    "apply_removals",
    "gather_warnings",
    "find_files",
    "is_allowed_file",
]

__version__ = "2.0.0"

_LOG = logging.getLogger("normalize_characters")

# ---------------------------------------------------------------------------
# Configurable rules - tweak these to suit your project
# ---------------------------------------------------------------------------

# Whitelist of allowed file extensions (only these files will be processed).
# Entries are either suffixes (compared case-insensitively against
# ``Path.suffix``) or exact file names (compared against ``Path.name``).
DEFAULT_ALLOWED_EXTENSIONS: List[str] = [
    # Programming languages
    ".py",  # Python
    ".js",  # JavaScript
    ".ts",  # TypeScript
    ".jsx",  # React JSX
    ".tsx",  # React TypeScript
    ".html",  # HTML
    ".htm",  # HTML
    ".css",  # CSS
    ".scss",  # Sass
    ".sass",  # Sass
    ".less",  # Less CSS
    ".php",  # PHP
    ".rb",  # Ruby
    ".go",  # Go
    ".rs",  # Rust
    ".java",  # Java
    ".c",  # C
    ".cpp",  # C++
    ".cxx",  # C++
    ".cc",  # C++
    ".h",  # C/C++ Header
    ".hpp",  # C++ Header
    ".hxx",  # C++ Header
    ".cs",  # C#
    ".swift",  # Swift
    ".kt",  # Kotlin
    ".scala",  # Scala
    ".clj",  # Clojure
    ".hs",  # Haskell
    ".ml",  # OCaml
    ".fs",  # F#
    ".dart",  # Dart
    ".lua",  # Lua
    ".r",  # R
    ".m",  # Objective-C/MATLAB
    ".pl",  # Perl
    ".pm",  # Perl Module
    # Shell and scripts
    ".sh",  # Shell script
    ".bash",  # Bash script
    ".zsh",  # Zsh script
    ".fish",  # Fish script
    ".ps1",  # PowerShell
    ".bat",  # Batch file
    ".cmd",  # Command file
    # Data and config files
    ".json",  # JSON
    ".yaml",  # YAML
    ".yml",  # YAML
    ".xml",  # XML
    ".toml",  # TOML
    ".ini",  # INI file
    ".cfg",  # Config file
    ".conf",  # Config file
    ".properties",  # Properties file
    ".env",  # Environment file
    # Documentation and text
    ".md",  # Markdown
    ".rst",  # reStructuredText
    ".txt",  # Plain text
    ".rtf",  # Rich text
    ".tex",  # LaTeX
    ".org",  # Org-mode
    # Database
    # NOTE: ".sqlite" is deliberately absent. SQLite database files are a
    # binary format; running text substitutions over them corrupts the file.
    ".sql",  # SQL
    ".psql",  # PostgreSQL
    # Web and markup
    ".svg",  # SVG (text-based)
    ".vue",  # Vue.js
    ".svelte",  # Svelte
    # Build and project files
    ".dockerfile",  # Dockerfile
    ".makefile",  # Makefile
    ".gradle",  # Gradle
    ".maven",  # Maven
    ".cmake",  # CMake
    ".gyp",  # GYP
    ".gypi",  # GYP
    # Version control
    ".gitignore",  # Git ignore
    ".gitattributes",  # Git attributes
    # Without extension (common script files)
    "Dockerfile",
    "Makefile",
    "Rakefile",
    "Gemfile",
    "Pipfile",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
    "package.json",
    "tsconfig.json",
    "webpack.config.js",
    "rollup.config.js",
    "vite.config.js",
    "next.config.js",
    "nuxt.config.js",
    "tailwind.config.js",
    "postcss.config.js",
    "babel.config.js",
    "eslint.config.js",
    ".eslintrc",
    ".prettierrc",
    ".babelrc",
    ".editorconfig",
]

# fmt: off
# (Keep one-item-per-line style for readability.)
#
# Every key is written as a ``\u`` escape rather than a literal glyph. A
# literal key can be silently flattened to ASCII by any normaliser (including
# this one) that touches the file, which turns the entry into a no-op.
DEFAULT_MAPPING: Dict[str, str] = {
    # "Smart" double quotes & guillemets -> plain double quote
    "\u201C": '"',  # U+201C LEFT DOUBLE QUOTATION MARK
    "\u201D": '"',  # U+201D RIGHT DOUBLE QUOTATION MARK
    "\u201E": '"',  # U+201E DOUBLE LOW-9 QUOTATION MARK
    "\u201F": '"',  # U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    "\u00AB": '"',  # U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (guillemet)
    "\u00BB": '"',  # U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (guillemet)
    # "Smart" single quotes & apos-like glyphs -> plain apostrophe
    "\u2018": "'",  # U+2018 LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # U+2019 RIGHT SINGLE QUOTATION MARK (typographic apostrophe)
    "\u201A": "'",  # U+201A SINGLE LOW-9 QUOTATION MARK
    "\u201B": "'",  # U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
    "\u02BC": "'",  # U+02BC MODIFIER LETTER APOSTROPHE
    # Dashes (em, en, figure, minus, etc.) -> ASCII hyphen-minus
    "\u2014": "-",  # U+2014 EM DASH
    "\u2013": "-",  # U+2013 EN DASH
    "\u2012": "-",  # U+2012 FIGURE DASH
    "\u2011": "-",  # U+2011 NON-BREAKING HYPHEN
    "\u2010": "-",  # U+2010 HYPHEN
    "\u2043": "-",  # U+2043 HYPHEN BULLET
    "\u2212": "-",  # U+2212 MINUS SIGN
    "\uFE63": "-",  # U+FE63 SMALL HYPHEN-MINUS
    "\uFF0D": "-",  # U+FF0D FULLWIDTH HYPHEN-MINUS
    # Ellipsis -> three dots
    "\u2026": "...",  # U+2026 HORIZONTAL ELLIPSIS
    # Bullet & middle dot variants -> hyphen for list markup
    "\u2022": "-",  # U+2022 BULLET
    "\u00B7": "-",  # U+00B7 MIDDLE DOT
    "\u204C": "-",  # U+204C BLACK LEFTWARDS BULLET
    "\u204D": "-",  # U+204D BLACK RIGHTWARDS BULLET
    # Common copyright / trade marks
    "\u00A9": "(c)",  # U+00A9 COPYRIGHT SIGN
    "\u00AE": "(r)",  # U+00AE REGISTERED SIGN
    "\u2122": "(tm)",  # U+2122 TRADE MARK SIGN
    # Vulgar fractions - cheap ASCII approximations
    "\u00BC": "1/4",  # U+00BC VULGAR FRACTION ONE QUARTER
    "\u00BD": "1/2",  # U+00BD VULGAR FRACTION ONE HALF
    "\u00BE": "3/4",  # U+00BE VULGAR FRACTION THREE QUARTERS
    # Non-breaking & other exotic spaces -> regular space
    "\u00A0": " ",  # NO-BREAK SPACE
    "\u202F": " ",  # NARROW NO-BREAK SPACE
    "\u205F": " ",  # MEDIUM MATHEMATICAL SPACE
    "\u3000": " ",  # IDEOGRAPHIC SPACE (full-width)
    "\u2000": " ",  # EN QUAD
    "\u2001": " ",  # EM QUAD
    "\u2002": " ",  # EN SPACE
    "\u2003": " ",  # EM SPACE
    "\u2004": " ",  # THREE-PER-EM SPACE
    "\u2005": " ",  # FOUR-PER-EM SPACE
    "\u2006": " ",  # SIX-PER-EM SPACE
    "\u2007": " ",  # FIGURE SPACE
    "\u2008": " ",  # PUNCTUATION SPACE
    "\u2009": " ",  # THIN SPACE
    "\u200A": " ",  # HAIR SPACE
    # Zero-width & byte-order-mark characters - *delete entirely*
    "\u200B": "",  # ZERO WIDTH SPACE
    "\u200C": "",  # ZERO WIDTH NON-JOINER
    "\u200D": "",  # ZERO WIDTH JOINER
    "\u2060": "",  # WORD JOINER
    "\uFEFF": "",  # ZERO WIDTH NO-BREAK SPACE (BOM)
}
# fmt: on

# Patterns to strip out completely (e.g. ChatGPT citation artefacts)
DEFAULT_REGEX_REMOVE: List[str] = [
    r"::contentReference\[oaicite:\d+]\{index=\d+}",
]

# Warn-only patterns - flagged but not auto-fixed
DEFAULT_WARN_PATTERNS: List[str] = [
    r"\t",  # Literal TAB characters
    r"\r\n",  # Windows CRLF line endings
    r"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]",  # Control characters
]

# Files & directories to ignore by default (glob syntax, matched with fnmatch
# against the POSIX form of each globbed path)
DEFAULT_IGNORES: List[str] = [
    # Self-reference - prevent the script from modifying itself
    "normalize_characters.py",
    "normalize_special_characters.py",
    "normalize-characters.py",
    "character_normalizer.py",
    "**/normalize_characters.py",
    "**/normalize_special_characters.py",
    "**/normalize-characters.py",
    "**/character_normalizer.py",
    # Version control
    ".git",
    ".git*",
    ".git/**",
    "**/.git/**/*",
    "**/.gitignore",
    "**/.gitmodules",
    "**/.gitattributes",
    "**/.hg/**/*",
    "**/.svn/**/*",
    # CI/CD and configuration
    "**/.github/**/*",
    "**/.gitlab-ci.yml",
    "**/.travis.yml",
    "**/.circleci/**/*",
    "**/.pre-commit-config.yaml",
    "**/pre-commit-config.yaml",
    # Python
    "**/__pycache__/**/*",
    "**/*.pyc",
    "**/*.pyo",
    "**/*.pyd",
    "**/.venv/**/*",
    "**/venv/**/*",
    "**/env/**/*",
    "**/.tox/**/*",
    "**/.coverage",
    "**/.pytest_cache/**/*",
    "**/htmlcov/**/*",
    "**/.mypy_cache/**/*",
    "**/dist/**/*",
    "**/build/**/*",
    "**/*.egg-info/**/*",
    # Node.js
    "**/node_modules/**/*",
    "**/npm-debug.log*",
    "**/yarn-debug.log*",
    "**/yarn-error.log*",
    "**/.npm/**/*",
    "**/.yarn/**/*",
    "**/package-lock.json",
    "**/yarn.lock",
    # IDEs and editors
    "**/.vscode/**/*",
    "**/.idea/**/*",
    "**/*.swp",
    "**/*.swo",
    "**/*~",
    "**/.DS_Store",
    "**/Thumbs.db",
    # Compiled files and binaries
    "**/*.o",
    "**/*.so",
    "**/*.dll",
    "**/*.exe",
    "**/*.class",
    "**/*.jar",
    # Documentation builds
    "**/docs/_build/**/*",
    "**/site/**/*",
    # Temporary files
    "**/tmp/**/*",
    "**/temp/**/*",
    "**/*.tmp",
    "**/*.temp",
    "**/*.log",
    # Archives
    "**/*.zip",
    "**/*.tar.gz",
    "**/*.tar.bz2",
    "**/*.rar",
    "**/*.7z",
    # Images and media (usually binary)
    "**/*.png",
    "**/*.jpg",
    "**/*.jpeg",
    "**/*.gif",
    "**/*.ico",
    "**/*.mp4",
    "**/*.avi",
    "**/*.mov",
    "**/*.mp3",
    "**/*.wav",
    # Fonts
    "**/*.ttf",
    "**/*.otf",
    "**/*.woff",
    "**/*.woff2",
    "**/*.eot",
    # Database
    "mcp.db",
    "*.db",
    "**/*.db",
]

# ---------------------------------------------------------------------------
# Internal pre-compiled regexes - do not edit below unless you know why.
# ---------------------------------------------------------------------------


def _compile_mapping(mapping: Dict[str, str]) -> Pattern[str]:
    """Build one alternation regex matching any key of *mapping*.

    Longer keys are listed first so that a multi-character key wins over a
    shorter key that is its prefix.
    """
    return re.compile("|".join(sorted(map(re.escape, mapping), key=len, reverse=True)))


_CHAR_PATTERN = _compile_mapping(DEFAULT_MAPPING)
_REMOVE_REGEX = [re.compile(p) for p in DEFAULT_REGEX_REMOVE]
_WARN_REGEX = [re.compile(p, re.MULTILINE) for p in DEFAULT_WARN_PATTERNS]

# ---------------------------------------------------------------------------
# Public helper functions (importable by unit tests)
# ---------------------------------------------------------------------------


def apply_char_map(text: str, mapping: Optional[Dict[str, str]] = None) -> str:
    """Replace all keys in mapping found in text with their values.

    Args:
        text: The input string to normalise.
        mapping: A custom mapping to use instead of DEFAULT_MAPPING.
            If None, uses the default mapping.

    Returns:
        The transformed string with characters replaced according to the mapping.

    Examples:
        >>> apply_char_map('\\u201Csmart quotes\\u201D')
        '"smart quotes"'
        >>> apply_char_map('\\u2018single\\u2019')
        "'single'"
        >>> apply_char_map('em\\u2014dash and en\\u2013dash')
        'em-dash and en-dash'
        >>> apply_char_map('custom', {'c': 'k', 'u': 'o'})
        'kostom'
        >>> apply_char_map('')
        ''
    """
    if not text:
        return text

    char_mapping = DEFAULT_MAPPING if mapping is None else mapping
    if not char_mapping:
        return text

    # The default mapping has a pre-compiled pattern; custom ones are built on demand.
    rx = _CHAR_PATTERN if mapping is None else _compile_mapping(char_mapping)
    return rx.sub(lambda m: char_mapping[m.group(0)], text)


def apply_removals(text: str, patterns: Optional[Iterable[Pattern[str]]] = None) -> str:
    """Strip substrings that match patterns.

    Args:
        text: The input string to process.
        patterns: Regex patterns to remove. If None, uses _REMOVE_REGEX.

    Returns:
        String with matching patterns removed.

    Examples:
        >>> apply_removals('text::contentReference[oaicite:1]{index=0}more')
        'textmore'
        >>> apply_removals('hello world', [re.compile(r'world')])
        'hello '
        >>> apply_removals('')
        ''
        >>> apply_removals('no matches')
        'no matches'
    """
    if not text:
        return text

    result = text
    for rx in _REMOVE_REGEX if patterns is None else patterns:
        result = rx.sub("", result)
    return result


def gather_warnings(
    text: str, src: Path, warn_rx: Optional[Iterable[Pattern[str]]] = None
) -> List[str]:
    """Return a list of warning strings for each regex that matches text.

    Args:
        text: The text content to check for warnings.
        src: Path to the source file (for warning messages).
        warn_rx: Warning regex patterns. If None, uses _WARN_REGEX.

    Returns:
        List of warning messages for patterns that matched.

    Examples:
        >>> from pathlib import Path
        >>> import re
        >>> warnings = gather_warnings('text\\t', Path('test.txt'))
        >>> len(warnings) > 0  # Should warn about tab character
        True
        >>> gather_warnings('clean text', Path('test.txt'))
        []
        >>> patterns = [re.compile(r'bad', re.MULTILINE)]
        >>> gather_warnings('bad text', Path('file.py'), patterns)
        ["⚠ Warn: 'bad' matched in file.py"]
    """
    if not text:
        return []

    warning_patterns = _WARN_REGEX if warn_rx is None else warn_rx
    return [
        f"⚠ Warn: {rx.pattern!r} matched in {src}"
        for rx in warning_patterns
        if rx.search(text)
    ]


def is_allowed_file(path: Path, allowed_extensions: Optional[Sequence[str]] = None) -> bool:
    """Check if a file is in the allowed extensions whitelist.

    Args:
        path: Path to the file to check.
        allowed_extensions: List of allowed extensions. If None, uses
            DEFAULT_ALLOWED_EXTENSIONS.

    Returns:
        True if the file should be processed, False otherwise.

    Examples:
        >>> is_allowed_file(Path('test.py'))
        True
        >>> is_allowed_file(Path('test.exe'))
        False
        >>> is_allowed_file(Path('Dockerfile'))
        True
        >>> is_allowed_file(Path('test.custom'), ['.custom'])
        True
    """
    extensions = DEFAULT_ALLOWED_EXTENSIONS if allowed_extensions is None else allowed_extensions

    # Exact filename matches (for files like Dockerfile, Makefile, etc.)
    # are case-sensitive; suffix matches are case-insensitive.
    if path.name in extensions:
        return True
    return path.suffix.lower() in {ext.lower() for ext in extensions}


def _iter_glob(pattern: str) -> Iterable[Path]:
    """Yield paths matching *pattern*, which may be relative or absolute.

    ``Path.glob`` rejects absolute patterns, so an absolute pattern is split
    into its anchor (e.g. ``/`` or ``C:\\``) and a pattern relative to it.
    """
    as_path = Path(pattern)
    if as_path.is_absolute():
        anchor = Path(as_path.anchor)
        return anchor.glob(str(as_path.relative_to(anchor)))
    return Path().glob(pattern)


def find_files(
    inputs: Sequence[str],
    ignore: Sequence[str],
    allowed_extensions: Optional[Sequence[str]] = None,
) -> List[Path]:
    """Expand inputs (files/directories/globs) into a unique list of Path objects.

    An input that names an existing file is always included as-is; the ignore
    patterns and the extension whitelist are applied only to paths discovered
    by expanding a directory or a glob pattern.

    Args:
        inputs: List of file paths, directory paths, or glob patterns.
            Directories and patterns may be relative or absolute.
        ignore: List of glob patterns to ignore.
        allowed_extensions: List of allowed file extensions. If None, uses
            DEFAULT_ALLOWED_EXTENSIONS.

    Returns:
        Sorted list of unique Path objects that match inputs but not ignore
        patterns and are in the allowed extensions whitelist.

    Examples:
        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as tmpdir:
        ...     _ = (Path(tmpdir) / 'test.py').write_text('print("hello")')
        ...     _ = (Path(tmpdir) / 'test.exe').write_text('binary')
        ...     names = [f.name for f in find_files([tmpdir], [])]
        >>> names
        ['test.py']
        >>> find_files([], [])
        []
    """
    if not inputs:
        return []

    paths: List[Path] = []
    for token in inputs:
        candidate = Path(token)
        if candidate.is_file():
            paths.append(candidate)
            continue

        # A directory means "everything beneath it"; anything else is a glob.
        pattern = str(candidate / "**" / "*") if candidate.is_dir() else token
        try:
            for match in _iter_glob(pattern):
                if not match.is_file():
                    continue
                rel = match.as_posix()
                if any(fnmatch.fnmatch(rel, pat) for pat in ignore):
                    continue
                if not is_allowed_file(match, allowed_extensions):
                    continue
                paths.append(match)
        except (OSError, ValueError, NotImplementedError) as exc:
            # Invalid or unsupported glob pattern, or an unreadable directory:
            # skip this input but keep processing the others.
            _LOG.warning("Skipping %r: %s", token, exc)

    return sorted(set(paths))


# ---------------------------------------------------------------------------
# CLI helpers
# ---------------------------------------------------------------------------


def _diff(before: str, after: str, filename: str) -> str:
    """Return unified diff between before and after as a single string.

    Args:
        before: Original text content.
        after: Modified text content.
        filename: Name of the file for diff headers.

    Returns:
        Unified diff string, empty if no differences.

    Examples:
        >>> diff_output = _diff('old line', 'new line', 'test.txt')
        >>> 'test.txt:before' in diff_output
        True
        >>> 'test.txt:after' in diff_output
        True
        >>> _diff('same', 'same', 'test.txt')
        ''
    """
    return "".join(
        difflib.unified_diff(
            before.splitlines(keepends=True),
            after.splitlines(keepends=True),
            fromfile=f"{filename}:before",
            tofile=f"{filename}:after",
        )
    )


def _read_text(path: Path) -> str:
    """Read *path* as UTF-8 without translating line endings.

    ``newline=""`` keeps ``\\r\\n`` intact so CRLF files can be detected and
    round-tripped; ``surrogateescape`` lets undecodable bytes survive a
    read/write cycle unchanged.
    """
    with path.open("r", encoding="utf-8", errors="surrogateescape", newline="") as fh:
        return fh.read()


def _write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8 without translating line endings."""
    with path.open("w", encoding="utf-8", errors="surrogateescape", newline="") as fh:
        fh.write(text)


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Define and parse all CLI arguments.

    Args:
        argv: Command line arguments. If None, uses sys.argv.

    Returns:
        Parsed argument namespace. ``verbose`` is forced on by ``--diff`` or
        ``--dry-run`` and forced off by ``--quiet`` (quiet wins).

    Examples:
        >>> args = _parse_args(['file.py'])
        >>> args.inputs
        ['file.py']
        >>> args = _parse_args(['--dry-run', 'file.py'])
        >>> args.dry_run
        True
        >>> args.verbose  # Should be True due to dry-run implying verbose
        True
    """
    p = argparse.ArgumentParser(
        prog="normalize-characters",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Normalize smart quotes, exotic whitespace, and AI artefacts to plain ASCII.",
    )

    p.add_argument(
        "inputs",
        nargs="+",
        help="Files, directories or globs (e.g. '**/*.md').",
    )
    p.add_argument(
        "-i",
        "--ignore",
        action="append",
        default=[],
        help="Additional ignore patterns (glob syntax).",
    )
    p.add_argument(
        "--no-default-ignore",
        action="store_true",
        help="Disable built-in ignore rules.",
    )
    p.add_argument(
        "--allowed-extensions",
        action="append",
        default=[],
        help="Additional allowed file extensions (e.g., '.custom').",
    )
    p.add_argument(
        "--no-default-extensions",
        action="store_true",
        help="Disable built-in allowed extensions whitelist.",
    )
    p.add_argument("--dry-run", action="store_true", help="Do not write files (disabled by default).")
    p.add_argument("--diff", action="store_true", help="Show unified diff.")
    p.add_argument(
        "--backup-ext",
        default="",
        help="Save backup to <file><ext> before overwrite.",
    )
    p.add_argument(
        "-q", "--quiet", action="store_true", help="Suppress output (except warnings)."
    )
    p.add_argument("-v", "--verbose", action="store_true", help="Show processed files.")
    p.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    ns = p.parse_args(argv)

    if ns.diff or ns.dry_run:
        ns.verbose = True  # Imply verbose when printing diff or dry-run
    if ns.quiet:
        ns.verbose = False

    return ns


# ---------------------------------------------------------------------------
# Main program logic
# ---------------------------------------------------------------------------


def main(argv: Optional[Sequence[str]] = None) -> None:  # noqa: C901
    """Entry-point function for normalize-characters CLI.

    Processes files according to command line arguments, normalizing
    characters and generating appropriate output/warnings. Line endings of
    processed files are preserved. The function always terminates the
    process via ``sys.exit``.

    Args:
        argv: Command line arguments. If None, uses sys.argv.

    Raises:
        SystemExit: With code 1 if at least one file needed changes (also in
            dry-run mode), otherwise with code 0.

    Examples:
        >>> main is not None
        True
    """
    args = _parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose and not args.quiet else logging.INFO,
        format="%(message)s",
        stream=sys.stdout,
    )

    ignore = [] if args.no_default_ignore else list(DEFAULT_IGNORES)
    ignore.extend(args.ignore)

    allowed_extensions = [] if args.no_default_extensions else list(DEFAULT_ALLOWED_EXTENSIONS)
    allowed_extensions.extend(args.allowed_extensions)

    files = find_files(args.inputs, ignore, allowed_extensions)
    if not files:
        _LOG.warning("No files matched.")
        sys.exit(0)

    changed = warned = 0

    for path in files:
        try:
            original = _read_text(path)
        except (OSError, UnicodeError) as exc:
            _LOG.warning("Could not read %s: %s", path, exc)
            continue

        fixed = apply_removals(apply_char_map(original))

        # Warnings are computed on the cleaned text so only leftovers are reported.
        warnings = gather_warnings(fixed, path)
        warned += len(warnings)
        for message in warnings:
            _LOG.warning("%s", message)

        if original == fixed:
            if args.verbose:
                _LOG.info("✓ %s (no change)", path)
            continue

        changed += 1
        if args.verbose:
            _LOG.info("✏ %s", path)
        if args.diff:
            sys.stdout.write(_diff(original, fixed, str(path)))

        if args.dry_run:
            continue

        try:
            if args.backup_ext:
                # Append to the full name so extensions that do not start
                # with "." (e.g. "~") work, unlike Path.with_suffix.
                _write_text(path.with_name(path.name + args.backup_ext), original)
            _write_text(path, fixed)
        except (OSError, UnicodeError, ValueError) as exc:
            _LOG.warning("Could not write %s: %s", path, exc)

    if not args.quiet:
        _LOG.info(
            "Processed %d file(s): %d changed, %d warnings%s.",
            len(files),
            changed,
            warned,
            " (dry-run)" if args.dry_run else "",
        )

    sys.exit(1 if changed else 0)


# ---------------------------------------------------------------------------
# Stand-alone execution guard
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/IBM/mcp-context-forge'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

normalize_special_characters.py•24.1 KiB