codebase_snapshot.py

"""Codebase snapshot functionality for fetching and filtering repository files."""

import fnmatch
from pathlib import Path

from yellhorn_mcp.utils.git_utils import run_git_command

# Global set of file patterns and extensions to always ignore
# These are files that should never be included in AI context as they are:
# - Binary/compiled artifacts
# - Auto-generated files
# - Environment-specific configurations
# - Transient cache/logs
ALWAYS_IGNORE_PATTERNS = {
    # === Compiled binaries & libraries ===
    # Machine-code artifacts with no source information
    "*.exe", "*.dll", "*.so", "*.dylib", "*.bin", "*.pyc", "*.pyo", "*.pyd",
    "*.class", "*.o", "*.a", "*.lib", "*.ko", "*.elf", "*.dSYM/", "*.wasm",
    # === Databases & model checkpoints ===
    # Large binary blobs, ML checkpoints, experiment tracking
    "*.db", "*.sqlite", "*.sqlite3", "*.sqlite-journal", "*.sqlite-wal",
    "milvus.db", "milvus.db.lock", "*models_ckpt_cache*", "*/wandb/*", "wandb/",
    "mlruns/", "mlartifacts/", "*.ipynb_checkpoints", ".ipynb_checkpoints/",
    "*.ckpt", "*.h5", "*.hdf5", "*.pkl", "*.pickle", "*.joblib",
    "*.npy", "*.npz", "*.parquet", "*.feather", "*.arrow",
    # === Lock files & dependency manifests ===
    # Auto-generated to pin dependency versions
    "*.lock", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "poetry.lock",
    "uv.lock", "Pipfile.lock", "composer.lock", "Gemfile.lock", "go.sum",
    "Cargo.lock", "pubspec.lock", "mix.lock",
    # === Environment & configuration files ===
    ".env", ".env.*", ".env.local", ".env.development", ".env.production",
    ".env.test", "*.local", "*local-only*",
    # === Logs & run outputs ===
    # Ephemeral run or CI output
    "*.log", "*.log.*", "logs/", "log/", "super-linter.log",
    "events.out.tfevents*", "*.tfevents", "go/logs.jsonl",
    "*.out", "*.err", "nohup.out", "*.pid",
    # === Test artifacts & outputs ===
    # Generated when running tests
    "*.test", ".test", "test_output/", "testoutput/", "test-results/",
    "*/__test__/", "*/__tests__/", "*.snap", "*.snapshot",
    "coverage/", "htmlcov/", ".coverage", "coverage.xml", "*.coverage",
    "*.cover", "*.gcov", "*.gcda", "*.gcno", "junit.xml", "test-report.xml",
    # === Cache directories ===
    # Speed up tooling, never manually edited
    "__pycache__/", ".mypy_cache/", ".pytest_cache/", ".ruff_cache/",
    ".hypothesis/", ".tox/", ".nox/", ".pyre/",
    "go/tmp/", "go/temp/", "tmp/", "temp/", "cache/", ".cache/",
    ".sass-cache/", ".parcel-cache/", ".next/", ".nuxt/", ".vuepress/dist/",
    ".docusaurus/", ".serverless/", ".fusebox/", ".dynamodb/",
    ".yarn/cache/", ".yarn/install-state.gz",
    # === Virtual environments ===
    "env/", "venv/", "virtualenv/", ".venv/", "ENV/", "env.bak/", "venv.bak/",
    # === IDE & editor files ===
    ".idea/", ".vscode/", "*.swp", "*.swo", "*.swn", "*.bak", "*~",
    ".project", ".classpath", ".settings/", "*.sublime-project",
    "*.sublime-workspace", ".kate-swp", ".ropeproject/",
    # === Build artifacts & outputs ===
    "build/", "dist/", "out/", "target/", "bin/", "obj/", "*.egg-info/",
    "*.egg", "develop-eggs/", "downloads/", "eggs/", ".eggs/", "lib/", "lib64/",
    "parts/", "sdist/", "var/", "wheels/", "*.whl", "share/python-wheels/",
    "MANIFEST",
    # === Node.js specific ===
    "node_modules/", "jspm_packages/", "bower_components/", ".npm/",
    "web_modules/", ".pnp.*",
    # === Container & infrastructure files ===
    "**/Dockerfile*", "docker-compose*.yml", "docker-compose*.yaml",
    "*.override.*", "*/entrypoint.sh", ".dockerignore", "go/run", "go/runx.sh",
    # === Terraform & IaC ===
    ".terraform/", ".terraform.lock.hcl", "*.tfstate", "*.tfstate.*",
    "*.tfplan", "*.tfvars", "terraform.tfvars",
    # === Generated documentation ===
    "site/", "docs/_build/", "docs/.vuepress/dist/", "_site/",
    ".jekyll-cache/", ".jekyll-metadata", "public/", "*.pdf",
    # === Archive files ===
    "*.zip", "*.tar", "*.tar.gz", "*.tgz", "*.tar.bz2", "*.tbz2", "*.tar.xz",
    "*.txz", "*.rar", "*.7z", "*.gz", "*.bz2", "*.xz", "*.Z",
    "*.deb", "*.rpm", "*.dmg", "*.iso", "*.jar", "*.war", "*.ear",
    # === Media files ===
    # Images
    "*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.svg", "*.ico", "*.webp",
    "*.tiff", "*.tif", "*.psd", "*.ai", "*.eps", "*.raw",
    # Video
    "*.mp4", "*.avi", "*.mov", "*.wmv", "*.flv", "*.webm", "*.mkv", "*.m4v",
    "*.mpg", "*.mpeg", "*.3gp",
    # Audio
    "*.mp3", "*.wav", "*.flac", "*.aac", "*.ogg", "*.wma", "*.m4a", "*.opus",
    "*.ape",
    # === Fonts ===
    "*.ttf", "*.otf", "*.woff", "*.woff2", "*.eot",
    # === OS specific files ===
    ".DS_Store", "Thumbs.db", "desktop.ini", "*.lnk", "Icon\r",
    ".Spotlight-V100/", ".Trashes/", "ehthumbs.db", "ehthumbs_vista.db",
    "*.stackdump", "[Dd]esktop.ini", "$RECYCLE.BIN/",
    # === Temporary files ===
    "*.tmp", "*.temp", "*.bak", "*.backup", "*.old", "*.orig", "*.rej",
    "*.BACKUP.*", "*.BASE.*", "*.LOCAL.*", "*.REMOTE.*",
    # === Security & secrets ===
    "*.key", "*.pem", "*.p12", "*.pfx", "*.cert", "*.crt", "*.csr", "*.jks",
    "*.keystore", "id_rsa", "id_rsa.pub", "id_dsa", "id_dsa.pub", "*.gpg",
    # === Version control & tool config ===
    ".git/", ".gitignore", ".gitattributes", ".gitmodules", ".hg/", ".hgignore",
    ".svn/", ".bzr/", ".yellhornignore", ".yellhorncontext", ".continueignore",
    ".cursorignore", ".sourcery.yaml", ".pre-commit-config.yaml", ".editorconfig",
    ".prettierrc*", ".eslintrc*", ".stylelintrc*", ".markdownlint*",
    # === Example & fixture files ===
    "example-*.yml", "example-*.yaml", "example-*.json", "fixture-*",
    "fixtures/", "examples/", "samples/",
    # === Minified files ===
    "*.min.js", "*.min.css", "*.min.map",
    # === Source maps ===
    "*.map", "*.js.map", "*.css.map",
}


def matches_pattern(path: str, pattern: str) -> bool:
    if pattern.endswith("/"):
        # Directory pattern - check if file is within this directory
        return path.startswith(pattern) or fnmatch.fnmatch(path + "/", pattern)
    else:
        # File pattern
        return fnmatch.fnmatch(path, pattern)


async def get_codebase_snapshot(
    repo_path: Path, just_paths: bool = False, log_function=print, git_command_func=None
) -> tuple[list[str], dict[str, str]]:
    """Get a snapshot of the codebase.

    Args:
        repo_path: Path to the repository.
        just_paths: If True, return only file paths without contents.
        log_function: Function to use for logging.
        git_command_func: Optional Git command function (for mocking).

    Returns:
        Tuple of (file_paths, file_contents).
    """
    mode_str = "paths" if just_paths else "full"
    log_function(f"Getting codebase snapshot in mode: {mode_str}")

    # Get the .gitignore patterns
    gitignore_patterns = []
    gitignore_path = repo_path / ".gitignore"
    if gitignore_path.exists():
        gitignore_patterns = [
            line.strip()
            for line in gitignore_path.read_text().strip().split("\n")
            if line.strip() and not line.strip().startswith("#")
        ]
        log_function(f"Found .gitignore with {len(gitignore_patterns)} patterns")

    # Get tracked files
    tracked_files = await run_git_command(repo_path, ["ls-files"], git_command_func)
    tracked_file_list = tracked_files.strip().split("\n") if tracked_files else []

    # Get untracked files (not ignored by .gitignore)
    untracked_files = await run_git_command(
        repo_path, ["ls-files", "--others", "--exclude-standard"], git_command_func
    )
    untracked_file_list = untracked_files.strip().split("\n") if untracked_files else []

    # Combine all files
    all_files = set(tracked_file_list + untracked_file_list)
    # Filter out empty strings
    all_files = {f for f in all_files if f}

    # Check for additional ignore files (.yellhornignore and .yellhorncontext)
    yellhornignore_path = repo_path / ".yellhornignore"
    yellhornignore_patterns = []
    if yellhornignore_path.exists():
        yellhornignore_patterns = [
            line.strip()
            for line in yellhornignore_path.read_text().strip().split("\n")
            if line.strip() and not line.strip().startswith("#")
        ]
        log_function(f"Found .yellhornignore with {len(yellhornignore_patterns)} patterns")

    # Parse .yellhorncontext patterns (supports blacklist, whitelist, and negation)
    yellhorncontext_path = repo_path / ".yellhorncontext"
    context_blacklist_patterns = []
    context_whitelist_patterns = []
    context_negation_patterns = []
    if yellhorncontext_path.exists():
        lines = [
            line.strip()
            for line in yellhorncontext_path.read_text().strip().split("\n")
            if line.strip() and not line.strip().startswith("#")
        ]
        # Separate patterns by type
        for line in lines:
            if line.startswith("!"):
                # Blacklist pattern (exclude this directory/file)
                context_blacklist_patterns.append(line[1:])  # Remove the '!' prefix
            else:
                # All other patterns are whitelist (directories/files to include)
                context_whitelist_patterns.append(line)

        log_function(
            f"Found .yellhorncontext with {len(context_whitelist_patterns)} whitelist, "
            f"{len(context_blacklist_patterns)} blacklist, and {len(context_negation_patterns)} negation patterns"
        )

    # Categorize files by priority
    # Priority: yellhorncontext whitelist > yellhorncontext blacklist > yellhornignore whitelist > yellhornignore blacklist > gitignore blacklist

    # Categories for files
    yellhorncontext_whitelist_files = []
    yellhorncontext_blacklist_files = []
    yellhornignore_whitelist_files = []
    yellhornignore_blacklist_files = []
    gitignore_blacklist_files = []
    other_files = []
    always_ignored_count = 0

    # Parse .yellhornignore patterns to separate whitelist and blacklist
    yellhornignore_whitelist_patterns = []
    yellhornignore_blacklist_patterns = []
    for pattern in yellhornignore_patterns:
        if pattern.startswith("!"):
            # Whitelist pattern in yellhornignore (negation)
            yellhornignore_whitelist_patterns.append(pattern[1:])
        else:
            # Blacklist pattern in yellhornignore
            yellhornignore_blacklist_patterns.append(pattern)

    # Process each file and categorize it (excluding always-ignored files)
    for file_path in sorted(all_files):
        # Skip files matching always-ignore patterns
        should_ignore = False
        for pattern in ALWAYS_IGNORE_PATTERNS:
            if matches_pattern(file_path, pattern):
                should_ignore = True
                break
        if should_ignore:
            always_ignored_count += 1
            continue

        # Determine which category this file belongs to
        is_context_whitelisted = any(
            matches_pattern(file_path, p) for p in context_whitelist_patterns
        )
        is_context_blacklisted = any(
            matches_pattern(file_path, p) for p in context_blacklist_patterns
        )
        is_ignore_whitelisted = any(
            matches_pattern(file_path, p) for p in yellhornignore_whitelist_patterns
        )
        is_ignore_blacklisted = any(
            matches_pattern(file_path, p) for p in yellhornignore_blacklist_patterns
        )

        # If we have context whitelist patterns, only include files that match them
        if context_whitelist_patterns and not is_context_whitelisted:
            continue  # Skip files not in whitelist

        # Categorize based on priority
        if is_context_whitelisted and not is_context_blacklisted:
            yellhorncontext_whitelist_files.append(file_path)
        elif is_context_whitelisted and is_context_blacklisted:
            yellhorncontext_blacklist_files.append(file_path)
        elif is_context_blacklisted:
            yellhorncontext_blacklist_files.append(file_path)
        elif is_ignore_whitelisted:
            yellhornignore_whitelist_files.append(file_path)
        elif is_ignore_blacklisted:
            yellhornignore_blacklist_files.append(file_path)
        else:
            # File doesn't match any special patterns
            other_files.append(file_path)

    # Files to include in priority order (excluding blacklisted files)
    # Priority: yellhorncontext whitelist > yellhornignore whitelist > other files (only if no .yellhorncontext)
    if yellhorncontext_path.exists():
        # If .yellhorncontext exists, only include whitelisted files
        files_to_include = yellhorncontext_whitelist_files
    else:
        # If no .yellhorncontext, include other files as well
        files_to_include = yellhornignore_whitelist_files + other_files

    # Log filtering results
    total_files = len(all_files)
    log_function(f"File categorization results out of {total_files} files:")
    if always_ignored_count > 0:
        log_function(f"  - {always_ignored_count} always ignored (images, binaries, configs, etc.)")
    log_function(
        f"  - {len(yellhorncontext_whitelist_files)} in yellhorncontext whitelist (included)"
    )
    log_function(
        f"  - {len(yellhorncontext_blacklist_files)} in yellhorncontext blacklist (excluded)"
    )
    log_function(
        f"  - {len(yellhornignore_whitelist_files)} in yellhornignore whitelist (included)"
    )
    log_function(
        f"  - {len(yellhornignore_blacklist_files)} in yellhornignore blacklist (excluded)"
    )
    if yellhorncontext_path.exists():
        log_function(f"  - {len(other_files)} other files (excluded - .yellhorncontext exists)")
    else:
        log_function(f"  - {len(other_files)} other files (included - no .yellhorncontext)")
    log_function(
        f"Total included: {len(files_to_include)} files (excluded {always_ignored_count} always-ignored files)"
    )

    # Use the prioritized list of files to include
    file_paths = files_to_include

    # If just_paths is True, return empty file contents
    if just_paths:
        return file_paths, {}

    # Read file contents for full mode
    file_contents = {}
    MAX_FILE_SIZE = 1024 * 1024  # 1MB limit per file
    skipped_large_files = 0

    for file_path in file_paths:
        full_path = repo_path / file_path
        try:
            # Check file size first
            if full_path.stat().st_size > MAX_FILE_SIZE:
                skipped_large_files += 1
                continue

            # Try to read as text
            content = full_path.read_text(encoding="utf-8", errors="ignore")
            file_contents[file_path] = content
        except Exception:
            # Skip files that can't be read
            continue

    if skipped_large_files > 0:
        log_function(f"Skipped {skipped_large_files} files larger than 1MB")

    log_function(f"Read contents of {len(file_contents)} files")

    return file_paths, file_contents
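For orientation, here is a minimal sketch of how get_codebase_snapshot might be called from an async entry point. The import path and the use of the current directory as the repository are assumptions for illustration, not the project's documented API; the function only requires a local Git checkout that run_git_command can query.

import asyncio
from pathlib import Path

# Assumed module path; adjust to wherever codebase_snapshot.py lives in the package.
from yellhorn_mcp.utils.codebase_snapshot import get_codebase_snapshot


async def main() -> None:
    repo = Path(".")  # hypothetical: any local Git checkout

    # Paths-only mode: filtering runs, but no file contents are read.
    paths, _ = await get_codebase_snapshot(repo, just_paths=True)
    print(f"{len(paths)} files survived filtering")

    # Full mode: also returns a {path: text} map, skipping files over 1MB.
    paths, contents = await get_codebase_snapshot(repo)
    total_chars = sum(len(text) for text in contents.values())
    print(f"Read {len(contents)} files ({total_chars} characters of context)")


if __name__ == "__main__":
    asyncio.run(main())

Note that if the repository contains a .yellhorncontext file, only paths matching its whitelist patterns are returned; without one, .yellhornignore negations and otherwise uncategorized files are included as well.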
