We provide all the information about MCP servers via our MCP API.
curl -X GET 'https://glama.ai/api/mcp/v1/servers/msnidal/yellhorn-mcp'
If you have feedback or need assistance with the MCP directory API, please join our Discord server.
"""Codebase snapshot functionality for fetching and filtering repository files."""
import fnmatch
from pathlib import Path
from yellhorn_mcp.utils.git_utils import run_git_command
# Global set of file patterns and extensions to always ignore.
# These are files that should never be included in AI context as they are:
# - Binary/compiled artifacts
# - Auto-generated files
# - Environment-specific configurations
# - Transient cache/logs
#
# Every entry is a glob handed to matches_pattern(); entries ending in "/" are
# treated there as directory patterns, everything else as file patterns.
# Because this is a set, iteration order is arbitrary and duplicate literals
# (e.g. "*.bak" appears in two sections below) collapse to a single entry.
ALWAYS_IGNORE_PATTERNS = {
    # === Compiled binaries & libraries ===
    # Machine-code artifacts with no source information
    "*.exe",
    "*.dll",
    "*.so",
    "*.dylib",
    "*.bin",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    "*.class",
    "*.o",
    "*.a",
    "*.lib",
    "*.ko",
    "*.elf",
    "*.dSYM/",
    "*.wasm",
    # === Databases & model checkpoints ===
    # Large binary blobs, ML checkpoints, experiment tracking
    "*.db",
    "*.sqlite",
    "*.sqlite3",
    "*.sqlite-journal",
    "*.sqlite-wal",
    "milvus.db",
    "milvus.db.lock",
    "*models_ckpt_cache*",
    "*/wandb/*",
    "wandb/",
    "mlruns/",
    "mlartifacts/",
    "*.ipynb_checkpoints",
    ".ipynb_checkpoints/",
    "*.ckpt",
    "*.h5",
    "*.hdf5",
    "*.pkl",
    "*.pickle",
    "*.joblib",
    "*.npy",
    "*.npz",
    "*.parquet",
    "*.feather",
    "*.arrow",
    # === Lock files & dependency manifests ===
    # Auto-generated to pin dependency versions
    # ("*.lock" already covers the explicitly named *.lock files below; they
    # are listed individually for readability.)
    "*.lock",
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "uv.lock",
    "Pipfile.lock",
    "composer.lock",
    "Gemfile.lock",
    "go.sum",
    "Cargo.lock",
    "pubspec.lock",
    "mix.lock",
    # === Environment & configuration files ===
    # Environment-specific settings (see the header rationale above).
    # The explicit ".env.<name>" entries are also covered by ".env.*".
    ".env",
    ".env.*",
    ".env.local",
    ".env.development",
    ".env.production",
    ".env.test",
    "*.local",
    "*local-only*",
    # === Logs & run outputs ===
    # Ephemeral run or CI output
    "*.log",
    "*.log.*",
    "logs/",
    "log/",
    "super-linter.log",
    "events.out.tfevents*",
    "*.tfevents",
    "go/logs.jsonl",
    "*.out",
    "*.err",
    "nohup.out",
    "*.pid",
    # === Test artifacts & outputs ===
    # Generated when running tests
    "*.test",
    ".test",
    "test_output/",
    "testoutput/",
    "test-results/",
    "*/__test__/",
    "*/__tests__/",
    "*.snap",
    "*.snapshot",
    "coverage/",
    "htmlcov/",
    ".coverage",
    "coverage.xml",
    "*.coverage",
    "*.cover",
    "*.gcov",
    "*.gcda",
    "*.gcno",
    "junit.xml",
    "test-report.xml",
    # === Cache directories ===
    # Speed up tooling, never manually edited
    "__pycache__/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".ruff_cache/",
    ".hypothesis/",
    ".tox/",
    ".nox/",
    ".pyre/",
    "go/tmp/",
    "go/temp/",
    "tmp/",
    "temp/",
    "cache/",
    ".cache/",
    ".sass-cache/",
    ".parcel-cache/",
    ".next/",
    ".nuxt/",
    ".vuepress/dist/",
    ".docusaurus/",
    ".serverless/",
    ".fusebox/",
    ".dynamodb/",
    ".yarn/cache/",
    ".yarn/install-state.gz",
    # === Virtual environments ===
    # Directory names commonly used for Python virtual environments
    "env/",
    "venv/",
    "virtualenv/",
    ".venv/",
    "ENV/",
    "env.bak/",
    "venv.bak/",
    # === IDE & editor files ===
    # Per-user editor state, swap files and backups
    ".idea/",
    ".vscode/",
    "*.swp",
    "*.swo",
    "*.swn",
    "*.bak",
    "*~",
    ".project",
    ".classpath",
    ".settings/",
    "*.sublime-project",
    "*.sublime-workspace",
    ".kate-swp",
    ".ropeproject/",
    # === Build artifacts & outputs ===
    # Output directories and packaging artifacts
    "build/",
    "dist/",
    "out/",
    "target/",
    "bin/",
    "obj/",
    "*.egg-info/",
    "*.egg",
    "develop-eggs/",
    "downloads/",
    "eggs/",
    ".eggs/",
    "lib/",
    "lib64/",
    "parts/",
    "sdist/",
    "var/",
    "wheels/",
    "*.whl",
    "share/python-wheels/",
    "MANIFEST",
    # === Node.js specific ===
    # Installed third-party packages and package-manager state
    "node_modules/",
    "jspm_packages/",
    "bower_components/",
    ".npm/",
    "web_modules/",
    ".pnp.*",
    # === Container & infrastructure files ===
    "**/Dockerfile*",
    "docker-compose*.yml",
    "docker-compose*.yaml",
    "*.override.*",
    "*/entrypoint.sh",
    ".dockerignore",
    "go/run",
    "go/runx.sh",
    # === Terraform & IaC ===
    ".terraform/",
    ".terraform.lock.hcl",
    "*.tfstate",
    "*.tfstate.*",
    "*.tfplan",
    "*.tfvars",
    "terraform.tfvars",
    # === Generated documentation ===
    "site/",
    "docs/_build/",
    "docs/.vuepress/dist/",
    "_site/",
    ".jekyll-cache/",
    ".jekyll-metadata",
    "public/",
    "*.pdf",
    # === Archive files ===
    "*.zip",
    "*.tar",
    "*.tar.gz",
    "*.tgz",
    "*.tar.bz2",
    "*.tbz2",
    "*.tar.xz",
    "*.txz",
    "*.rar",
    "*.7z",
    "*.gz",
    "*.bz2",
    "*.xz",
    "*.Z",
    "*.deb",
    "*.rpm",
    "*.dmg",
    "*.iso",
    "*.jar",
    "*.war",
    "*.ear",
    # === Media files ===
    # Images
    "*.png",
    "*.jpg",
    "*.jpeg",
    "*.gif",
    "*.bmp",
    "*.svg",
    "*.ico",
    "*.webp",
    "*.tiff",
    "*.tif",
    "*.psd",
    "*.ai",
    "*.eps",
    "*.raw",
    # Video
    "*.mp4",
    "*.avi",
    "*.mov",
    "*.wmv",
    "*.flv",
    "*.webm",
    "*.mkv",
    "*.m4v",
    "*.mpg",
    "*.mpeg",
    "*.3gp",
    # Audio
    "*.mp3",
    "*.wav",
    "*.flac",
    "*.aac",
    "*.ogg",
    "*.wma",
    "*.m4a",
    "*.opus",
    "*.ape",
    # === Fonts ===
    "*.ttf",
    "*.otf",
    "*.woff",
    "*.woff2",
    "*.eot",
    # === OS specific files ===
    # Metadata files created by the operating system's file manager
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
    "*.lnk",
    # NOTE: the "\r" escape below is intentional - this entry is the name
    # "Icon" followed by a literal carriage-return character.
    "Icon\r",
    ".Spotlight-V100/",
    ".Trashes/",
    "ehthumbs.db",
    "ehthumbs_vista.db",
    "*.stackdump",
    "[Dd]esktop.ini",
    "$RECYCLE.BIN/",
    # === Temporary files ===
    # Scratch, backup and merge-conflict leftovers
    "*.tmp",
    "*.temp",
    # Duplicate of the "*.bak" entry under "IDE & editor files"; harmless in a set.
    "*.bak",
    "*.backup",
    "*.old",
    "*.orig",
    "*.rej",
    "*.BACKUP.*",
    "*.BASE.*",
    "*.LOCAL.*",
    "*.REMOTE.*",
    # === Security & secrets ===
    # Keys, certificates and keystores - never send these to a model
    "*.key",
    "*.pem",
    "*.p12",
    "*.pfx",
    "*.cert",
    "*.crt",
    "*.csr",
    "*.jks",
    "*.keystore",
    "id_rsa",
    "id_rsa.pub",
    "id_dsa",
    "id_dsa.pub",
    "*.gpg",
    # === Version control & tool config ===
    # VCS metadata plus ignore/config files for linters and assistants,
    # including this tool's own .yellhornignore / .yellhorncontext
    ".git/",
    ".gitignore",
    ".gitattributes",
    ".gitmodules",
    ".hg/",
    ".hgignore",
    ".svn/",
    ".bzr/",
    ".yellhornignore",
    ".yellhorncontext",
    ".continueignore",
    ".cursorignore",
    ".sourcery.yaml",
    ".pre-commit-config.yaml",
    ".editorconfig",
    ".prettierrc*",
    ".eslintrc*",
    ".stylelintrc*",
    ".markdownlint*",
    # === Example & fixture files ===
    "example-*.yml",
    "example-*.yaml",
    "example-*.json",
    "fixture-*",
    "fixtures/",
    "examples/",
    "samples/",
    # === Minified files ===
    "*.min.js",
    "*.min.css",
    "*.min.map",
    # === Source maps ===
    # ("*.map" already covers the two more specific entries that follow.)
    "*.map",
    "*.js.map",
    "*.css.map",
}
def matches_pattern(path: str, pattern: str) -> bool:
    """Return True if the repo-relative *path* is selected by *pattern*.

    Matching is gitignore-like and built on ``fnmatch`` (so ``*`` also
    matches ``/``):

    * A pattern ending in ``/`` is a directory pattern. It matches every file
      located inside a matching directory. If the pattern contains no other
      ``/`` (e.g. ``node_modules/``, ``*.egg-info/``) the directory may sit
      at any depth; otherwise (e.g. ``docs/_build/``) it is compared against
      directory paths starting at the repository root.
    * Any other pattern is a file pattern. It is matched against the whole
      path, and, when it contains no ``/`` (e.g. ``.env``), against the file
      name alone so that it applies at any depth.
    * A leading ``**/`` additionally allows a match directly at the root
      (``**/Dockerfile*`` matches ``Dockerfile``).

    Args:
        path: File path using ``/`` separators, relative to the repo root.
        pattern: Glob pattern as described above.

    Returns:
        True when the path matches, False otherwise.
    """
    is_dir_pattern = pattern.endswith("/")

    # Legacy checks first, so everything that matched before still matches.
    if is_dir_pattern:
        if path.startswith(pattern) or fnmatch.fnmatch(path + "/", pattern):
            return True
    elif fnmatch.fnmatch(path, pattern):
        return True

    # "**/x" must also match "x" at the root; fnmatch alone demands a "/".
    if pattern.startswith("**/"):
        return matches_pattern(path, pattern[3:])

    parts = path.split("/")
    core = pattern.rstrip("/")
    # A pattern with no inner "/" is not anchored and may match at any depth.
    floating = "/" not in core

    if not is_dir_pattern:
        return floating and fnmatch.fnmatch(parts[-1], pattern)

    if not core:
        return False

    # Test every ancestor directory of the file against the directory glob.
    for depth in range(1, len(parts)):
        if fnmatch.fnmatch("/".join(parts[:depth]), core):
            return True
        if floating and fnmatch.fnmatch(parts[depth - 1], core):
            return True
    return False
def _read_pattern_file(pattern_file: Path) -> list[str]:
    """Return the stripped, non-blank, non-comment lines of *pattern_file*.

    Returns an empty list when the file does not exist.
    """
    if not pattern_file.exists():
        return []
    stripped = (line.strip() for line in pattern_file.read_text().splitlines())
    return [line for line in stripped if line and not line.startswith("#")]


def _split_negated(patterns: list[str]) -> tuple[list[str], list[str]]:
    """Split *patterns* into (plain, negated); negated ones lose their leading '!'."""
    plain = [p for p in patterns if not p.startswith("!")]
    negated = [p[1:] for p in patterns if p.startswith("!")]
    return plain, negated


async def get_codebase_snapshot(
    repo_path: Path, just_paths: bool = False, log_function=print, git_command_func=None
) -> tuple[list[str], dict[str, str]]:
    """Get a snapshot of the codebase.

    Files are discovered with ``git ls-files`` (tracked files plus untracked
    files that are not excluded by git's standard ignore rules) and filtered:

    1. Anything matching ``ALWAYS_IGNORE_PATTERNS`` is dropped.
    2. If ``.yellhorncontext`` exists, only files matching one of its plain
       (whitelist) lines and none of its ``!``-prefixed (blacklist) lines are
       kept; ``.yellhornignore`` then has no effect on the result.
    3. Otherwise files matching a ``!``-prefixed line of ``.yellhornignore``
       come first, followed by files matching no ``.yellhornignore`` line;
       files matching only a plain ``.yellhornignore`` line are excluded.

    Args:
        repo_path: Path to the repository.
        just_paths: If True, return only file paths without contents.
        log_function: Callable invoked with one message string per log line.
        git_command_func: Optional Git command function (for mocking); it is
            forwarded unchanged to ``run_git_command``.

    Returns:
        Tuple of (file_paths, file_contents). ``file_contents`` maps a path to
        its text decoded as UTF-8 with undecodable bytes dropped. It is empty
        when ``just_paths`` is True, and it omits files larger than 1MB or
        files that could not be read, so it may contain fewer entries than
        ``file_paths``.
    """
    mode_str = "paths" if just_paths else "full"
    log_function(f"Getting codebase snapshot in mode: {mode_str}")

    # .gitignore is only reported here; the actual gitignore filtering is
    # delegated to git through --exclude-standard below.
    gitignore_path = repo_path / ".gitignore"
    if gitignore_path.exists():
        gitignore_patterns = _read_pattern_file(gitignore_path)
        log_function(f"Found .gitignore with {len(gitignore_patterns)} patterns")

    # Tracked files, then untracked files that git does not ignore.
    tracked_output = await run_git_command(repo_path, ["ls-files"], git_command_func)
    untracked_output = await run_git_command(
        repo_path, ["ls-files", "--others", "--exclude-standard"], git_command_func
    )
    all_files = {
        name
        for output in (tracked_output, untracked_output)
        if output
        for name in output.strip().split("\n")
        if name
    }

    # .yellhornignore: plain lines exclude, '!'-prefixed lines re-include.
    yellhornignore_path = repo_path / ".yellhornignore"
    yellhornignore_blacklist_patterns: list[str] = []
    yellhornignore_whitelist_patterns: list[str] = []
    if yellhornignore_path.exists():
        yellhornignore_patterns = _read_pattern_file(yellhornignore_path)
        log_function(f"Found .yellhornignore with {len(yellhornignore_patterns)} patterns")
        yellhornignore_blacklist_patterns, yellhornignore_whitelist_patterns = _split_negated(
            yellhornignore_patterns
        )

    # .yellhorncontext: plain lines include, '!'-prefixed lines exclude
    # (the opposite polarity to .yellhornignore).
    yellhorncontext_path = repo_path / ".yellhorncontext"
    has_context_file = yellhorncontext_path.exists()
    context_whitelist_patterns: list[str] = []
    context_blacklist_patterns: list[str] = []
    # NOTE: never populated; kept only so the log line below is unchanged.
    context_negation_patterns: list[str] = []
    if has_context_file:
        context_whitelist_patterns, context_blacklist_patterns = _split_negated(
            _read_pattern_file(yellhorncontext_path)
        )
        log_function(
            f"Found .yellhorncontext with {len(context_whitelist_patterns)} whitelist, "
            f"{len(context_blacklist_patterns)} blacklist, and {len(context_negation_patterns)} negation patterns"
        )

    # Categorize files. Priority:
    # yellhorncontext blacklist > yellhorncontext whitelist >
    # yellhornignore whitelist > yellhornignore blacklist > everything else.
    yellhorncontext_whitelist_files = []
    yellhorncontext_blacklist_files = []
    yellhornignore_whitelist_files = []
    yellhornignore_blacklist_files = []
    other_files = []
    always_ignored_count = 0

    for file_path in sorted(all_files):
        if any(matches_pattern(file_path, p) for p in ALWAYS_IGNORE_PATTERNS):
            always_ignored_count += 1
            continue

        is_context_whitelisted = any(
            matches_pattern(file_path, p) for p in context_whitelist_patterns
        )
        # With a context whitelist in force, non-matching files are dropped
        # without being counted in any category.
        if context_whitelist_patterns and not is_context_whitelisted:
            continue

        if any(matches_pattern(file_path, p) for p in context_blacklist_patterns):
            yellhorncontext_blacklist_files.append(file_path)
        elif is_context_whitelisted:
            yellhorncontext_whitelist_files.append(file_path)
        elif any(matches_pattern(file_path, p) for p in yellhornignore_whitelist_patterns):
            yellhornignore_whitelist_files.append(file_path)
        elif any(matches_pattern(file_path, p) for p in yellhornignore_blacklist_patterns):
            yellhornignore_blacklist_files.append(file_path)
        else:
            other_files.append(file_path)

    if has_context_file:
        # If .yellhorncontext exists, only include its whitelisted files.
        files_to_include = yellhorncontext_whitelist_files
    else:
        # Otherwise re-included files first, then all uncategorized files.
        files_to_include = yellhornignore_whitelist_files + other_files

    # Log filtering results
    total_files = len(all_files)
    log_function(f"File categorization results out of {total_files} files:")
    if always_ignored_count > 0:
        log_function(f"  - {always_ignored_count} always ignored (images, binaries, configs, etc.)")
    log_function(
        f"  - {len(yellhorncontext_whitelist_files)} in yellhorncontext whitelist (included)"
    )
    log_function(
        f"  - {len(yellhorncontext_blacklist_files)} in yellhorncontext blacklist (excluded)"
    )
    log_function(
        f"  - {len(yellhornignore_whitelist_files)} in yellhornignore whitelist (included)"
    )
    log_function(
        f"  - {len(yellhornignore_blacklist_files)} in yellhornignore blacklist (excluded)"
    )
    if has_context_file:
        log_function(f"  - {len(other_files)} other files (excluded - .yellhorncontext exists)")
    else:
        log_function(f"  - {len(other_files)} other files (included - no .yellhorncontext)")
    log_function(
        f"Total included: {len(files_to_include)} files (excluded {always_ignored_count} always-ignored files)"
    )

    file_paths = files_to_include
    if just_paths:
        return file_paths, {}

    # Read file contents for full mode
    file_contents = {}
    MAX_FILE_SIZE = 1024 * 1024  # 1MB limit per file
    skipped_large_files = 0
    unreadable_files = 0
    for file_path in file_paths:
        full_path = repo_path / file_path
        try:
            # Check the size first so oversized files are never loaded.
            if full_path.stat().st_size > MAX_FILE_SIZE:
                skipped_large_files += 1
                continue
            file_contents[file_path] = full_path.read_text(encoding="utf-8", errors="ignore")
        except (OSError, ValueError):
            # Best effort: entries that are missing on disk, are directories,
            # or are otherwise unreadable are skipped but counted.
            unreadable_files += 1

    if skipped_large_files > 0:
        log_function(f"Skipped {skipped_large_files} files larger than 1MB")
    if unreadable_files > 0:
        log_function(f"Skipped {unreadable_files} files that could not be read")
    log_function(f"Read contents of {len(file_contents)} files")
    return file_paths, file_contents