import fnmatch
import glob
import os
from pathlib import Path, PurePosixPath
from typing import Dict, List, Optional, Set
def _glob_segments_match(path_parts: List[str], pattern_parts: List[str]) -> bool:
    """Return True if ``path_parts`` fully matches the glob ``pattern_parts``.

    Both arguments are "/"-split segment lists. A ``**`` segment spans zero or
    more directories, except as the final segment, where it needs at least one
    remaining path segment (so ``target/**`` matches what is inside ``target``
    but not a file that is itself called ``target``). Every other segment is
    matched case-sensitively with ``fnmatch.fnmatchcase``.
    """
    if not pattern_parts:
        return not path_parts
    head, rest = pattern_parts[0], pattern_parts[1:]
    if head == "**":
        if not rest:
            return bool(path_parts)
        return any(
            _glob_segments_match(path_parts[i:], rest)
            for i in range(len(path_parts) + 1)
        )
    if not path_parts:
        return False
    return fnmatch.fnmatchcase(path_parts[0], head) and _glob_segments_match(
        path_parts[1:], rest
    )


def read_paths(
    repo_root: Path,
    path_patterns: List[str],
    max_file_bytes: int = 200_000,
    max_total_bytes: int = 2_000_000,
    include_extensions: Optional[List[str]] = None,
    exclude_globs: Optional[List[str]] = None,
) -> Dict[str, str]:
    """
    Read a bounded amount of repo text into a single context dict.

    Enforces the repo boundary and the per-file / total size limits.

    Args:
        repo_root: Directory that every returned file must live under
            (after symlink resolution).
        path_patterns: Glob patterns, interpreted relative to ``repo_root``.
            ``**`` enables recursive matching.
        max_file_bytes: Files whose on-disk size exceeds this are skipped.
        max_total_bytes: Budget for the sum of the UTF-8 sizes of the
            returned texts. When the next file would exceed it, the rest of
            the current pattern's matches are skipped.
        include_extensions: If non-empty, only files whose suffix is listed
            are read. Entries may be given with or without the leading dot
            (``".py"`` or ``"py"``); matching is case-sensitive.
        exclude_globs: Glob patterns matched against the POSIX-style path
            relative to ``repo_root``. ``**`` spans any number of
            directories. Patterns are also tried with
            ``PurePosixPath.match`` (right-anchored), so a bare ``*.pyc``
            excludes at any depth. ``None`` selects a default list of VCS,
            dependency and build-output locations.

    Returns:
        Mapping of ``str(path relative to repo_root)`` to the file's text,
        decoded as UTF-8 with undecodable bytes replaced.
    """
    # Default excludes if none provided
    if exclude_globs is None:
        exclude_globs = [
            ".git/**", "**/.git/**", "node_modules/**", "**/node_modules/**",
            "target/**", "**/target/**", "dist/**", "**/dist/**",
            "**/__pycache__/**", "**/.DS_Store", "**/*.pyc",
        ]
    # Split each exclude pattern once instead of once per candidate file.
    exclude_specs = [(pattern, pattern.split("/")) for pattern in exclude_globs]

    # Accept "py" as well as ".py"; "" (no suffix) is kept as-is.
    allowed_suffixes: Optional[Set[str]] = None
    if include_extensions:
        allowed_suffixes = {
            ext if not ext or ext.startswith(".") else "." + ext
            for ext in include_extensions
        }

    total_bytes = 0
    out: Dict[str, str] = {}
    # Resolve repo root once
    repo_root = repo_root.resolve()
    # Escape the root so glob metacharacters in its path ("[", "*", "?")
    # are taken literally; only the caller's pattern is interpreted.
    escaped_root = glob.escape(str(repo_root))
    processed_paths: Set[Path] = set()

    for pattern in path_patterns:
        full_pattern = os.path.join(escaped_root, pattern)
        # Sorted so that which files fit in the byte budget is deterministic.
        matched_paths = sorted(
            glob.glob(full_pattern, recursive="**" in pattern)
        )
        for p in matched_paths:
            try:
                fp = Path(p).resolve()
            except (OSError, RuntimeError):
                # Symlink loops and similar: skip the entry, keep going.
                continue

            # 1. Enforce repo boundary (symlinks may point outside the repo)
            try:
                rel_path = fp.relative_to(repo_root)
            except ValueError:
                continue
            if not fp.is_file() or fp in processed_paths:
                continue

            # 2. Check excludes (using POSIX path for matching)
            posix_rel = rel_path.as_posix()
            posix_path = PurePosixPath(posix_rel)
            rel_parts = posix_rel.split("/")
            # PurePosixPath.match alone treats "**" like "*" and anchors on
            # the right, so it misses deeply nested files; the segment
            # matcher adds real "**" semantics on top of it.
            if any(
                posix_path.match(exc) or _glob_segments_match(rel_parts, exc_parts)
                for exc, exc_parts in exclude_specs
            ):
                continue

            # 3. Check extensions
            if allowed_suffixes is not None and fp.suffix not in allowed_suffixes:
                continue

            # 4. Check size constraints and read
            try:
                size = fp.stat().st_size
                if size > max_file_bytes:
                    continue
                if total_bytes + size > max_total_bytes:
                    break
                text = fp.read_text(encoding="utf-8", errors="replace")
            except OSError:
                # Unreadable file (permissions, vanished, ...): best effort.
                continue

            out[str(rel_path)] = text
            # Count bytes as length of text in utf-8 (may differ from the
            # on-disk size after replacement / newline translation).
            total_bytes += len(text.encode("utf-8"))
            processed_paths.add(fp)

        if total_bytes >= max_total_bytes:
            break
    return out