#!/usr/bin/env python3
"""
Orphaned File Detection Script
Finds files and directories that may be unused, redundant, or orphaned in the repository.
This helps maintain a lean and clean codebase by identifying cleanup candidates.
Usage:
    python scripts/find_orphaned_files.py
    python scripts/find_orphaned_files.py --include-safe-files
    python scripts/find_orphaned_files.py --verbose
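
Notes:
    Detection is regex-based and heuristic; treat the report as candidates
    for manual review, not a definitive list of files to delete.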
"""
import os
import re
import fnmatch
import argparse
from pathlib import Path
from typing import Set, List, Dict, Tuple
from collections import defaultdict


class OrphanDetector:
    def __init__(self, repo_root: Path, include_safe_files: bool = False, verbose: bool = False):
        self.repo_root = repo_root
        self.include_safe_files = include_safe_files
        self.verbose = verbose
        
        # Files/dirs to always ignore
        self.ignore_patterns = {
            '.git', '.venv', '__pycache__', '.pytest_cache', 'node_modules',
            '.DS_Store', '.gitignore', '.gitattributes', 'LICENSE', 'CHANGELOG.md',
            '*.pyc', '*.pyo', '*.egg-info', 'dist', 'build'
        }
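        # Entries may be literal names or fnmatch-style globs like '*.pyc';
        # they are matched against individual path components, not substrings.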
        
        # Safe files that are commonly unreferenced but important
        self.safe_files = {
            'README.md', 'pyproject.toml', 'uv.lock', 'setup.py', 'requirements.txt',
            'Dockerfile', 'docker-compose.yml', '.dockerignore', 'Makefile',
            '__init__.py', 'main.py', 'server.py', 'config.py', 'settings.py'
        }
        
        # Extensions that are likely to be referenced
        self.code_extensions = {'.py', '.js', '.ts', '.sh', '.md', '.yml', '.yaml', '.json'}
        
    def should_ignore(self, path: Path) -> bool:
        """Check if a path should be ignored."""
        # Compare components relative to the repo root so directories above
        # the repo (e.g. a parent named 'build') cannot trigger a match.
        try:
            parts = path.relative_to(self.repo_root).parts
        except ValueError:
            parts = path.parts
        # fnmatch handles literal names and glob entries like '*.pyc' alike.
        return any(fnmatch.fnmatch(part, pat)
                   for part in parts for pat in self.ignore_patterns)
    
    def is_safe_file(self, path: Path) -> bool:
        """Check if a file is considered 'safe' (commonly unreferenced but important)."""
        return path.name in self.safe_files
    
    def find_all_files(self) -> List[Path]:
        """Find all files in the repository."""
        all_files = []
        for root, dirs, files in os.walk(self.repo_root):
            # Prune ignored directories in place so os.walk skips them; exact
            # or glob matching avoids dropping e.g. 'distribution' for 'dist'.
            dirs[:] = [d for d in dirs
                       if not any(fnmatch.fnmatch(d, pat) for pat in self.ignore_patterns)]
            
            for file in files:
                file_path = Path(root) / file
                if not self.should_ignore(file_path):
                    all_files.append(file_path)
        
        return all_files
    
    def extract_references(self, file_path: Path) -> Set[str]:
        """Extract potential file references from a file."""
        references = set()
        
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                
            # Find various types of references
            patterns = [
                # Python imports: from module import, import module
                r'(?:from\s+|import\s+)([a-zA-Z_][a-zA-Z0-9_.]*)',
                # File paths in quotes
                r'["\']([^"\']*\.[a-zA-Z0-9]+)["\']',
                # Common file references
                r'([a-zA-Z_][a-zA-Z0-9_.-]*\.[a-zA-Z0-9]+)',
                # Directory references
                r'([a-zA-Z_][a-zA-Z0-9_-]*/)(?:[a-zA-Z0-9_.-]+)',
            ]
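            # These broad patterns over-match, which is the safer failure mode
            # here: a stray hit merely marks a file as referenced, rather than
            # flagging a file that is actually in use as an orphan.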
            
            for pattern in patterns:
                matches = re.findall(pattern, content, re.MULTILINE)
                references.update(matches)
                
        except Exception as e:
            if self.verbose:
                print(f"Warning: Could not read {file_path}: {e}")
                
        return references
    
    def build_reference_map(self, files: List[Path]) -> Dict[str, Set[Path]]:
        """Build a map of what files reference what."""
        reference_map = defaultdict(set)
        
        for file_path in files:
            if file_path.suffix in self.code_extensions:
                references = self.extract_references(file_path)
                for ref in references:
                    reference_map[ref].add(file_path)
                    
        return reference_map
    
    def find_orphaned_files(self) -> Tuple[List[Path], List[Path], List[Path]]:
        """Find potentially orphaned files."""
        all_files = self.find_all_files()
        reference_map = self.build_reference_map(all_files)
        
        potentially_orphaned = []
        safe_unreferenced = []
        directories_to_check = []
        
        for file_path in all_files:
            rel_path = file_path.relative_to(self.repo_root)
            file_name = file_path.name
            file_stem = file_path.stem
            
            # Check if file is referenced
            is_referenced = False
            
            # Check various forms of references
            reference_forms = [
                file_name,
                file_stem,
                str(rel_path),
                str(rel_path).replace('/', '.'),  # Python module style
                file_stem.replace('_', '-'),      # kebab-case variants
                file_stem.replace('-', '_'),      # snake_case variants
            ]
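            # Example (hypothetical file): scripts/run_tests.py yields
            # 'run_tests.py', 'run_tests', 'scripts/run_tests.py',
            # 'run-tests', and 'run_tests'.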
            
            for form in reference_forms:
                if form in reference_map and reference_map[form]:
                    is_referenced = True
                    break
            
            # Special check for Python files: match dotted-module imports,
            # e.g. 'scripts/foo.py' imported as 'scripts.foo'.
            if file_path.suffix == '.py':
                module_path = '.'.join(rel_path.with_suffix('').parts)
                if module_path in reference_map:
                    is_referenced = True
            
            # Categorize unreferenced files
            if not is_referenced:
                if self.is_safe_file(file_path) and not self.include_safe_files:
                    safe_unreferenced.append(file_path)
                else:
                    potentially_orphaned.append(file_path)
        
        # Check for empty directories
        for root, dirs, files in os.walk(self.repo_root):
            dirs[:] = [d for d in dirs
                       if not any(fnmatch.fnmatch(d, pat) for pat in self.ignore_patterns)]

            # 'Empty' means no files and no non-ignored subdirectories remain.
            if not dirs and not files:
                empty_dir = Path(root)
                if not self.should_ignore(empty_dir):
                    directories_to_check.append(empty_dir)
        
        return potentially_orphaned, safe_unreferenced, directories_to_check
    
    def find_duplicate_files(self) -> Dict[str, List[Path]]:
        """Find files with identical names that might be duplicates."""
        all_files = self.find_all_files()
        name_groups = defaultdict(list)
        
        for file_path in all_files:
            name_groups[file_path.name].append(file_path)
        
        # Only return groups with multiple files
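        # e.g. {'__init__.py': [Path('a/__init__.py'), Path('b/__init__.py')]}
        # (hypothetical paths)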
        return {name: paths for name, paths in name_groups.items() if len(paths) > 1}
    
    def analyze_config_files(self) -> List[Tuple[Path, str]]:
        """Find potentially redundant configuration files."""
        all_files = self.find_all_files()
        config_files = []
        
        config_patterns = [
            (r'.*requirements.*\.txt$', 'Requirements file'),
            (r'.*requirements.*\.lock$', 'Requirements lock'),
            # More specific pattern first: the loop stops at the first match.
            (r'.*package.*lock.*\.json$', 'Package lock'),
            (r'.*package.*\.json$', 'Package.json'),
            (r'.*\.lock$', 'Lock file'),
            (r'.*config.*\.(py|json|yaml|yml)$', 'Config file'),
            (r'.*settings.*\.(py|json|yaml|yml)$', 'Settings file'),
            (r'.*\.env.*', 'Environment file'),
        ]
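        # Matching is case-insensitive against the repo-relative path, so a
        # hypothetical 'docs/Requirements-dev.txt' is flagged as a
        # Requirements file.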
        
        for file_path in all_files:
            rel_path = str(file_path.relative_to(self.repo_root))
            for pattern, description in config_patterns:
                if re.match(pattern, rel_path, re.IGNORECASE):
                    config_files.append((file_path, description))
                    break
                    
        return config_files
    
    def generate_report(self):
        """Generate a comprehensive orphan detection report."""
        print("🔍 ORPHANED FILE DETECTION REPORT")
        print("=" * 60)
        
        orphaned, safe_unreferenced, empty_dirs = self.find_orphaned_files()
        duplicates = self.find_duplicate_files()
        config_files = self.analyze_config_files()
        
        # Potentially orphaned files
        if orphaned:
            print(f"\n❌ POTENTIALLY ORPHANED FILES ({len(orphaned)}):")
            for file_path in sorted(orphaned):
                rel_path = file_path.relative_to(self.repo_root)
                print(f"  📄 {rel_path}")
        else:
            print(f"\n✅ No potentially orphaned files found!")
        
        # Safe unreferenced files (if requested)
        if self.include_safe_files and safe_unreferenced:
            print(f"\n🟡 SAFE UNREFERENCED FILES ({len(safe_unreferenced)}):")
            print("   (These are commonly unreferenced but usually important)")
            for file_path in sorted(safe_unreferenced):
                rel_path = file_path.relative_to(self.repo_root)
                print(f"  📄 {rel_path}")
        
        # Empty directories
        if empty_dirs:
            print(f"\n📁 EMPTY DIRECTORIES ({len(empty_dirs)}):")
            for dir_path in sorted(empty_dirs):
                rel_path = dir_path.relative_to(self.repo_root)
                print(f"  📁 {rel_path}")
        
        # Duplicate file names
        if duplicates:
            print(f"\n👥 DUPLICATE FILE NAMES ({len(duplicates)} groups):")
            for name, paths in sorted(duplicates.items()):
                print(f"  📄 {name}:")
                for path in sorted(paths):
                    rel_path = path.relative_to(self.repo_root)
                    print(f"    - {rel_path}")
        
        # Configuration files analysis
        if config_files:
            print(f"\n⚙️  CONFIGURATION FILES ({len(config_files)}):")
            print("   (Review for redundancy)")
            config_by_type = defaultdict(list)
            for path, desc in config_files:
                config_by_type[desc].append(path)
            
            for desc, paths in sorted(config_by_type.items()):
                print(f"  {desc}:")
                for path in sorted(paths):
                    rel_path = path.relative_to(self.repo_root)
                    print(f"    - {rel_path}")
        
        print(f"\n" + "=" * 60)
        print(f"📊 SUMMARY:")
        print(f"Potentially orphaned files: {len(orphaned)}")
        print(f"Empty directories: {len(empty_dirs)}")
        print(f"Duplicate name groups: {len(duplicates)}")
        print(f"Configuration files: {len(config_files)}")
        
        if orphaned or empty_dirs:
            print(f"\n⚠️  Review these files carefully before deletion!")
            print(f"Some may be important despite not being directly referenced.")
        else:
            print(f"\n✅ Repository appears clean with no obvious orphans!")
def main():
    parser = argparse.ArgumentParser(description='Find orphaned files in the repository')
    parser.add_argument('--include-safe-files', '-s', action='store_true', 
                       help='Include commonly unreferenced but safe files in report')
    parser.add_argument('--verbose', '-v', action='store_true', 
                       help='Show verbose output including warnings')
    
    args = parser.parse_args()
    
    # The script is expected to live in scripts/, so the repo root is two
    # levels up from this file.
    repo_root = Path(__file__).resolve().parent.parent
    detector = OrphanDetector(repo_root, args.include_safe_files, args.verbose)
    detector.generate_report()


if __name__ == "__main__":
    main()