#!/usr/bin/env python3
"""
Analyze Files for Pruning

Identifies files that could potentially be removed from the project:
temporary files, backups, unusually large files, stale status documents,
obviously obsolete docs, and files with duplicate content.
"""
import os
import hashlib
from pathlib import Path
from collections import defaultdict
from datetime import datetime  # NOTE(review): currently unused; kept for compatibility

# Directories whose contents are never scanned (VCS metadata, deps, build output).
SKIP_DIRS = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}


def analyze_project():
    """Walk the current directory and categorize prunable files.

    Returns:
        dict with keys:
            'temp_files', 'backup_files', 'old_status_files',
            'potentially_obsolete', 'build_artifacts': lists of path strings
            'large_files': list of (path_str, size_bytes) tuples for files >5MB
            'duplicates': defaultdict(list), unused here (see
                find_duplicate_content for actual duplicate detection)
    """
    results = {
        'temp_files': [],
        'duplicates': defaultdict(list),
        'large_files': [],
        'old_status_files': [],
        'backup_files': [],
        'build_artifacts': [],
        'potentially_obsolete': [],
    }

    # Substring patterns that mark throwaway or editor/backup files.
    temp_patterns = ['.tmp', '.swp', '.swo', '~', '.DS_Store', '.log']
    backup_patterns = ['.backup', '.bak', '.old', '.orig']

    for root_dir, dirs, files in os.walk('.'):
        # Prune skip-dirs in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
        root_path = Path(root_dir)

        for file in files:
            file_path = root_path / file

            # Defensive skip using exact path components. (The original used a
            # substring test, which wrongly skipped e.g. 'rebuild.tmp' because
            # its name contains 'build'.)
            if SKIP_DIRS.intersection(file_path.parts):
                continue

            # Temp files: any pattern appearing anywhere in the file name.
            if any(pattern in file for pattern in temp_patterns):
                results['temp_files'].append(str(file_path))

            # Backup files.
            if any(pattern in file for pattern in backup_patterns):
                results['backup_files'].append(str(file_path))

            # Large files (>5MB). stat() can fail on broken symlinks / races.
            try:
                size = file_path.stat().st_size
                if size > 5 * 1024 * 1024:  # 5MB
                    results['large_files'].append((str(file_path), size))
            except OSError:
                pass

            # Markdown files under a docs path get two keyword screens.
            if 'docs' in str(file_path) and file_path.suffix == '.md':
                # Upper-cased name makes the keyword matches case-insensitive.
                name_upper = file.upper()

                # Old status/complete reports still living outside archive/.
                # NOTE(review): the 'status'/'archive' path tests are
                # case-sensitive — presumably matching docs/status/ style
                # directories; confirm against the repo layout.
                status_keywords = ['COMPLETE', 'COMPLETION', 'FINAL_STATUS',
                                   'ALL_STEPS_COMPLETE']
                if any(keyword in name_upper for keyword in status_keywords):
                    if 'archive' not in str(file_path) and 'status' in str(file_path):
                        results['old_status_files'].append(str(file_path))

                # Files that might be superseded.
                obsolete_keywords = ['OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED']
                if any(keyword in name_upper for keyword in obsolete_keywords):
                    results['potentially_obsolete'].append(str(file_path))

    return results


def find_duplicate_content():
    """Hash text-like files and return groups with byte-identical content.

    Returns:
        dict mapping md5 hexdigest -> list of path strings; only hashes
        shared by two or more files are included.
    """
    duplicates = defaultdict(list)

    for root_dir, dirs, files in os.walk('.'):
        dirs[:] = [d for d in dirs if d not in SKIP_DIRS]

        for file in files:
            # Only documentation / config / data text files are considered.
            if not file.endswith(('.md', '.json', '.yaml', '.yml', '.txt')):
                continue

            file_path = Path(root_dir) / file
            if SKIP_DIRS.intersection(file_path.parts):
                continue

            try:
                with open(file_path, 'rb') as f:
                    # md5 is fine here: content fingerprint, not security.
                    content_hash = hashlib.md5(f.read()).hexdigest()
                duplicates[content_hash].append(str(file_path))
            except OSError:
                pass

    # Filter to only actual duplicates (2+ files sharing a hash).
    return {h: paths for h, paths in duplicates.items() if len(paths) > 1}


def _print_header(title):
    """Print a section title with its underline."""
    print(title)
    print("-" * 60)


def _print_path_list(paths, found_msg, empty_msg):
    """Print a sorted list of paths, or the empty-case message, then a blank line."""
    if paths:
        print(found_msg)
        for f in sorted(paths):
            print(f" - {f}")
    else:
        print(empty_msg)
    print()


def main():
    """Run the analysis and print a human-readable report to stdout."""
    print("=" * 60)
    print("FILE PRUNING ANALYSIS")
    print("=" * 60)
    print()

    results = analyze_project()

    _print_header("1. TEMPORARY FILES")
    if results['temp_files']:
        print(f"Found {len(results['temp_files'])} temporary files:")
        # Cap the listing at 20 entries to keep the report readable.
        for f in sorted(results['temp_files'])[:20]:
            print(f" - {f}")
        if len(results['temp_files']) > 20:
            print(f" ... and {len(results['temp_files']) - 20} more")
    else:
        print(" No temporary files found")
    print()

    _print_header("2. BACKUP FILES")
    _print_path_list(
        results['backup_files'],
        f"Found {len(results['backup_files'])} backup files:",
        " No backup files found",
    )

    _print_header("3. LARGE FILES (>5MB)")
    if results['large_files']:
        print(f"Found {len(results['large_files'])} large files:")
        # Largest first, top 10 only.
        for f, size in sorted(results['large_files'], key=lambda x: x[1], reverse=True)[:10]:
            size_mb = size / (1024 * 1024)
            print(f" - {f} ({size_mb:.2f} MB)")
    else:
        print(" No unusually large files found")
    print()

    _print_header("4. OLD STATUS/COMPLETE FILES (outside archive)")
    _print_path_list(
        results['old_status_files'],
        f"Found {len(results['old_status_files'])} status files that might be archived:",
        " No old status files found outside archive",
    )

    _print_header("5. POTENTIALLY OBSOLETE FILES")
    _print_path_list(
        results['potentially_obsolete'],
        f"Found {len(results['potentially_obsolete'])} potentially obsolete files:",
        " No obviously obsolete files found",
    )

    _print_header("6. DUPLICATE CONTENT")
    duplicates = find_duplicate_content()
    if duplicates:
        print(f"Found {len(duplicates)} groups of duplicate files:")
        for i, (_, files) in enumerate(list(duplicates.items())[:10], 1):
            print(f"\n Group {i} ({len(files)} files):")
            for f in files:
                print(f" - {f}")
        if len(duplicates) > 10:
            print(f"\n ... and {len(duplicates) - 10} more duplicate groups")
    else:
        print(" No duplicate content found")
    print()

    # Summary (duplicate groups reported separately from prunable counts).
    total_findings = (
        len(results['temp_files'])
        + len(results['backup_files'])
        + len(results['large_files'])
        + len(results['old_status_files'])
        + len(results['potentially_obsolete'])
    )

    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files that could be pruned: {total_findings}")
    print(f"Duplicate file groups: {len(duplicates)}")
    print()
    print("Note: Review each category before deletion.")
    print("Archive files are intentionally kept for historical reference.")


if __name__ == '__main__':
    main()