205 lines
7.0 KiB
Python
205 lines
7.0 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Analyze Files for Pruning
|
||
|
|
Identifies files that could potentially be removed from the project.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import hashlib
|
||
|
|
from pathlib import Path
|
||
|
|
from collections import defaultdict
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
def analyze_project():
    """Scan the current directory tree for files that are candidates for pruning.

    Walks the working directory (pruning VCS/dependency/build directories)
    and buckets findings by category.

    Returns:
        dict: with keys
            'temp_files' (list[str]) — editor/OS temp and log files,
            'backup_files' (list[str]) — .bak/.old style copies,
            'large_files' (list[tuple[str, int]]) — (path, size) for files >5MB,
            'old_status_files' (list[str]) — status/complete docs outside archive,
            'potentially_obsolete' (list[str]) — docs named OLD_/DEPRECATED/etc.,
            'duplicates' (defaultdict) — left empty here; see find_duplicate_content,
            'build_artifacts' (list) — reserved, never populated here.
    """
    results = {
        'temp_files': [],
        'duplicates': defaultdict(list),  # kept for result-shape compatibility
        'large_files': [],
        'old_status_files': [],
        'backup_files': [],
        'build_artifacts': [],
        'potentially_obsolete': []
    }

    # Substring patterns that flag a filename as temporary or a backup copy.
    temp_patterns = ['.tmp', '.swp', '.swo', '~', '.DS_Store', '.log']
    backup_patterns = ['.backup', '.bak', '.old', '.orig']

    # Directories never scanned (VCS metadata, dependencies, build output).
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        # Prune in place so os.walk never descends into skipped directories.
        dirs[:] = [d for d in dirs if d not in skip_dirs]

        root_path = Path(root_dir)

        for file in files:
            file_path = root_path / file

            # Defensive re-check using real path components. (The previous
            # substring test wrongly skipped files like './build.log' whose
            # *name* merely contains a skip-dir name.)
            if skip_dirs & set(file_path.parts):
                continue

            # Temp files (pattern is a substring match against the filename).
            if any(pattern in file for pattern in temp_patterns):
                results['temp_files'].append(str(file_path))

            # Backup copies.
            if any(pattern in file for pattern in backup_patterns):
                results['backup_files'].append(str(file_path))

            # Files larger than 5 MB. Only OSError is expected here
            # (broken symlink, permission denied) — a bare except would
            # also have swallowed KeyboardInterrupt and real bugs.
            try:
                size = file_path.stat().st_size
                if size > 5 * 1024 * 1024:  # 5MB
                    results['large_files'].append((str(file_path), size))
            except OSError:
                pass

            # Markdown docs get two extra keyword-based checks.
            if 'docs' in str(file_path) and file_path.suffix == '.md':
                # Uppercased for case-insensitive keyword matching.
                # (Was misleadingly named `file_lower` despite holding uppercase.)
                file_upper = file.upper()

                # Old status/completion reports that live outside the archive.
                if any(keyword in file_upper for keyword in
                       ['COMPLETE', 'COMPLETION', 'FINAL_STATUS', 'ALL_STEPS_COMPLETE']):
                    if 'archive' not in str(file_path) and 'status' in str(file_path):
                        results['old_status_files'].append(str(file_path))

                # Docs explicitly marked as superseded by their own name.
                if any(keyword in file_upper for keyword in
                       ['OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED']):
                    results['potentially_obsolete'].append(str(file_path))

    return results
|
||
|
|
|
||
|
|
def find_duplicate_content():
    """Group text-like files with byte-identical content.

    Considers only .md/.json/.yaml/.yml/.txt files, skipping VCS/dependency/
    build directories. Content identity is determined by MD5 of the raw bytes
    (fine here — used as a fingerprint, not for security).

    Returns:
        dict: md5 hexdigest -> list of file paths, restricted to hashes
        shared by two or more files.
    """
    duplicates = defaultdict(list)

    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        # Prune in place so os.walk never descends into skipped directories.
        dirs[:] = [d for d in dirs if d not in skip_dirs]

        for file in files:
            if not file.endswith(('.md', '.json', '.yaml', '.yml', '.txt')):
                continue

            file_path = Path(root_dir) / file
            # Path-component check (a substring test would wrongly skip
            # files whose name merely contains a skip-dir name).
            if skip_dirs & set(file_path.parts):
                continue

            # Unreadable files are silently ignored; only OSError is
            # expected (bare except would hide real bugs).
            try:
                content_hash = hashlib.md5(file_path.read_bytes()).hexdigest()
            except OSError:
                pass
            else:
                duplicates[content_hash].append(str(file_path))

    # Keep only groups that are actual duplicates (2+ files).
    return {h: files for h, files in duplicates.items() if len(files) > 1}
|
||
|
|
|
||
|
|
def main():
    """Run the pruning analysis and print a sectioned report to stdout."""
    banner = "=" * 60
    rule = "-" * 60

    print(banner)
    print("FILE PRUNING ANALYSIS")
    print(banner)
    print()

    report = analyze_project()

    # Section 1: temporary files, capped at 20 entries.
    print("1. TEMPORARY FILES")
    print(rule)
    temp = report['temp_files']
    if temp:
        print(f"Found {len(temp)} temporary files:")
        for path in sorted(temp)[:20]:
            print(f"  - {path}")
        if len(temp) > 20:
            print(f"  ... and {len(temp) - 20} more")
    else:
        print("  No temporary files found")
    print()

    # Section 2: backup files, no cap.
    print("2. BACKUP FILES")
    print(rule)
    backups = report['backup_files']
    if backups:
        print(f"Found {len(backups)} backup files:")
        for path in sorted(backups):
            print(f"  - {path}")
    else:
        print("  No backup files found")
    print()

    # Section 3: largest files first, top 10 shown.
    print("3. LARGE FILES (>5MB)")
    print(rule)
    large = report['large_files']
    if large:
        print(f"Found {len(large)} large files:")
        by_size = sorted(large, key=lambda entry: entry[1], reverse=True)
        for path, size in by_size[:10]:
            size_mb = size / (1024 * 1024)
            print(f"  - {path} ({size_mb:.2f} MB)")
    else:
        print("  No unusually large files found")
    print()

    # Section 4: stale status docs living outside the archive.
    print("4. OLD STATUS/COMPLETE FILES (outside archive)")
    print(rule)
    stale = report['old_status_files']
    if stale:
        print(f"Found {len(stale)} status files that might be archived:")
        for path in sorted(stale):
            print(f"  - {path}")
    else:
        print("  No old status files found outside archive")
    print()

    # Section 5: docs flagged by obsolete-sounding names.
    print("5. POTENTIALLY OBSOLETE FILES")
    print(rule)
    obsolete = report['potentially_obsolete']
    if obsolete:
        print(f"Found {len(obsolete)} potentially obsolete files:")
        for path in sorted(obsolete):
            print(f"  - {path}")
    else:
        print("  No obviously obsolete files found")
    print()

    # Section 6: byte-identical file groups, first 10 shown.
    print("6. DUPLICATE CONTENT")
    print(rule)
    duplicates = find_duplicate_content()
    if duplicates:
        print(f"Found {len(duplicates)} groups of duplicate files:")
        for index, (hash_val, members) in enumerate(list(duplicates.items())[:10], 1):
            print(f"\n  Group {index} ({len(members)} files):")
            for path in members:
                print(f"    - {path}")
        if len(duplicates) > 10:
            print(f"\n  ... and {len(duplicates) - 10} more duplicate groups")
    else:
        print("  No duplicate content found")
    print()

    # Summary counts every category except duplicates (reported separately).
    total_findings = sum(
        len(report[key])
        for key in ('temp_files', 'backup_files', 'large_files',
                    'old_status_files', 'potentially_obsolete')
    )

    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"Total files that could be pruned: {total_findings}")
    print(f"Duplicate file groups: {len(duplicates)}")
    print()
    print("Note: Review each category before deletion.")
    print("Archive files are intentionally kept for historical reference.")
|
||
|
|
|
||
|
|
# Script entry point: run the report only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|
||
|
|
|