#!/usr/bin/env python3
"""
Analyze Files for Pruning
Identifies files that could potentially be removed from the project.
"""
import os
import hashlib
from pathlib import Path
from collections import defaultdict


def analyze_project():
    """Analyze the project for files that could be pruned."""
    results = {
        'temp_files': [],
        'backup_files': [],
        'large_files': [],
        'old_status_files': [],
        'potentially_obsolete': [],
        # Duplicate detection is handled separately by find_duplicate_content().
    }

    # Filename suffixes that mark temporary or backup files. Matching on the
    # suffix (rather than on a substring) avoids false positives such as a
    # file named 'catalog.md' matching '.log'.
    temp_patterns = ('.tmp', '.swp', '.swo', '~', '.DS_Store', '.log')
    backup_patterns = ('.backup', '.bak', '.old', '.orig')

    # Directories whose contents should never be considered for pruning.
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        # Prune skipped directories in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        root_path = Path(root_dir)
        for file in files:
            file_path = root_path / file

            # Temporary files
            if file.endswith(temp_patterns):
                results['temp_files'].append(str(file_path))

            # Backup files
            if file.endswith(backup_patterns):
                results['backup_files'].append(str(file_path))

            # Large files (>5MB)
            try:
                size = file_path.stat().st_size
                if size > 5 * 1024 * 1024:  # 5MB
                    results['large_files'].append((str(file_path), size))
            except OSError:
                # Broken symlinks or unreadable files: skip the size check.
                pass

            # Markdown files under a docs/ directory get two extra checks.
            if 'docs' in file_path.parts and file_path.suffix == '.md':
                file_upper = file.upper()

                # Old status/completion markers that live outside an archive.
                status_keywords = ('COMPLETE', 'COMPLETION', 'FINAL_STATUS', 'ALL_STEPS_COMPLETE')
                if any(keyword in file_upper for keyword in status_keywords):
                    if 'archive' not in str(file_path) and 'status' in str(file_path):
                        results['old_status_files'].append(str(file_path))

                # Names that suggest the file has been superseded.
                obsolete_keywords = ('OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED')
                if any(keyword in file_upper for keyword in obsolete_keywords):
                    results['potentially_obsolete'].append(str(file_path))

    return results


def find_duplicate_content():
    """Find files whose content is byte-for-byte identical."""
    duplicates = defaultdict(list)
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        for file in files:
            # Only compare text-like project files.
            if not file.endswith(('.md', '.json', '.yaml', '.yml', '.txt')):
                continue
            file_path = Path(root_dir) / file
            try:
                with open(file_path, 'rb') as f:
                    # MD5 is used only as a fast content fingerprint here,
                    # not for anything security-sensitive.
                    content_hash = hashlib.md5(f.read()).hexdigest()
            except OSError:
                continue
            duplicates[content_hash].append(str(file_path))

    # Keep only hashes shared by two or more files.
    return {h: paths for h, paths in duplicates.items() if len(paths) > 1}
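
# For reference, find_duplicate_content() returns a mapping of content hash
# to the list of paths that share that content. The hash and paths below are
# made-up examples:
#
#     {'9e107d9d372bb6826bd81d3542a419d6': ['./docs/setup.md',
#                                           './docs/archive/setup.md']}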


def main():
    print("=" * 60)
    print("FILE PRUNING ANALYSIS")
    print("=" * 60)
    print()

    results = analyze_project()

    print("1. TEMPORARY FILES")
    print("-" * 60)
    if results['temp_files']:
        print(f"Found {len(results['temp_files'])} temporary files:")
        for f in sorted(results['temp_files'])[:20]:
            print(f" - {f}")
        if len(results['temp_files']) > 20:
            print(f" ... and {len(results['temp_files']) - 20} more")
    else:
        print(" No temporary files found")
    print()

    print("2. BACKUP FILES")
    print("-" * 60)
    if results['backup_files']:
        print(f"Found {len(results['backup_files'])} backup files:")
        for f in sorted(results['backup_files']):
            print(f" - {f}")
    else:
        print(" No backup files found")
    print()

    print("3. LARGE FILES (>5MB)")
    print("-" * 60)
    if results['large_files']:
        print(f"Found {len(results['large_files'])} large files:")
        # Show the ten largest first.
        for f, size in sorted(results['large_files'], key=lambda x: x[1], reverse=True)[:10]:
            size_mb = size / (1024 * 1024)
            print(f" - {f} ({size_mb:.2f} MB)")
    else:
        print(" No unusually large files found")
    print()

    print("4. OLD STATUS/COMPLETE FILES (outside archive)")
    print("-" * 60)
    if results['old_status_files']:
        print(f"Found {len(results['old_status_files'])} status files that might be archived:")
        for f in sorted(results['old_status_files']):
            print(f" - {f}")
    else:
        print(" No old status files found outside archive")
    print()

    print("5. POTENTIALLY OBSOLETE FILES")
    print("-" * 60)
    if results['potentially_obsolete']:
        print(f"Found {len(results['potentially_obsolete'])} potentially obsolete files:")
        for f in sorted(results['potentially_obsolete']):
            print(f" - {f}")
    else:
        print(" No obviously obsolete files found")
    print()

    print("6. DUPLICATE CONTENT")
    print("-" * 60)
    duplicates = find_duplicate_content()
    if duplicates:
        print(f"Found {len(duplicates)} groups of duplicate files:")
        for i, (_, files) in enumerate(list(duplicates.items())[:10], 1):
            print(f"\n Group {i} ({len(files)} files):")
            for f in files:
                print(f" - {f}")
        if len(duplicates) > 10:
            print(f"\n ... and {len(duplicates) - 10} more duplicate groups")
    else:
        print(" No duplicate content found")
    print()

    # Summary (a file can appear in more than one category, so these
    # counts may overlap).
    total_findings = (
        len(results['temp_files'])
        + len(results['backup_files'])
        + len(results['large_files'])
        + len(results['old_status_files'])
        + len(results['potentially_obsolete'])
    )
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files that could be pruned: {total_findings}")
    print(f"Duplicate file groups: {len(duplicates)}")
    print()
    print("Note: Review each category before deletion.")
    print("Archive files are intentionally kept for historical reference.")


if __name__ == '__main__':
    main()
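
# Once a category has been reviewed, flagged files can be removed through the
# VCS so the deletion stays auditable, e.g. (the path is only an example):
#
#     git rm docs/status/ALL_STEPS_COMPLETE.md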