#!/usr/bin/env python3
"""
Analyze Files for Pruning
Identifies files that could potentially be removed from the project.
"""
import os
import hashlib
from pathlib import Path
from collections import defaultdict


def analyze_project():
    """Analyze the project for files that could be pruned."""
    results = {
        'temp_files': [],
        'backup_files': [],
        'large_files': [],
        'old_status_files': [],
        'potentially_obsolete': [],
        # Duplicate detection is handled separately by find_duplicate_content().
    }

    # Filename suffixes that mark temporary or backup files. Matching on the
    # suffix (rather than on a substring) avoids false positives such as a
    # file named 'catalog.md' matching '.log'.
    temp_patterns = ('.tmp', '.swp', '.swo', '~', '.DS_Store', '.log')
    backup_patterns = ('.backup', '.bak', '.old', '.orig')

    # Directories whose contents should never be considered for pruning.
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        # Prune skipped directories in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        root_path = Path(root_dir)
        for file in files:
            file_path = root_path / file

            # Temporary files
            if file.endswith(temp_patterns):
                results['temp_files'].append(str(file_path))

            # Backup files
            if file.endswith(backup_patterns):
                results['backup_files'].append(str(file_path))

            # Large files (>5MB)
            try:
                size = file_path.stat().st_size
                if size > 5 * 1024 * 1024:  # 5MB
                    results['large_files'].append((str(file_path), size))
            except OSError:
                # Broken symlinks or unreadable files: skip the size check.
                pass

            # Markdown files under a docs/ directory get two extra checks.
            if 'docs' in file_path.parts and file_path.suffix == '.md':
                file_upper = file.upper()

                # Old status/completion markers that live outside an archive.
                status_keywords = ('COMPLETE', 'COMPLETION', 'FINAL_STATUS', 'ALL_STEPS_COMPLETE')
                if any(keyword in file_upper for keyword in status_keywords):
                    if 'archive' not in str(file_path) and 'status' in str(file_path):
                        results['old_status_files'].append(str(file_path))

                # Names that suggest the file has been superseded.
                obsolete_keywords = ('OLD_', 'DEPRECATED', 'LEGACY', 'UNUSED')
                if any(keyword in file_upper for keyword in obsolete_keywords):
                    results['potentially_obsolete'].append(str(file_path))

    return results


def find_duplicate_content():
    """Find files whose content is byte-for-byte identical."""
    duplicates = defaultdict(list)
    skip_dirs = {'.git', 'node_modules', 'dist', 'build', '.next', 'coverage', '__pycache__'}

    for root_dir, dirs, files in os.walk('.'):
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        for file in files:
            # Only compare text-like project files.
            if not file.endswith(('.md', '.json', '.yaml', '.yml', '.txt')):
                continue
            file_path = Path(root_dir) / file
            try:
                with open(file_path, 'rb') as f:
                    # MD5 is used only as a fast content fingerprint here,
                    # not for anything security-sensitive.
                    content_hash = hashlib.md5(f.read()).hexdigest()
            except OSError:
                continue
            duplicates[content_hash].append(str(file_path))

    # Keep only hashes shared by two or more files.
    return {h: paths for h, paths in duplicates.items() if len(paths) > 1}
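
# For reference, find_duplicate_content() returns a mapping of content hash
# to the list of paths that share that content. The hash and paths below are
# made-up examples:
#
#     {'9e107d9d372bb6826bd81d3542a419d6': ['./docs/setup.md',
#                                           './docs/archive/setup.md']}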


def main():
    print("=" * 60)
    print("FILE PRUNING ANALYSIS")
    print("=" * 60)
    print()

    results = analyze_project()

    print("1. TEMPORARY FILES")
    print("-" * 60)
    if results['temp_files']:
        print(f"Found {len(results['temp_files'])} temporary files:")
        for f in sorted(results['temp_files'])[:20]:
            print(f" - {f}")
        if len(results['temp_files']) > 20:
            print(f" ... and {len(results['temp_files']) - 20} more")
    else:
        print(" No temporary files found")
    print()

    print("2. BACKUP FILES")
    print("-" * 60)
    if results['backup_files']:
        print(f"Found {len(results['backup_files'])} backup files:")
        for f in sorted(results['backup_files']):
            print(f" - {f}")
    else:
        print(" No backup files found")
    print()

    print("3. LARGE FILES (>5MB)")
    print("-" * 60)
    if results['large_files']:
        print(f"Found {len(results['large_files'])} large files:")
        # Show the ten largest first.
        for f, size in sorted(results['large_files'], key=lambda x: x[1], reverse=True)[:10]:
            size_mb = size / (1024 * 1024)
            print(f" - {f} ({size_mb:.2f} MB)")
    else:
        print(" No unusually large files found")
    print()

    print("4. OLD STATUS/COMPLETE FILES (outside archive)")
    print("-" * 60)
    if results['old_status_files']:
        print(f"Found {len(results['old_status_files'])} status files that might be archived:")
        for f in sorted(results['old_status_files']):
            print(f" - {f}")
    else:
        print(" No old status files found outside archive")
    print()

    print("5. POTENTIALLY OBSOLETE FILES")
    print("-" * 60)
    if results['potentially_obsolete']:
        print(f"Found {len(results['potentially_obsolete'])} potentially obsolete files:")
        for f in sorted(results['potentially_obsolete']):
            print(f" - {f}")
    else:
        print(" No obviously obsolete files found")
    print()

    print("6. DUPLICATE CONTENT")
    print("-" * 60)
    duplicates = find_duplicate_content()
    if duplicates:
        print(f"Found {len(duplicates)} groups of duplicate files:")
        for i, (_, files) in enumerate(list(duplicates.items())[:10], 1):
            print(f"\n Group {i} ({len(files)} files):")
            for f in files:
                print(f" - {f}")
        if len(duplicates) > 10:
            print(f"\n ... and {len(duplicates) - 10} more duplicate groups")
    else:
        print(" No duplicate content found")
    print()

    # Summary (a file can appear in more than one category, so these
    # counts may overlap).
    total_findings = (
        len(results['temp_files'])
        + len(results['backup_files'])
        + len(results['large_files'])
        + len(results['old_status_files'])
        + len(results['potentially_obsolete'])
    )
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files that could be pruned: {total_findings}")
    print(f"Duplicate file groups: {len(duplicates)}")
    print()
    print("Note: Review each category before deletion.")
    print("Archive files are intentionally kept for historical reference.")


if __name__ == '__main__':
    main()
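
# Once a category has been reviewed, flagged files can be removed through the
# VCS so the deletion stays auditable, e.g. (the path is only an example):
#
#     git rm docs/status/ALL_STEPS_COMPLETE.md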