Files
proxmox/scripts/analyze-markdown-files.py

366 lines
13 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
Comprehensive Markdown File Analysis Script
Analyzes all markdown files in the project for:
- File dates (creation, modification)
- Duplicate patterns
- Misplaced files
- Content inconsistencies
- Outdated information
"""
import os
import re
import json
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import hashlib
class MarkdownAnalyzer:
    """Analyze the markdown files under a project tree.

    Collects per-file metadata (size, timestamps), groups files by naming
    pattern, flags files that appear to live in the wrong directory, detects
    exact duplicate content, and scans content for staleness markers.
    Call :meth:`analyze` to run the whole pipeline and get the report dict.
    """

    def __init__(self, root_dir: str):
        self.root_dir = Path(root_dir)
        self.files = []                      # one metadata dict per markdown file
        self.duplicates = defaultdict(list)  # content hash -> files sharing that content (2+)
        self.patterns = {
            'complete': [],
            'final': [],
            'status': [],
            'timestamped': [],
            'fix': [],
            'report': [],
            'temporary': []
        }
        self.misplaced = []                  # dicts describing wrongly-located files
        self.content_hashes = {}             # content hash -> every file with that hash
        self.file_metadata = []

    def analyze(self):
        """Run the full analysis pipeline and return the report dict."""
        print("🔍 Scanning markdown files...")
        self._scan_files()
        print(f"📊 Found {len(self.files)} markdown files")
        print("\n📅 Analyzing file dates...")
        self._analyze_dates()
        print("\n🔎 Identifying patterns...")
        self._identify_patterns()
        print("\n📍 Finding misplaced files...")
        self._find_misplaced()
        print("\n🔗 Checking for duplicates...")
        self._check_duplicates()
        print("\n📝 Analyzing content...")
        self._analyze_content()
        return self._generate_report()

    def _scan_files(self):
        """Collect stat metadata for every *.md file, skipping vendored/build dirs."""
        exclude_dirs = {'.git', 'node_modules', '__pycache__', '.next', 'dist', 'build', 'venv', '.venv'}
        for md_file in self.root_dir.rglob('*.md'):
            # Skip anything inside an excluded directory at any depth.
            if any(part in exclude_dirs for part in md_file.parts):
                continue
            try:
                stat = md_file.stat()
                rel_path = md_file.relative_to(self.root_dir)
                self.files.append({
                    'path': str(rel_path),
                    'full_path': str(md_file),
                    'size': stat.st_size,
                    'modified': datetime.fromtimestamp(stat.st_mtime),
                    'accessed': datetime.fromtimestamp(stat.st_atime),
                    # st_birthtime (true creation time) only exists on some
                    # platforms (macOS/BSD). The original read st_ctime here,
                    # which on Linux is metadata-change time, not creation.
                    'created': (datetime.fromtimestamp(stat.st_birthtime)
                                if hasattr(stat, 'st_birthtime') else None),
                    'directory': str(rel_path.parent),
                    'name': md_file.name
                })
            except (OSError, PermissionError) as e:
                print(f"⚠️ Error accessing {md_file}: {e}")

    def _analyze_dates(self):
        """Tag each file with its age in days and a coarse age bucket."""
        now = datetime.now()
        for file_info in self.files:
            days_old = (now - file_info['modified']).days
            file_info['days_old'] = days_old
            # Buckets: <30 recent, <90 moderate, <365 old, else very_old.
            # (The original had two identical 'recent' branches for <7 and <30.)
            file_info['age_category'] = (
                'recent' if days_old < 30 else
                'moderate' if days_old < 90 else
                'old' if days_old < 365 else
                'very_old'
            )

    def _identify_patterns(self):
        """Bucket files by naming pattern (COMPLETE, FINAL, timestamps, ...)."""
        patterns = {
            'complete': re.compile(r'COMPLETE', re.I),
            'final': re.compile(r'FINAL', re.I),
            'status': re.compile(r'STATUS', re.I),
            'timestamped': re.compile(r'_\d{8}_\d{6}|\d{8}_\d{6}'),
            'fix': re.compile(r'FIX|QUICK_FIX|RUN_NOW|EXECUTE', re.I),
            'report': re.compile(r'REPORT|SUMMARY|ANALYSIS|DIAGNOSTIC', re.I),
            'temporary': re.compile(r'NOW|READY|EXECUTE|RUN_', re.I)
        }
        # A file may match several patterns and is listed under each one.
        for file_info in self.files:
            name = file_info['name']
            for pattern_name, pattern in patterns.items():
                if pattern.search(name):
                    self.patterns[pattern_name].append(file_info)
                    file_info[f'has_{pattern_name}'] = True

    def _find_misplaced(self):
        """Heuristically flag files that live in the wrong directory."""
        root_files = [f for f in self.files if f['directory'] == '.']
        docs_files = [f for f in self.files if f['directory'].startswith('docs')]
        # Reports in root
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in ['REPORT', 'STATUS', 'INVENTORY', 'DIAGNOSTIC', 'ANALYSIS']):
                if f['name'] not in ('README.md', 'PROJECT_STRUCTURE.md'):
                    self.misplaced.append({
                        'file': f,
                        'current': 'root',
                        'should_be': 'reports/',
                        'reason': 'Report file in root directory'
                    })
        # Status/completion files in docs
        for f in docs_files:
            if any(keyword in f['name'].upper() for keyword in ['COMPLETE', 'FINAL', 'STATUS', 'MIGRATION_COMPLETE']):
                self.misplaced.append({
                    'file': f,
                    'current': f['directory'],
                    'should_be': 'reports/',
                    'reason': 'Status/completion report in docs directory'
                })
        # Temporary fix guides in project root
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in ['FIX_', 'QUICK_FIX', 'RUN_NOW', 'EXECUTE']):
                self.misplaced.append({
                    'file': f,
                    'current': 'root',
                    'should_be': 'docs/09-troubleshooting/archive/',
                    'reason': 'Temporary fix guide in root'
                })

    def _check_duplicates(self):
        """Group files by exact content hash; groups of 2+ are duplicates."""
        for file_info in self.files:
            try:
                with open(file_info['full_path'], 'rb') as f:
                    # md5 is fine here: used for dedup fingerprinting, not security.
                    content_hash = hashlib.md5(f.read()).hexdigest()
            except OSError:
                # Best effort: files we cannot read are simply not considered.
                continue
            self.content_hashes.setdefault(content_hash, []).append(file_info)
        # The original only stored the 2nd+ file of each collision in
        # self.duplicates, so a duplicate *pair* had len==1 and was filtered
        # out of the report. Build groups from the full hash index instead.
        for content_hash, group in self.content_hashes.items():
            if len(group) > 1:
                self.duplicates[content_hash] = group

    def _analyze_content(self):
        """Scan each file's text for TODOs, deprecation markers and dates."""
        for file_info in self.files:
            try:
                with open(file_info['full_path'], 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                lines = content.split('\n')
                file_info['line_count'] = len(lines)
                file_info['has_todo'] = 'TODO' in content or 'FIXME' in content
                file_info['has_deprecated'] = 'DEPRECATED' in content or 'OBSOLETE' in content
                file_info['has_date'] = bool(re.search(r'\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}', content))
                # Unexpanded shell date substitutions indicate template leftovers.
                if re.search(r'\$\(date\)|date \+', content):
                    file_info['has_placeholder_date'] = True
            except Exception as e:
                # Record the failure on the file rather than aborting the scan.
                file_info['line_count'] = 0
                file_info['error'] = str(e)

    def _generate_report(self) -> Dict:
        """Assemble the JSON-serializable report from all collected data."""
        report = {
            'summary': {
                'total_files': len(self.files),
                'total_size_mb': sum(f['size'] for f in self.files) / (1024 * 1024),
                'by_age': defaultdict(int),
                'by_directory': defaultdict(int)
            },
            'patterns': {},
            'misplaced': [],
            'duplicates': [],
            'old_files': [],
            'empty_files': [],
            'issues': []
        }
        # Summary stats
        for f in self.files:
            report['summary']['by_age'][f['age_category']] += 1
            report['summary']['by_directory'][f['directory']] += 1
        # Pattern counts
        for pattern_name, files in self.patterns.items():
            report['patterns'][pattern_name] = {
                'count': len(files),
                'files': [f['path'] for f in files[:20]]  # Limit to 20
            }
        # Misplaced files
        report['misplaced'] = [
            {
                'path': m['file']['path'],
                'current': m['current'],
                'should_be': m['should_be'],
                'reason': m['reason']
            }
            for m in self.misplaced
        ]
        # Duplicate content (every group already has 2+ members)
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                report['duplicates'].append({
                    'hash': hash_val[:8],
                    'count': len(files),
                    'files': [f['path'] for f in files]
                })
        # Old files (>90 days)
        report['old_files'] = [
            {
                'path': f['path'],
                'days_old': f['days_old'],
                'modified': f['modified'].isoformat()
            }
            for f in self.files if f['days_old'] > 90
        ]
        # Empty or very small files
        report['empty_files'] = [
            {
                'path': f['path'],
                'size': f['size'],
                'line_count': f.get('line_count', 0)
            }
            for f in self.files if f['size'] < 100 or f.get('line_count', 0) < 5
        ]
        # Per-file issue annotations
        for f in self.files:
            issues = []
            if f.get('has_placeholder_date'):
                issues.append('Contains placeholder date')
            if f.get('has_deprecated'):
                issues.append('Marks itself as deprecated')
            if f['days_old'] > 365:
                issues.append('Very old (>1 year)')
            if f['size'] < 50:
                issues.append('Very small file')
            if issues:
                report['issues'].append({
                    'path': f['path'],
                    'issues': issues
                })
        return report
def main():
    """Analyze the project tree and write JSON + markdown reports next to it.

    Returns the report dict so callers can post-process it.
    """
    # Project root is the parent of the scripts/ directory this file lives in.
    root_dir = Path(__file__).parent.parent
    analyzer = MarkdownAnalyzer(root_dir)
    report = analyzer.analyze()
    # Save JSON report. default=str serializes the datetime objects.
    json_file = root_dir / 'MARKDOWN_ANALYSIS.json'
    # Explicit UTF-8 avoids UnicodeEncodeError on platforms whose default
    # encoding (e.g. cp1252 on Windows) cannot represent the emoji and
    # non-ASCII file names that may appear in the report.
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\n✅ JSON report saved to: {json_file}")
    # Generate human-readable markdown report.
    md_file = root_dir / 'MARKDOWN_ANALYSIS_REPORT.md'
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(generate_markdown_report(report))
    print(f"✅ Markdown report saved to: {md_file}")
    return report
def generate_markdown_report(report: Dict) -> str:
    """Render the analysis report dict as a human-readable markdown document."""
    out: List[str] = []
    emit = out.append

    # Header + summary statistics
    emit("# Markdown Files Analysis Report\n")
    emit(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    emit("## Summary\n")
    summary = report['summary']
    emit(f"- **Total Files**: {summary['total_files']}")
    emit(f"- **Total Size**: {summary['total_size_mb']:.2f} MB\n")
    emit("### Files by Age\n")
    out.extend(f"- **{age.title()}**: {count}"
               for age, count in sorted(summary['by_age'].items()))
    emit("")

    # Naming-pattern groups (at most 10 paths listed per group)
    emit("## File Patterns\n")
    for group_name, data in report['patterns'].items():
        emit(f"### {group_name.title()} ({data['count']} files)\n")
        out.extend(f"- `{path}`" for path in data['files'][:10])
        if data['count'] > 10:
            emit(f"- ... and {data['count'] - 10} more")
        emit("")

    # Misplaced files (capped at 50 entries)
    emit("## Misplaced Files\n")
    emit(f"Found **{len(report['misplaced'])}** misplaced files:\n")
    for entry in report['misplaced'][:50]:
        emit(f"- **{entry['path']}**")
        emit(f" - Current: `{entry['current']}`")
        emit(f" - Should be: `{entry['should_be']}`")
        emit(f" - Reason: {entry['reason']}\n")

    # Duplicate-content groups (capped at 20)
    emit("## Duplicate Content\n")
    emit(f"Found **{len(report['duplicates'])}** sets of duplicate files:\n")
    for dup_group in report['duplicates'][:20]:
        emit(f"- **{dup_group['count']} files** with same content:")
        out.extend(f" - `{path}`" for path in dup_group['files'])
        emit("")

    # Stale files, oldest first (capped at 50)
    emit("## Old Files (>90 days)\n")
    emit(f"Found **{len(report['old_files'])}** old files:\n")
    oldest_first = sorted(report['old_files'],
                          key=lambda item: item['days_old'], reverse=True)
    for info in oldest_first[:50]:
        emit(f"- **{info['path']}** ({info['days_old']} days old, modified: {info['modified'][:10]})")
    emit("")

    # Per-file issue list (capped at 50)
    emit("## Files with Issues\n")
    emit(f"Found **{len(report['issues'])}** files with issues:\n")
    for record in report['issues'][:50]:
        emit(f"- **{record['path']}**")
        out.extend(f" - {note}" for note in record['issues'])
    emit("")

    return "\n".join(out)
# Entry point when run as a standalone script (not imported).
if __name__ == '__main__':
    main()