#!/usr/bin/env python3
"""
Comprehensive Markdown File Analysis Script

Analyzes all markdown files in the project for:
- File dates (creation, modification)
- Duplicate patterns
- Misplaced files
- Content inconsistencies
- Outdated information
"""

import os
import re
import json
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import hashlib

class MarkdownAnalyzer:
    """Analyze markdown files under a project root.

    Scans for ``*.md`` files (skipping vendored/generated directories) and
    reports on file age, filename patterns, misplaced files, byte-identical
    duplicate content, and per-file content issues.
    """

    # Directories never descended into during the scan.
    EXCLUDE_DIRS = {'.git', 'node_modules', '__pycache__', '.next',
                    'dist', 'build', 'venv', '.venv'}

    # Filename patterns that typically mark status/temporary documents.
    # Compiled once at class level instead of per _identify_patterns call.
    NAME_PATTERNS = {
        'complete': re.compile(r'COMPLETE', re.I),
        'final': re.compile(r'FINAL', re.I),
        'status': re.compile(r'STATUS', re.I),
        'timestamped': re.compile(r'_\d{8}_\d{6}|\d{8}_\d{6}'),
        'fix': re.compile(r'FIX|QUICK_FIX|RUN_NOW|EXECUTE', re.I),
        'report': re.compile(r'REPORT|SUMMARY|ANALYSIS|DIAGNOSTIC', re.I),
        'temporary': re.compile(r'NOW|READY|EXECUTE|RUN_', re.I),
    }

    def __init__(self, root_dir: str):
        """Prepare an analyzer rooted at *root_dir* (str or ``Path``)."""
        self.root_dir = Path(root_dir)
        # One metadata dict per discovered markdown file.
        self.files: List[Dict] = []
        # content hash -> every file sharing that content (populated only
        # for hashes seen more than once).
        self.duplicates: Dict[str, List[Dict]] = defaultdict(list)
        # pattern name -> files whose *filename* matches that pattern.
        self.patterns: Dict[str, List[Dict]] = {name: [] for name in self.NAME_PATTERNS}
        # Files that appear to live in the wrong directory.
        self.misplaced: List[Dict] = []
        # content hash -> first file seen with that content.
        self.content_hashes: Dict[str, List[Dict]] = {}
        # Retained for backward compatibility; not populated by this class.
        self.file_metadata: List[Dict] = []

    def analyze(self) -> Dict:
        """Run the full analysis pipeline and return the report dict."""
        print("🔍 Scanning markdown files...")
        self._scan_files()
        print(f"📊 Found {len(self.files)} markdown files")

        print("\n📅 Analyzing file dates...")
        self._analyze_dates()

        print("\n🔎 Identifying patterns...")
        self._identify_patterns()

        print("\n📍 Finding misplaced files...")
        self._find_misplaced()

        print("\n🔗 Checking for duplicates...")
        self._check_duplicates()

        print("\n📝 Analyzing content...")
        self._analyze_content()

        return self._generate_report()

    def _scan_files(self):
        """Collect path/size/date metadata for every markdown file."""
        for md_file in self.root_dir.rglob('*.md'):
            # Skip anything inside an excluded directory.
            if any(part in self.EXCLUDE_DIRS for part in md_file.parts):
                continue

            try:
                stat = md_file.stat()
                rel_path = md_file.relative_to(self.root_dir)

                self.files.append({
                    'path': str(rel_path),
                    'full_path': str(md_file),
                    'size': stat.st_size,
                    'modified': datetime.fromtimestamp(stat.st_mtime),
                    'accessed': datetime.fromtimestamp(stat.st_atime),
                    # Fix: the guard checks st_birthtime, so read st_birthtime.
                    # (Original read st_ctime, which on Linux is the
                    # metadata-change time, not creation time.)
                    'created': (datetime.fromtimestamp(stat.st_birthtime)
                                if hasattr(stat, 'st_birthtime') else None),
                    'directory': str(rel_path.parent),
                    'name': md_file.name
                })
            except (OSError, PermissionError) as e:
                print(f"⚠️ Error accessing {md_file}: {e}")

    def _analyze_dates(self):
        """Annotate each file with its age in days and an age category."""
        now = datetime.now()
        for file_info in self.files:
            days_old = (now - file_info['modified']).days
            file_info['days_old'] = days_old
            # NOTE: the original had two identical 'recent' branches
            # (<7 and <30); collapsed to one with no behavior change.
            file_info['age_category'] = (
                'recent' if days_old < 30 else
                'moderate' if days_old < 90 else
                'old' if days_old < 365 else
                'very_old'
            )

    def _identify_patterns(self):
        """Bucket files whose names match any entry in NAME_PATTERNS."""
        for file_info in self.files:
            name = file_info['name']
            for pattern_name, pattern in self.NAME_PATTERNS.items():
                if pattern.search(name):
                    self.patterns[pattern_name].append(file_info)
                    file_info[f'has_{pattern_name}'] = True

    def _find_misplaced(self):
        """Flag files whose name suggests they belong in another directory."""
        root_files = [f for f in self.files if f['directory'] == '.']
        docs_files = [f for f in self.files if f['directory'].startswith('docs')]

        # Report-style files sitting in the project root (README and
        # PROJECT_STRUCTURE are legitimately there).
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in
                   ['REPORT', 'STATUS', 'INVENTORY', 'DIAGNOSTIC', 'ANALYSIS']):
                if f['name'] not in ('README.md', 'PROJECT_STRUCTURE.md'):
                    self.misplaced.append({
                        'file': f,
                        'current': 'root',
                        'should_be': 'reports/',
                        'reason': 'Report file in root directory'
                    })

        # Status/completion files kept under docs/.
        for f in docs_files:
            if any(keyword in f['name'].upper() for keyword in
                   ['COMPLETE', 'FINAL', 'STATUS', 'MIGRATION_COMPLETE']):
                self.misplaced.append({
                    'file': f,
                    'current': f['directory'],
                    'should_be': 'reports/',
                    'reason': 'Status/completion report in docs directory'
                })

        # Temporary fix guides left in the project root.
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in
                   ['FIX_', 'QUICK_FIX', 'RUN_NOW', 'EXECUTE']):
                self.misplaced.append({
                    'file': f,
                    'current': 'root',
                    'should_be': 'docs/09-troubleshooting/archive/',
                    'reason': 'Temporary fix guide in root'
                })

    def _check_duplicates(self):
        """Group files by MD5 of their raw bytes.

        Fix over the original: the first file seen with a given hash is now
        pulled into the duplicate group, so a simple *pair* of identical
        files is reported (previously a pair produced a one-element group
        that the ``len(files) > 1`` report filter silently discarded, and
        larger groups were missing their first member).
        """
        for file_info in self.files:
            try:
                content = Path(file_info['full_path']).read_bytes()
            except OSError:
                continue  # unreadable file: best-effort, skip it

            # MD5 is fine here: a dedup fingerprint, not a security boundary.
            digest = hashlib.md5(content).hexdigest()
            if digest in self.content_hashes:
                group = self.duplicates[digest]
                if not group:
                    # First duplicate found: include the original file too.
                    group.extend(self.content_hashes[digest])
                group.append(file_info)
            else:
                self.content_hashes[digest] = [file_info]

    def _analyze_content(self):
        """Annotate each file with content-derived flags (TODOs, dates, ...)."""
        for file_info in self.files:
            try:
                with open(file_info['full_path'], 'r', encoding='utf-8',
                          errors='ignore') as f:
                    content = f.read()

                file_info['line_count'] = len(content.split('\n'))
                file_info['has_todo'] = 'TODO' in content or 'FIXME' in content
                file_info['has_deprecated'] = ('DEPRECATED' in content
                                               or 'OBSOLETE' in content)
                file_info['has_date'] = bool(
                    re.search(r'\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}', content))

                # Shell-style placeholder dates like "$(date)" that were
                # never expanded when the document was generated.
                if re.search(r'\$\(date\)|date \+', content):
                    file_info['has_placeholder_date'] = True
            except OSError as e:
                # Narrowed from bare ``except Exception: pass`` — record the
                # failure instead of silently swallowing it.
                file_info['line_count'] = 0
                file_info['error'] = str(e)

    def _generate_report(self) -> Dict:
        """Assemble the final report dict from the collected analysis."""
        report = {
            'summary': {
                'total_files': len(self.files),
                'total_size_mb': sum(f['size'] for f in self.files) / (1024 * 1024),
                'by_age': defaultdict(int),
                'by_directory': defaultdict(int)
            },
            'patterns': {},
            'misplaced': [],
            'duplicates': [],
            'old_files': [],
            'empty_files': [],
            'issues': []
        }

        # Summary stats
        for f in self.files:
            report['summary']['by_age'][f['age_category']] += 1
            report['summary']['by_directory'][f['directory']] += 1

        # Pattern counts (file lists capped at 20 entries each)
        for pattern_name, files in self.patterns.items():
            report['patterns'][pattern_name] = {
                'count': len(files),
                'files': [f['path'] for f in files[:20]]
            }

        # Misplaced files
        report['misplaced'] = [
            {
                'path': m['file']['path'],
                'current': m['current'],
                'should_be': m['should_be'],
                'reason': m['reason']
            }
            for m in self.misplaced
        ]

        # Duplicate content (each group now holds every copy, so a pair
        # passes the > 1 filter)
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                report['duplicates'].append({
                    'hash': hash_val[:8],
                    'count': len(files),
                    'files': [f['path'] for f in files]
                })

        # Old files (>90 days since last modification)
        report['old_files'] = [
            {
                'path': f['path'],
                'days_old': f['days_old'],
                'modified': f['modified'].isoformat()
            }
            for f in self.files if f['days_old'] > 90
        ]

        # Empty or very small files
        report['empty_files'] = [
            {
                'path': f['path'],
                'size': f['size'],
                'line_count': f.get('line_count', 0)
            }
            for f in self.files if f['size'] < 100 or f.get('line_count', 0) < 5
        ]

        # Per-file issue flags
        for f in self.files:
            issues = []
            if f.get('has_placeholder_date'):
                issues.append('Contains placeholder date')
            if f.get('has_deprecated'):
                issues.append('Marks itself as deprecated')
            if f['days_old'] > 365:
                issues.append('Very old (>1 year)')
            if f['size'] < 50:
                issues.append('Very small file')

            if issues:
                report['issues'].append({
                    'path': f['path'],
                    'issues': issues
                })

        return report


def main():
    """Run the analyzer over the project root and write both reports there.

    The project root is taken to be the parent of this script's directory.
    Writes ``MARKDOWN_ANALYSIS.json`` and ``MARKDOWN_ANALYSIS_REPORT.md``
    and returns the report dict for programmatic use.
    """
    root_dir = Path(__file__).parent.parent
    analyzer = MarkdownAnalyzer(root_dir)
    report = analyzer.analyze()

    # Save machine-readable JSON report. encoding='utf-8' avoids a
    # UnicodeEncodeError under non-UTF-8 locale codecs (e.g. Windows)
    # when paths or content contain non-ASCII characters; default=str
    # serializes the datetime values.
    json_file = root_dir / 'MARKDOWN_ANALYSIS.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\n✅ JSON report saved to: {json_file}")

    # Generate human-readable markdown report.
    md_file = root_dir / 'MARKDOWN_ANALYSIS_REPORT.md'
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(generate_markdown_report(report))
    print(f"✅ Markdown report saved to: {md_file}")

    return report


def generate_markdown_report(report: Dict) -> str:
    """Render the analysis *report* dict as a human-readable markdown document."""
    out: List[str] = []
    emit = out.append

    emit("# Markdown Files Analysis Report\n")
    emit(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Overall totals.
    summary = report['summary']
    emit("## Summary\n")
    emit(f"- **Total Files**: {summary['total_files']}")
    emit(f"- **Total Size**: {summary['total_size_mb']:.2f} MB\n")

    # Age breakdown, alphabetical by category name.
    emit("### Files by Age\n")
    out.extend(f"- **{age.title()}**: {count}"
               for age, count in sorted(summary['by_age'].items()))
    emit("")

    # One subsection per filename pattern; file lists truncated to 10.
    emit("## File Patterns\n")
    for name, info in report['patterns'].items():
        emit(f"### {name.title()} ({info['count']} files)\n")
        out.extend(f"- `{path}`" for path in info['files'][:10])
        if info['count'] > 10:
            emit(f"- ... and {info['count'] - 10} more")
        emit("")

    # Misplaced files, capped at 50 entries.
    emit("## Misplaced Files\n")
    emit(f"Found **{len(report['misplaced'])}** misplaced files:\n")
    for entry in report['misplaced'][:50]:
        out.extend([
            f"- **{entry['path']}**",
            f"  - Current: `{entry['current']}`",
            f"  - Should be: `{entry['should_be']}`",
            f"  - Reason: {entry['reason']}\n",
        ])

    # Duplicate-content groups, capped at 20.
    emit("## Duplicate Content\n")
    emit(f"Found **{len(report['duplicates'])}** sets of duplicate files:\n")
    for group in report['duplicates'][:20]:
        emit(f"- **{group['count']} files** with same content:")
        out.extend(f"  - `{path}`" for path in group['files'])
        emit("")

    # Stale files, oldest first, capped at 50.
    emit("## Old Files (>90 days)\n")
    emit(f"Found **{len(report['old_files'])}** old files:\n")
    oldest_first = sorted(report['old_files'],
                          key=lambda e: e['days_old'], reverse=True)
    out.extend(
        f"- **{e['path']}** ({e['days_old']} days old, modified: {e['modified'][:10]})"
        for e in oldest_first[:50]
    )
    emit("")

    # Per-file issue lists, capped at 50 files.
    emit("## Files with Issues\n")
    emit(f"Found **{len(report['issues'])}** files with issues:\n")
    for item in report['issues'][:50]:
        emit(f"- **{item['path']}**")
        out.extend(f"  - {detail}" for detail in item['issues'])
    emit("")

    return "\n".join(out)


# Entry point: run the full analysis when executed as a script.
if __name__ == '__main__':
    main()