#!/usr/bin/env python3
"""
Comprehensive Markdown File Analysis Script

Analyzes all markdown files in the project for:
- File dates (creation, modification)
- Duplicate patterns
- Misplaced files
- Content inconsistencies
- Outdated information
"""

import os
import re
import json
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import hashlib


class MarkdownAnalyzer:
    """Scan a directory tree for markdown files and report hygiene problems.

    Usage: ``MarkdownAnalyzer(root).analyze()`` returns a JSON-serializable
    report dict (datetimes are serialized by the caller with ``default=str``).
    """

    def __init__(self, root_dir: str):
        self.root_dir = Path(root_dir)
        self.files: List[Dict] = []
        # content-hash -> list of byte-identical files (only groups of >1)
        self.duplicates: Dict[str, List[Dict]] = defaultdict(list)
        self.patterns: Dict[str, List[Dict]] = {
            'complete': [],
            'final': [],
            'status': [],
            'timestamped': [],
            'fix': [],
            'report': [],
            'temporary': []
        }
        self.misplaced: List[Dict] = []
        # content-hash -> every file seen with that hash (including the first)
        self.content_hashes: Dict[str, List[Dict]] = {}
        self.file_metadata: List[Dict] = []

    def analyze(self) -> Dict:
        """Run the full analysis pipeline and return the report dict."""
        print("šŸ” Scanning markdown files...")
        self._scan_files()
        print(f"šŸ“Š Found {len(self.files)} markdown files")

        print("\nšŸ“… Analyzing file dates...")
        self._analyze_dates()

        print("\nšŸ”Ž Identifying patterns...")
        self._identify_patterns()

        print("\nšŸ“ Finding misplaced files...")
        self._find_misplaced()

        print("\nšŸ”— Checking for duplicates...")
        self._check_duplicates()

        print("\nšŸ“ Analyzing content...")
        self._analyze_content()

        return self._generate_report()

    def _scan_files(self):
        """Collect metadata for every ``*.md`` file, skipping vendored dirs."""
        exclude_dirs = {'.git', 'node_modules', '__pycache__', '.next',
                        'dist', 'build', 'venv', '.venv'}

        for md_file in self.root_dir.rglob('*.md'):
            # Skip excluded directories anywhere along the path
            if any(part in exclude_dirs for part in md_file.parts):
                continue

            try:
                stat = md_file.stat()
                rel_path = md_file.relative_to(self.root_dir)
                # FIX: original tested hasattr(stat, 'st_birthtime') but then
                # read st_ctime anyway; use the real birth time when the
                # platform provides it (macOS/BSD; None elsewhere).
                birthtime = getattr(stat, 'st_birthtime', None)
                self.files.append({
                    'path': str(rel_path),
                    'full_path': str(md_file),
                    'size': stat.st_size,
                    'modified': datetime.fromtimestamp(stat.st_mtime),
                    'accessed': datetime.fromtimestamp(stat.st_atime),
                    'created': (datetime.fromtimestamp(birthtime)
                                if birthtime is not None else None),
                    'directory': str(rel_path.parent),
                    'name': md_file.name
                })
            except (OSError, PermissionError) as e:
                print(f"āš ļø Error accessing {md_file}: {e}")

    def _analyze_dates(self):
        """Annotate each file with its age in days and an age bucket."""
        now = datetime.now()
        for file_info in self.files:
            days_old = (now - file_info['modified']).days
            file_info['days_old'] = days_old
            # FIX: original had two identical 'recent' branches (<7 and <30);
            # collapsed into one — categorization is unchanged.
            file_info['age_category'] = (
                'recent' if days_old < 30 else
                'moderate' if days_old < 90 else
                'old' if days_old < 365 else
                'very_old'
            )

    def _identify_patterns(self):
        """Bucket files whose *names* match known churn/status patterns."""
        patterns = {
            'complete': re.compile(r'COMPLETE', re.I),
            'final': re.compile(r'FINAL', re.I),
            'status': re.compile(r'STATUS', re.I),
            'timestamped': re.compile(r'_\d{8}_\d{6}|\d{8}_\d{6}'),
            'fix': re.compile(r'FIX|QUICK_FIX|RUN_NOW|EXECUTE', re.I),
            'report': re.compile(r'REPORT|SUMMARY|ANALYSIS|DIAGNOSTIC', re.I),
            'temporary': re.compile(r'NOW|READY|EXECUTE|RUN_', re.I)
        }

        for file_info in self.files:
            name = file_info['name']
            for pattern_name, pattern in patterns.items():
                if pattern.search(name):
                    self.patterns[pattern_name].append(file_info)
                    file_info[f'has_{pattern_name}'] = True

    def _find_misplaced(self):
        """Flag files that live in the wrong directory by project convention."""
        root_files = [f for f in self.files if f['directory'] == '.']
        docs_files = [f for f in self.files if f['directory'].startswith('docs')]
        reports_files = [f for f in self.files
                         if f['directory'].startswith('reports')]

        # Reports in root
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in
                   ('REPORT', 'STATUS', 'INVENTORY', 'DIAGNOSTIC', 'ANALYSIS')):
                # README / PROJECT_STRUCTURE legitimately live in the root
                if f['name'] not in ('README.md', 'PROJECT_STRUCTURE.md'):
                    self.misplaced.append({
                        'file': f,
                        'current': 'root',
                        'should_be': 'reports/',
                        'reason': 'Report file in root directory'
                    })

        # Status/completion files in docs
        for f in docs_files:
            if any(keyword in f['name'].upper() for keyword in
                   ('COMPLETE', 'FINAL', 'STATUS', 'MIGRATION_COMPLETE')):
                self.misplaced.append({
                    'file': f,
                    'current': f['directory'],
                    'should_be': 'reports/',
                    'reason': 'Status/completion report in docs directory'
                })

        # Temporary fix guides in project root
        for f in root_files:
            if any(keyword in f['name'].upper() for keyword in
                   ('FIX_', 'QUICK_FIX', 'RUN_NOW', 'EXECUTE')):
                self.misplaced.append({
                    'file': f,
                    'current': 'root',
                    'should_be': 'docs/09-troubleshooting/archive/',
                    'reason': 'Temporary fix guide in root'
                })

    def _check_duplicates(self):
        """Group files by content hash; groups of >1 are duplicate sets.

        FIX: the original kept the *first* file of each hash only in
        ``content_hashes`` and later copies only in ``duplicates``, so a
        simple pair of identical files produced a "duplicates" group of
        length 1 which the report then filtered out — duplicates were never
        reported correctly. Every file now lands in the same group.
        """
        for file_info in self.files:
            try:
                with open(file_info['full_path'], 'rb') as f:
                    # md5 is fine here: dedup fingerprint, not security
                    content_hash = hashlib.md5(f.read()).hexdigest()
            except OSError:
                continue  # unreadable file — best-effort, skip
            self.content_hashes.setdefault(content_hash, []).append(file_info)

        for content_hash, group in self.content_hashes.items():
            if len(group) > 1:
                self.duplicates[content_hash] = group

    def _analyze_content(self):
        """Read each file and record line count plus simple content flags."""
        for file_info in self.files:
            try:
                with open(file_info['full_path'], 'r',
                          encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                lines = content.split('\n')
                file_info['line_count'] = len(lines)
                file_info['has_todo'] = 'TODO' in content or 'FIXME' in content
                file_info['has_deprecated'] = ('DEPRECATED' in content
                                               or 'OBSOLETE' in content)
                file_info['has_date'] = bool(re.search(
                    r'\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}', content))

                # Unexpanded shell date substitutions left in the document
                if re.search(r'\$\(date\)|date \+', content):
                    file_info['has_placeholder_date'] = True
            except OSError as e:
                file_info['line_count'] = 0
                file_info['error'] = str(e)

    def _generate_report(self) -> Dict:
        """Assemble the final report dict from all collected analysis data."""
        report = {
            'summary': {
                'total_files': len(self.files),
                'total_size_mb': sum(f['size'] for f in self.files) / (1024 * 1024),
                'by_age': defaultdict(int),
                'by_directory': defaultdict(int)
            },
            'patterns': {},
            'misplaced': [],
            'duplicates': [],
            'old_files': [],
            'empty_files': [],
            'issues': []
        }

        # Summary stats
        for f in self.files:
            report['summary']['by_age'][f['age_category']] += 1
            report['summary']['by_directory'][f['directory']] += 1

        # Pattern counts
        for pattern_name, files in self.patterns.items():
            report['patterns'][pattern_name] = {
                'count': len(files),
                'files': [f['path'] for f in files[:20]]  # Limit to 20
            }

        # Misplaced files
        report['misplaced'] = [
            {
                'path': m['file']['path'],
                'current': m['current'],
                'should_be': m['should_be'],
                'reason': m['reason']
            }
            for m in self.misplaced
        ]

        # Duplicate content (each group already has >1 member)
        for hash_val, files in self.duplicates.items():
            if len(files) > 1:
                report['duplicates'].append({
                    'hash': hash_val[:8],
                    'count': len(files),
                    'files': [f['path'] for f in files]
                })

        # Old files (>90 days)
        report['old_files'] = [
            {
                'path': f['path'],
                'days_old': f['days_old'],
                'modified': f['modified'].isoformat()
            }
            for f in self.files if f['days_old'] > 90
        ]

        # Empty or very small files
        report['empty_files'] = [
            {
                'path': f['path'],
                'size': f['size'],
                'line_count': f.get('line_count', 0)
            }
            for f in self.files
            if f['size'] < 100 or f.get('line_count', 0) < 5
        ]

        # Per-file issue list
        for f in self.files:
            issues = []
            if f.get('has_placeholder_date'):
                issues.append('Contains placeholder date')
            if f.get('has_deprecated'):
                issues.append('Marks itself as deprecated')
            if f['days_old'] > 365:
                issues.append('Very old (>1 year)')
            if f['size'] < 50:
                issues.append('Very small file')
            if issues:
                report['issues'].append({
                    'path': f['path'],
                    'issues': issues
                })

        return report


def main():
    """Analyze the project (parent of this script's directory) and save reports."""
    root_dir = Path(__file__).parent.parent
    analyzer = MarkdownAnalyzer(root_dir)
    report = analyzer.analyze()

    # Save JSON report (default=str covers datetime values)
    json_file = root_dir / 'MARKDOWN_ANALYSIS.json'
    with open(json_file, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\nāœ… JSON report saved to: {json_file}")

    # Generate markdown report
    md_file = root_dir / 'MARKDOWN_ANALYSIS_REPORT.md'
    with open(md_file, 'w') as f:
        f.write(generate_markdown_report(report))
    print(f"āœ… Markdown report saved to: {md_file}")

    return report


def generate_markdown_report(report: Dict) -> str:
    """Render the report dict as a human-readable markdown document."""
    md = []
    md.append("# Markdown Files Analysis Report\n")
    md.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Summary
    md.append("## Summary\n")
    md.append(f"- **Total Files**: {report['summary']['total_files']}")
    md.append(f"- **Total Size**: {report['summary']['total_size_mb']:.2f} MB\n")

    md.append("### Files by Age\n")
    for age, count in sorted(report['summary']['by_age'].items()):
        md.append(f"- **{age.title()}**: {count}")
    md.append("")

    # Patterns
    md.append("## File Patterns\n")
    for pattern_name, data in report['patterns'].items():
        md.append(f"### {pattern_name.title()} ({data['count']} files)\n")
        for file_path in data['files'][:10]:
            md.append(f"- `{file_path}`")
        if data['count'] > 10:
            md.append(f"- ... and {data['count'] - 10} more")
        md.append("")

    # Misplaced files
    md.append("## Misplaced Files\n")
    md.append(f"Found **{len(report['misplaced'])}** misplaced files:\n")
    for m in report['misplaced'][:50]:
        md.append(f"- **{m['path']}**")
        md.append(f"  - Current: `{m['current']}`")
        md.append(f"  - Should be: `{m['should_be']}`")
        md.append(f"  - Reason: {m['reason']}\n")

    # Duplicates
    md.append("## Duplicate Content\n")
    md.append(f"Found **{len(report['duplicates'])}** sets of duplicate files:\n")
    for dup in report['duplicates'][:20]:
        md.append(f"- **{dup['count']} files** with same content:")
        for file_path in dup['files']:
            md.append(f"  - `{file_path}`")
        md.append("")

    # Old files
    md.append("## Old Files (>90 days)\n")
    md.append(f"Found **{len(report['old_files'])}** old files:\n")
    for f in sorted(report['old_files'],
                    key=lambda x: x['days_old'], reverse=True)[:50]:
        md.append(f"- **{f['path']}** ({f['days_old']} days old, "
                  f"modified: {f['modified'][:10]})")
    md.append("")

    # Issues
    md.append("## Files with Issues\n")
    md.append(f"Found **{len(report['issues'])}** files with issues:\n")
    for issue in report['issues'][:50]:
        md.append(f"- **{issue['path']}**")
        for i in issue['issues']:
            md.append(f"  - {i}")
    md.append("")

    return "\n".join(md)


if __name__ == '__main__':
    main()