# Source: Sankofa/scripts/analyze-markdown.py
#!/usr/bin/env python3
"""
Markdown Analysis Script
Analyzes all Markdown files for duplicates and generates an index mapping content to files and line numbers.
"""
import os
import hashlib
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Set
import json
class MarkdownAnalyzer:
    """Analyze Markdown files under a root directory.

    Finds byte-identical duplicate files, builds a per-file index of
    headings/links/code fences with line numbers, and categorizes files
    by their top-level directory.
    """

    # Directory names that are skipped entirely while scanning.
    IGNORED_DIRS = {'node_modules', '.git', 'dist', 'build', '.next'}

    # Top-level directories that form their own category; anything else
    # that is not root/docs falls into 'other'.
    KNOWN_TOP_DIRS = {'api', 'portal', 'scripts', 'crossplane-provider-proxmox'}

    # Inline Markdown link: [text](url). Compiled once, reused per line.
    LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

    def __init__(self, root_dir: str = '.'):
        self.root_dir = Path(root_dir)
        self.md_files: List[Path] = []
        # rel_path -> per-file metadata (populated by index_content)
        self.content_index: Dict[str, Dict] = {}
        # content hash -> rel_paths of byte-identical files (>= 2 entries)
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        # category name -> rel_paths (populated by categorize_files)
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def find_all_markdown(self):
        """Collect all *.md files under root_dir, skipping ignored trees.

        Results are sorted so that every downstream report is deterministic
        regardless of filesystem enumeration order.
        """
        for md_file in sorted(self.root_dir.rglob('*.md')):
            # Skip node_modules, .git, and other generated/vendored trees.
            if any(part in self.IGNORED_DIRS for part in md_file.parts):
                continue
            self.md_files.append(md_file)

    def analyze_duplicates(self):
        """Group byte-identical files by content hash into self.duplicates."""
        content_hashes = defaultdict(list)
        for md_file in self.md_files:
            try:
                with open(md_file, 'rb') as f:
                    content = f.read()
                # MD5 is used only for duplicate detection, not security.
                content_hash = hashlib.md5(content).hexdigest()
                rel_path = str(md_file.relative_to(self.root_dir))
                content_hashes[content_hash].append(rel_path)
            except Exception as e:
                # Best-effort: report unreadable files but keep scanning.
                print(f"Error reading {md_file}: {e}")
        # Keep only hashes shared by two or more files.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Index each file: title, headings, code fences, and links,
        all with 1-based line numbers, plus line count and size."""
        for md_file in self.md_files:
            rel_path = str(md_file.relative_to(self.root_dir))
            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                title = None       # text of the first H1, if any
                headings = []      # {'level', 'text', 'line'}
                code_blocks = []   # one entry per ``` fence line (open AND close)
                links = []         # {'text', 'url', 'line'}

                for line_num, line in enumerate(lines, 1):
                    stripped = line.strip()
                    # Title is the first H1 encountered.
                    if not title and stripped.startswith('# '):
                        title = stripped[2:].strip()
                    # All ATX headings, levels 1-6.
                    heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped)
                    if heading_match:
                        headings.append({
                            'level': len(heading_match.group(1)),
                            'text': heading_match.group(2).strip(),
                            'line': line_num,
                        })
                    # Each fence line counts, so a complete block adds 2.
                    if stripped.startswith('```'):
                        code_blocks.append({'line': line_num, 'type': 'code_block'})
                    # Inline links [text](url) anywhere on the line.
                    for match in self.LINK_RE.finditer(line):
                        links.append({
                            'text': match.group(1),
                            'url': match.group(2),
                            'line': line_num,
                        })

                self.content_index[rel_path] = {
                    'path': rel_path,
                    'title': title,
                    'line_count': len(lines),
                    'headings': headings,
                    'code_blocks': len(code_blocks),
                    'links': links,
                    'size_bytes': md_file.stat().st_size,
                }
            except Exception as e:
                # Best-effort: report unindexable files but keep going.
                print(f"Error indexing {md_file}: {e}")

    def categorize_files(self):
        """Categorize files by top-level location into self.file_structure.

        Categories: 'root' for files at the top level, 'docs' for files
        directly in docs/, 'docs/<subdir>' for deeper docs files, a known
        top-level directory name, or 'other'.
        """
        for md_file in self.md_files:
            relative = md_file.relative_to(self.root_dir)
            # Path.parts is OS-independent, unlike splitting on '/'.
            parts = relative.parts
            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # Direct children of docs/ -> 'docs'; deeper -> 'docs/<subdir>'.
                # (Previously parts[1] was used unconditionally, so a file
                # docs/x.md was categorized as 'docs/x.md'.)
                category = f"docs/{parts[1]}" if len(parts) > 2 else 'docs'
            elif parts[0] in self.KNOWN_TOP_DIRS:
                category = parts[0]
            else:
                category = 'other'
            self.file_structure[category].append(str(relative))

    def generate_report(self) -> Dict:
        """Generate the comprehensive analysis report as a plain dict
        (JSON-serializable). Call after the analysis methods have run."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index,
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Return groups of files sharing the same (case-insensitive) title.

        These are potential duplicates even when the bytes differ.
        """
        similar = defaultdict(list)
        for rel_path, data in self.content_index.items():
            if data['title']:
                title_key = data['title'].lower().strip()
                similar[title_key].append(rel_path)
        return {k: v for k, v in similar.items() if len(v) > 1}
def main():
    """Run the full Markdown analysis pipeline from the current directory.

    Prints a human-readable summary to stdout and writes the detailed JSON
    index to docs/MARKDOWN_INDEX.json (creating docs/ if needed).

    Returns:
        (analyzer, report): the MarkdownAnalyzer instance and the report dict,
        useful when run interactively.
    """
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    # Generate report
    report = analyzer.generate_report()

    # Print summary
    print("\n" + "="*60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("="*60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        # Cap output at the first 10 duplicate groups.
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f" - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    # Cap output at the first 10 similar-title groups.
    for title, files in list(similar.items())[:10]:
        print(f"\n '{title}':")
        for f in files:
            print(f" - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f" {category}: {count} files")

    # Save detailed report. Ensure docs/ exists first — previously this
    # raised FileNotFoundError in a checkout without a docs/ directory.
    output_file = 'docs/MARKDOWN_INDEX.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report
# Script entry point: run the analysis; keep the returned analyzer/report
# bound at module level for interactive inspection (e.g. python -i).
if __name__ == '__main__':
    analyzer, report = main()