- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files. - Updated README links to reflect new documentation paths for better navigation. - Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
214 lines
7.7 KiB
Python
214 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Markdown Analysis Script
|
|
Analyzes all Markdown files for duplicates and generates an index mapping content to files and line numbers.
|
|
"""
|
|
|
|
import os
|
|
import hashlib
|
|
import re
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple, Set
|
|
import json
|
|
|
|
class MarkdownAnalyzer:
    """Scan a directory tree for Markdown files and index their content.

    Builds three views of the tree:
      * ``duplicates``     -- groups of byte-identical files (keyed by MD5).
      * ``content_index``  -- per-file metadata: title, headings, links,
                              code-fence count, size and line count.
      * ``file_structure`` -- files bucketed by top-level directory.
    """

    # Directory names excluded from the scan wherever they appear in a path.
    IGNORED_PARTS = ('node_modules', '.git', 'dist', 'build', '.next')

    # [text](url) inline links — compiled once instead of per line.
    LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

    # One-to-six '#' characters followed by heading text.
    HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')

    def __init__(self, root_dir: str = '.'):
        self.root_dir = Path(root_dir)
        self.md_files: List[Path] = []
        self.content_index: Dict[str, Dict] = {}
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def find_all_markdown(self):
        """Collect every *.md file under root, skipping ignored directories."""
        for md_file in self.root_dir.rglob('*.md'):
            # Skip node_modules, .git, and other generated/vendored trees.
            if any(part in self.IGNORED_PARTS for part in md_file.parts):
                continue
            self.md_files.append(md_file)

    def analyze_duplicates(self):
        """Group byte-identical files into self.duplicates by content hash."""
        content_hashes = defaultdict(list)

        for md_file in self.md_files:
            try:
                with open(md_file, 'rb') as f:
                    content = f.read()
            except Exception as e:
                # Best-effort: report and keep scanning the remaining files.
                print(f"Error reading {md_file}: {e}")
                continue
            # MD5 is used only as a grouping key, not for security.
            content_hash = hashlib.md5(content).hexdigest()
            rel_path = str(md_file.relative_to(self.root_dir))
            content_hashes[content_hash].append(rel_path)

        # Keep only the hashes shared by two or more files.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Index each file: title (first H1), headings, links, code fences."""
        for md_file in self.md_files:
            rel_path = str(md_file.relative_to(self.root_dir))

            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                title = None       # text of the first H1, if any
                headings = []      # [{'level', 'text', 'line'}, ...]
                fence_lines = []   # every ``` delimiter line seen
                links = []         # [{'text', 'url', 'line'}, ...]

                for line_num, line in enumerate(lines, 1):
                    stripped = line.strip()

                    # Title = first level-1 heading encountered.
                    if not title and stripped.startswith('# '):
                        title = stripped[2:].strip()

                    heading_match = self.HEADING_RE.match(stripped)
                    if heading_match:
                        headings.append({
                            'level': len(heading_match.group(1)),
                            'text': heading_match.group(2).strip(),
                            'line': line_num,
                        })

                    # NOTE(review): this counts fence *delimiter* lines, so a
                    # complete fenced block contributes 2 to 'code_blocks'.
                    # Preserved as-is since downstream consumers may rely on it.
                    if stripped.startswith('```'):
                        fence_lines.append({'line': line_num, 'type': 'code_block'})

                    for match in self.LINK_RE.finditer(line):
                        links.append({
                            'text': match.group(1),
                            'url': match.group(2),
                            'line': line_num,
                        })

                self.content_index[rel_path] = {
                    'path': rel_path,
                    'title': title,
                    'line_count': len(lines),
                    'headings': headings,
                    'code_blocks': len(fence_lines),
                    'links': links,
                    'size_bytes': md_file.stat().st_size,
                }

            except Exception as e:
                print(f"Error indexing {md_file}: {e}")

    def categorize_files(self):
        """Bucket files into self.file_structure by top-level directory."""
        for md_file in self.md_files:
            rel = md_file.relative_to(self.root_dir)
            rel_path = str(rel)
            # Use pathlib's .parts instead of splitting the string on '/':
            # str(Path) uses the OS separator, so a '/' split misclassified
            # every nested file on Windows.
            parts = rel.parts

            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # Files nested under docs/<sub>/ get a per-subdirectory
                # bucket; files directly in docs/ now fall under plain
                # 'docs' (previously docs/README.md became its own
                # 'docs/README.md' category).
                category = f"docs/{parts[1]}" if len(parts) > 2 else 'docs'
            elif parts[0] in ('api', 'portal', 'scripts', 'crossplane-provider-proxmox'):
                category = parts[0]
            else:
                category = 'other'

            self.file_structure[category].append(rel_path)

    def generate_report(self) -> Dict:
        """Return the full analysis as a JSON-serializable dict."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index,
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Map normalized titles to files sharing them (potential duplicates).

        Only titles shared by two or more files are returned. Requires
        index_content() to have run first.
        """
        similar = defaultdict(list)

        for rel_path, data in self.content_index.items():
            if data['title']:
                # Case-insensitive comparison catches retitled copies.
                similar[data['title'].lower().strip()].append(rel_path)

        return {k: v for k, v in similar.items() if len(v) > 1}
|
|
|
|
def main():
    """Run the full Markdown analysis over the current directory.

    Prints a human-readable summary (duplicates, similar titles, category
    counts) and writes the detailed JSON index to docs/MARKDOWN_INDEX.json.

    Returns:
        (analyzer, report): the MarkdownAnalyzer instance and the report
        dict, so callers and interactive sessions can inspect the raw data.
    """
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    # Generate report
    report = analyzer.generate_report()

    # Print summary
    print("\n" + "=" * 60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("=" * 60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        # Cap output at the first 10 groups to keep the summary readable.
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n  Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f"    - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    for title, files in list(similar.items())[:10]:
        print(f"\n  '{title}':")
        for f in files:
            print(f"    - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f"  {category}: {count} files")

    # Save detailed report. Ensure the output directory exists first:
    # previously this raised FileNotFoundError on a checkout without docs/.
    output_file = 'docs/MARKDOWN_INDEX.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report
|
|
|
|
# Script entry point: keep the analyzer and report bound at module level so
# an interactive run (e.g. `python -i`) can inspect them after completion.
if __name__ == '__main__':
    analyzer, report = main()
|
|
|