Files
Sankofa/scripts/generate-markdown-reference.py
defiQUG fe0365757a Update documentation structure and enhance .gitignore
- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files.
- Updated README links to reflect new documentation paths for better navigation.
- Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
2025-12-12 21:18:55 -08:00

265 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Generate Markdown Reference Index
Creates a comprehensive reference mapping Markdown content to source files and line numbers.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict
def extract_headings_with_lines(content: str) -> List[Dict]:
    """Extract all headings with their line numbers.

    Scans *content* line by line for ATX-style markdown headings
    (``#`` through ``######``) and records each one's level, text,
    and 1-based line number, in document order.
    """
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found = []
    for num, raw in enumerate(content.split('\n'), start=1):
        match = heading_re.match(raw.strip())
        if match is None:
            continue
        found.append({
            'level': len(match.group(1)),
            'text': match.group(2).strip(),
            'line': num
        })
    return found
def extract_code_references(content: str) -> List[Dict]:
    """Extract code references (file paths, function names, etc.).

    Recognizes four kinds of references per line: backticked file names
    with known source extensions, backticked function calls, markdown
    links, and backticked ALL_CAPS constants. Link entries carry
    ``text``/``target``; all others carry a single ``value``.
    """
    ref_patterns = (
        (re.compile(r'`([^`]+\.(ts|tsx|js|jsx|go|py|sql|yaml|yml|json))`'), 'file'),
        (re.compile(r'`([a-zA-Z_][a-zA-Z0-9_]*\([^)]*\))`'), 'function'),
        (re.compile(r'\[([^\]]+)\]\(([^\)]+)\)'), 'link'),
        (re.compile(r'`([A-Z_][A-Z0-9_]+)`'), 'constant'),
    )
    found = []
    for num, text in enumerate(content.split('\n'), start=1):
        for regex, kind in ref_patterns:
            for m in regex.finditer(text):
                entry = {'type': kind, 'line': num}
                if kind == 'link':
                    entry['text'] = m.group(1)
                    entry['target'] = m.group(2)
                else:
                    entry['value'] = m.group(1)
                found.append(entry)
    return found
def extract_sections(content: str, headings: List[Dict]) -> List[Dict]:
    """Extract content sections based on headings.

    Each section starts at its heading line and runs until the next
    heading of the same or a higher (smaller-number) level, or the end
    of the file.

    Args:
        content: Full markdown text.
        headings: Heading dicts ({'level', 'text', 'line'}) in document
            order, as produced by extract_headings_with_lines().

    Returns:
        List of section dicts with heading text/level, 1-based
        start/end line numbers, line count, and a content preview
        capped at 200 characters.
    """
    sections = []
    lines = content.split('\n')
    for i, heading in enumerate(headings):
        start_line = heading['line']
        # Default: the section runs to the end of the file.
        end_line = len(lines)
        # Scan *all* following headings, not just the immediate next one:
        # a section that contains deeper subsections still ends at the
        # next heading of the same or higher level, wherever it appears.
        # (The previous code checked only headings[i + 1], so a section
        # followed by a subsection wrongly extended to end of file.)
        for next_heading in headings[i + 1:]:
            if next_heading['level'] <= heading['level']:
                end_line = next_heading['line'] - 1
                break
        section_content = '\n'.join(lines[start_line - 1:end_line])
        sections.append({
            'heading': heading['text'],
            'level': heading['level'],
            'start_line': start_line,
            'end_line': end_line,
            'line_count': end_line - start_line + 1,
            'content_preview': section_content[:200] + '...' if len(section_content) > 200 else section_content
        })
    return sections
def generate_reference_mapping(index_file: str, output_file: str) -> Dict:
    """Generate comprehensive reference mapping.

    Reads a previously built markdown index (JSON with an 'index' key
    mapping file paths to per-file metadata), re-reads each markdown
    file from disk for detailed heading/section/code-reference
    analysis, and writes the combined mapping to *output_file* (JSON)
    plus a human-readable Markdown report alongside it.

    Args:
        index_file: Path to the existing index JSON (must contain 'index').
        output_file: Destination path for the reference-map JSON; the
            Markdown report uses the same path with '.json' -> '.md'.

    Returns:
        The in-memory reference map dict.
    """
    # Load existing index
    with open(index_file, 'r', encoding='utf-8') as f:
        index_data = json.load(f)
    reference_map = {
        'metadata': {
            'total_files': len(index_data['index']),
            # NOTE(review): this is the *script's* mtime, not the current
            # time — confirm that is intentional before relying on it.
            'generated_at': str(Path(__file__).stat().st_mtime)
        },
        'by_file': {},
        # defaultdicts serialize as plain JSON objects via json.dump below.
        'by_heading': defaultdict(list),
        'by_category': defaultdict(list),
        'cross_references': defaultdict(list)
    }
    # Process each file
    for file_path, file_data in index_data['index'].items():
        file_path_obj = Path(file_path)
        # Read full content for detailed analysis
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_content = f.read()
        except Exception as e:
            # Best-effort: skip unreadable files but keep processing the rest.
            print(f"Warning: Could not read {file_path}: {e}")
            continue
        # Extract detailed information
        headings = extract_headings_with_lines(full_content)
        code_refs = extract_code_references(full_content)
        sections = extract_sections(full_content, headings)
        # Categorize file by its top-level (and for docs/, second-level)
        # path component; anything unrecognized falls into 'other'.
        category = 'other'
        if file_path.startswith('docs/'):
            parts = file_path.split('/')
            if len(parts) > 1:
                if parts[1] in ['api', 'architecture', 'proxmox', 'runbooks', 'status', 'archive']:
                    category = parts[1]
                else:
                    category = 'docs'
            else:
                category = 'docs'
        elif file_path.startswith('api/'):
            category = 'api'
        elif file_path.startswith('portal/'):
            category = 'portal'
        # Build file entry combining the stored index metadata with the
        # freshly extracted headings/sections/code references.
        file_entry = {
            'path': file_path,
            'title': file_data.get('title', ''),
            'category': category,
            'line_count': file_data['line_count'],
            'size_bytes': file_data['size_bytes'],
            'headings': headings,
            'sections': sections,
            'code_references': code_refs,
            'links': file_data.get('links', []),
            'code_blocks': file_data.get('code_blocks', 0)
        }
        reference_map['by_file'][file_path] = file_entry
        # Index by heading (lower-cased so lookups are case-insensitive)
        for heading in headings:
            reference_map['by_heading'][heading['text'].lower()].append({
                'file': file_path,
                'line': heading['line'],
                'level': heading['level']
            })
        # Index by category
        reference_map['by_category'][category].append(file_path)
        # Extract cross-references (links to other markdown files)
        for link in file_data.get('links', []):
            link_target = link.get('url', '')
            # NOTE(review): '.md#' only matches a trailing '#' with no
            # fragment text; links like 'x.md#section' are NOT caught here
            # — confirm whether that is intended.
            if link_target.endswith('.md') or link_target.endswith('.md#'):
                # Normalize relative link targets against this file's directory.
                if link_target.startswith('./'):
                    link_target = str(file_path_obj.parent / link_target[2:])
                elif link_target.startswith('../'):
                    link_target = str(file_path_obj.parent.parent / link_target[3:])
                reference_map['cross_references'][file_path].append({
                    'target': link_target,
                    'text': link.get('text', ''),
                    'line': link.get('line', 0)
                })
    # Save reference mapping
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reference_map, f, indent=2, ensure_ascii=False)
    # Generate human-readable report next to the JSON output
    report_file = output_file.replace('.json', '.md')
    generate_markdown_report(reference_map, report_file)
    print(f"Reference mapping saved to: {output_file}")
    print(f"Human-readable report saved to: {report_file}")
    return reference_map
def generate_markdown_report(reference_map: Dict, output_file: str):
    """Write a human-readable Markdown report for *reference_map*.

    Produces three sections: files grouped by category (first 20 per
    category), the 50 most common headings (first 5 occurrences each),
    and per-file heading outlines for the first 30 files.
    """
    meta = reference_map['metadata']
    by_category = reference_map['by_category']
    by_file = reference_map['by_file']
    by_heading = reference_map['by_heading']
    with open(output_file, 'w', encoding='utf-8') as report:
        emit = report.write
        emit("# Markdown Reference Index\n\n")
        emit(f"**Generated**: {meta['generated_at']}\n")
        emit(f"**Total Files**: {meta['total_files']}\n\n")
        emit("---\n\n")
        # Section 1: files grouped by category.
        emit("## Files by Category\n\n")
        for cat in sorted(by_category.keys()):
            members = by_category[cat]
            emit(f"### {cat} ({len(members)} files)\n\n")
            for path in sorted(members)[:20]:
                entry = by_file[path]
                emit(f"- [{entry['title'] or path}](./{path}) - {entry['line_count']} lines\n")
            if len(members) > 20:
                emit(f" *... and {len(members) - 20} more files*\n")
            emit("\n")
        # Section 2: most common headings, ranked by occurrence count.
        emit("## Heading Index\n\n")
        emit("*Top 50 most common headings*\n\n")
        ranked = sorted(
            ((head, len(refs)) for head, refs in by_heading.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for head, count in ranked[:50]:
            occurrences = by_heading[head]
            emit(f"### {head} ({count} occurrences)\n\n")
            for occ in occurrences[:5]:
                emit(f"- Line {occ['line']}: [{occ['file']}](./{occ['file']}#{head.lower().replace(' ', '-')})\n")
            if len(occurrences) > 5:
                emit(f" *... and {len(occurrences) - 5} more occurrences*\n")
            emit("\n")
        # Section 3: per-file heading outlines.
        emit("## File Details\n\n")
        emit("*Files with headings and line numbers*\n\n")
        for path in sorted(by_file.keys())[:30]:
            entry = by_file[path]
            emit(f"### {path}\n\n")
            emit(f"**Title**: {entry['title'] or 'N/A'}\n")
            emit(f"**Lines**: {entry['line_count']}\n")
            emit(f"**Headings**: {len(entry['headings'])}\n\n")
            if entry['headings']:
                emit("**Headings**:\n")
                for head in entry['headings'][:10]:
                    pad = ' ' * (head['level'] - 1)
                    emit(f"{pad}- Line {head['line']}: {head['text']}\n")
                if len(entry['headings']) > 10:
                    emit(f" *... and {len(entry['headings']) - 10} more headings*\n")
            emit("\n")
if __name__ == '__main__':
    import sys

    # CLI: optional positional args override the default index and
    # output paths: generate-markdown-reference.py [index] [output].
    args = sys.argv[1:]
    index_file = args[0] if len(args) > 0 else 'docs/MARKDOWN_INDEX.json'
    output_file = args[1] if len(args) > 1 else 'docs/MARKDOWN_REFERENCE.json'
    reference_map = generate_reference_mapping(index_file, output_file)
    print("\nReference mapping generation complete!")