- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files. - Updated README links to reflect new documentation paths for better navigation. - Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
265 lines
10 KiB
Python
265 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate Markdown Reference Index
|
|
Creates a comprehensive reference mapping Markdown content to source files and line numbers.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
from collections import defaultdict
|
|
|
|
def extract_headings_with_lines(content: str) -> List[Dict]:
    """Locate every ATX-style Markdown heading in *content*.

    Returns one dict per heading with keys ``level`` (1-6, from the number
    of ``#``), ``text`` (heading text, stripped), and ``line`` (1-based).
    """
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found: List[Dict] = []
    for line_no, raw in enumerate(content.split('\n'), 1):
        m = heading_re.match(raw.strip())
        if m is None:
            continue
        hashes, title = m.groups()
        found.append({
            'level': len(hashes),
            'text': title.strip(),
            'line': line_no,
        })
    return found
|
|
|
|
def extract_code_references(content: str) -> List[Dict]:
    """Scan Markdown text for inline code references.

    Detects backticked file paths, backticked function calls, Markdown
    links, and backticked UPPER_CASE constants. Returns one dict per match
    with its 1-based line number; within a line, matches are emitted in
    pattern order (file, function, link, constant).
    """

    # Ordered (compiled regex, kind) pairs; order determines output order.
    ref_patterns = (
        (re.compile(r'`([^`]+\.(ts|tsx|js|jsx|go|py|sql|yaml|yml|json))`'), 'file'),
        (re.compile(r'`([a-zA-Z_][a-zA-Z0-9_]*\([^)]*\))`'), 'function'),
        (re.compile(r'\[([^\]]+)\]\(([^\)]+)\)'), 'link'),
        (re.compile(r'`([A-Z_][A-Z0-9_]+)`'), 'constant'),
    )

    found: List[Dict] = []
    for line_no, text in enumerate(content.split('\n'), 1):
        for regex, kind in ref_patterns:
            for hit in regex.finditer(text):
                if kind == 'link':
                    # Links carry both display text and target URL.
                    entry = {
                        'type': kind,
                        'text': hit.group(1),
                        'target': hit.group(2),
                        'line': line_no,
                    }
                else:
                    entry = {
                        'type': kind,
                        'value': hit.group(1),
                        'line': line_no,
                    }
                found.append(entry)

    return found
|
|
|
|
def extract_sections(content: str, headings: List[Dict]) -> List[Dict]:
    """Slice *content* into sections, one per heading.

    A section runs from its heading line to the line before the next
    heading of the same or higher level, or to the end of the file.

    Bug fix: the previous implementation only inspected the immediately
    following heading, so a heading followed by a deeper subheading was
    reported as running to the end of the file even when a later heading
    of the same level existed.

    Args:
        content: Full Markdown text.
        headings: Heading dicts as produced by ``extract_headings_with_lines``
            (must be in document order).

    Returns:
        One dict per heading with start/end line numbers (1-based,
        inclusive), line count, and a preview of the section content
        (truncated to 200 characters with an ellipsis).
    """
    sections = []
    lines = content.split('\n')

    for i, heading in enumerate(headings):
        start_line = heading['line']
        # Default: section runs to end of file.
        end_line = len(lines)
        # Scan forward for the first heading of same or higher level; it
        # begins the next section at this nesting depth.
        for later in headings[i + 1:]:
            if later['level'] <= heading['level']:
                end_line = later['line'] - 1
                break

        section_content = '\n'.join(lines[start_line - 1:end_line])

        sections.append({
            'heading': heading['text'],
            'level': heading['level'],
            'start_line': start_line,
            'end_line': end_line,
            'line_count': end_line - start_line + 1,
            'content_preview': section_content[:200] + '...' if len(section_content) > 200 else section_content
        })

    return sections
|
|
|
|
def generate_reference_mapping(index_file: str, output_file: str):
    """Build a comprehensive cross-reference map from a pre-built Markdown index.

    Reads the JSON index at *index_file* (expects a top-level ``index``
    mapping of file path -> metadata), re-reads each indexed file to extract
    headings, sections and code references, then writes the combined mapping
    to *output_file* (JSON) and a human-readable ``.md`` report next to it.

    Bug fix: Markdown links with anchors (e.g. ``guide.md#setup``) were not
    recognized as cross-references — the old check was
    ``endswith('.md') or endswith('.md#')``, which misses any non-empty
    anchor. The path portion before ``#`` is now tested instead.

    Args:
        index_file: Path to the JSON index produced by the indexing step.
        output_file: Destination path for the JSON reference map.

    Returns:
        The in-memory reference map dict.
    """

    # Load existing index
    with open(index_file, 'r', encoding='utf-8') as f:
        index_data = json.load(f)

    reference_map = {
        'metadata': {
            'total_files': len(index_data['index']),
            # NOTE(review): this is the mtime of *this script*, not the
            # generation wall-clock time — kept as-is for compatibility.
            'generated_at': str(Path(__file__).stat().st_mtime)
        },
        'by_file': {},
        'by_heading': defaultdict(list),
        'by_category': defaultdict(list),
        'cross_references': defaultdict(list)
    }

    # Process each file listed in the index.
    for file_path, file_data in index_data['index'].items():
        file_path_obj = Path(file_path)

        # Read full content for detailed analysis; skip unreadable files
        # (deliberately best-effort — a missing file should not abort the run).
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_content = f.read()
        except Exception as e:
            print(f"Warning: Could not read {file_path}: {e}")
            continue

        # Extract detailed information.
        headings = extract_headings_with_lines(full_content)
        code_refs = extract_code_references(full_content)
        sections = extract_sections(full_content, headings)

        # Categorize the file by its top-level directory; known docs
        # subdirectories get their own category.
        category = 'other'
        if file_path.startswith('docs/'):
            parts = file_path.split('/')
            if len(parts) > 1:
                if parts[1] in ['api', 'architecture', 'proxmox', 'runbooks', 'status', 'archive']:
                    category = parts[1]
                else:
                    category = 'docs'
            else:
                category = 'docs'
        elif file_path.startswith('api/'):
            category = 'api'
        elif file_path.startswith('portal/'):
            category = 'portal'

        # Build the per-file entry, merging index metadata with fresh analysis.
        file_entry = {
            'path': file_path,
            'title': file_data.get('title', ''),
            'category': category,
            'line_count': file_data['line_count'],
            'size_bytes': file_data['size_bytes'],
            'headings': headings,
            'sections': sections,
            'code_references': code_refs,
            'links': file_data.get('links', []),
            'code_blocks': file_data.get('code_blocks', 0)
        }

        reference_map['by_file'][file_path] = file_entry

        # Index by heading text (case-insensitive key).
        for heading in headings:
            reference_map['by_heading'][heading['text'].lower()].append({
                'file': file_path,
                'line': heading['line'],
                'level': heading['level']
            })

        # Index by category.
        reference_map['by_category'][category].append(file_path)

        # Extract cross-references (links to other markdown files).
        for link in file_data.get('links', []):
            link_target = link.get('url', '')
            # Match on the path portion only, so anchored links like
            # "guide.md#setup" are recognized too.
            target_path = link_target.split('#', 1)[0]
            if target_path.endswith('.md'):
                # Normalize relative targets against this file's directory.
                if link_target.startswith('./'):
                    link_target = str(file_path_obj.parent / link_target[2:])
                elif link_target.startswith('../'):
                    link_target = str(file_path_obj.parent.parent / link_target[3:])

                reference_map['cross_references'][file_path].append({
                    'target': link_target,
                    'text': link.get('text', ''),
                    'line': link.get('line', 0)
                })

    # Save reference mapping (defaultdicts serialize as plain JSON objects).
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reference_map, f, indent=2, ensure_ascii=False)

    # Generate human-readable report alongside the JSON.
    report_file = output_file.replace('.json', '.md')
    generate_markdown_report(reference_map, report_file)

    print(f"Reference mapping saved to: {output_file}")
    print(f"Human-readable report saved to: {report_file}")

    return reference_map
|
|
|
|
def generate_markdown_report(reference_map: Dict, output_file: str):
    """Render *reference_map* as a human-readable Markdown report at *output_file*."""

    with open(output_file, 'w', encoding='utf-8') as report:
        write = report.write

        # Header with generation metadata.
        write("# Markdown Reference Index\n\n")
        write(f"**Generated**: {reference_map['metadata']['generated_at']}\n")
        write(f"**Total Files**: {reference_map['metadata']['total_files']}\n\n")
        write("---\n\n")

        # Section 1: files grouped by category (first 20 per category).
        write("## Files by Category\n\n")
        by_category = reference_map['by_category']
        for category in sorted(by_category):
            files = by_category[category]
            write(f"### {category} ({len(files)} files)\n\n")
            for file_path in sorted(files)[:20]:
                entry = reference_map['by_file'][file_path]
                label = entry['title'] or file_path
                write(f"- [{label}](./{file_path}) - {entry['line_count']} lines\n")
            if len(files) > 20:
                write(f"  *... and {len(files) - 20} more files*\n")
            write("\n")

        # Section 2: the 50 most frequent headings, up to 5 locations each.
        write("## Heading Index\n\n")
        write("*Top 50 most common headings*\n\n")
        ranked = sorted(
            ((text, len(refs)) for text, refs in reference_map['by_heading'].items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for text, count in ranked[:50]:
            refs = reference_map['by_heading'][text]
            write(f"### {text} ({count} occurrences)\n\n")
            anchor = text.lower().replace(' ', '-')
            for ref in refs[:5]:
                write(f"- Line {ref['line']}: [{ref['file']}](./{ref['file']}#{anchor})\n")
            if len(refs) > 5:
                write(f"  *... and {len(refs) - 5} more occurrences*\n")
            write("\n")

        # Section 3: per-file detail for the first 30 files (alphabetical).
        write("## File Details\n\n")
        write("*Files with headings and line numbers*\n\n")

        for file_path in sorted(reference_map['by_file'])[:30]:
            entry = reference_map['by_file'][file_path]
            write(f"### {file_path}\n\n")
            write(f"**Title**: {entry['title'] or 'N/A'}\n")
            write(f"**Lines**: {entry['line_count']}\n")
            write(f"**Headings**: {len(entry['headings'])}\n\n")

            if entry['headings']:
                write("**Headings**:\n")
                for h in entry['headings'][:10]:
                    pad = ' ' * (h['level'] - 1)
                    write(f"{pad}- Line {h['line']}: {h['text']}\n")
                if len(entry['headings']) > 10:
                    write(f"  *... and {len(entry['headings']) - 10} more headings*\n")
                write("\n")
|
|
|
|
if __name__ == '__main__':
    import sys

    # Positional CLI args override the defaults:
    #   argv[1] -> index file, argv[2] -> output file.
    cli_args = sys.argv[1:]
    index_file = cli_args[0] if len(cli_args) > 0 else 'docs/MARKDOWN_INDEX.json'
    output_file = cli_args[1] if len(cli_args) > 1 else 'docs/MARKDOWN_REFERENCE.json'

    reference_map = generate_reference_mapping(index_file, output_file)
    print("\nReference mapping generation complete!")
|
|
|