#!/usr/bin/env python3
"""
Generate Markdown Reference Index

Creates a comprehensive reference mapping Markdown content to source files
and line numbers.
"""

import json
import posixpath
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# ATX heading: one to six '#' characters, whitespace, then the heading text.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')

# Start of a fenced code block delimiter: three or more backticks or tildes.
_FENCE_RE = re.compile(r'^(`{3,}|~{3,})')

# (compiled pattern, reference type) pairs, applied to every line in this order.
_REFERENCE_PATTERNS = [
    (re.compile(r'`([^`]+\.(ts|tsx|js|jsx|go|py|sql|yaml|yml|json))`'), 'file'),
    (re.compile(r'`([a-zA-Z_][a-zA-Z0-9_]*\([^)]*\))`'), 'function'),
    (re.compile(r'\[([^\]]+)\]\(([^\)]+)\)'), 'link'),
    (re.compile(r'`([A-Z_][A-Z0-9_]+)`'), 'constant'),
]

# Sub-directories of docs/ that get their own category in the report.
_DOC_CATEGORIES = frozenset(
    ['api', 'architecture', 'proxmox', 'runbooks', 'status', 'archive']
)

_DEFAULT_INDEX_FILE = 'docs/MARKDOWN_INDEX.json'
_DEFAULT_OUTPUT_FILE = 'docs/MARKDOWN_REFERENCE.json'


def extract_headings_with_lines(content: str) -> List[Dict]:
    """Extract all headings with their line numbers.

    Lines inside fenced code blocks (``` or ~~~) are skipped, so shell or
    Python comments in code samples are not reported as headings.

    Args:
        content: Full Markdown text.

    Returns:
        One dict per heading with keys ``level`` (1-6), ``text`` and ``line``
        (1-based line number), in document order.
    """
    headings = []
    open_fence = None  # fence marker that opened the current code block, if any

    for line_num, line in enumerate(content.split('\n'), 1):
        stripped = line.strip()

        fence = _FENCE_RE.match(stripped)
        if fence:
            marker = fence.group(1)
            rest = stripped[len(marker):]
            if open_fence is None:
                # A backtick fence may not carry backticks in its info string;
                # such a line is inline code (e.g. ```x```), not a fence.
                if not (marker[0] == '`' and '`' in rest):
                    open_fence = marker
                    continue
            elif (marker[0] == open_fence[0]
                  and len(marker) >= len(open_fence)
                  and not rest):
                # Closing fence: same character, at least as long, nothing after.
                open_fence = None
                continue

        if open_fence is not None:
            continue

        match = _HEADING_RE.match(stripped)
        if match:
            headings.append({
                'level': len(match.group(1)),
                'text': match.group(2).strip(),
                'line': line_num,
            })

    return headings


def extract_code_references(content: str) -> List[Dict]:
    """Extract code references (file paths, function names, etc.).

    Args:
        content: Full Markdown text.

    Returns:
        A list of dicts. Link references carry ``type``, ``text``, ``target``
        and ``line``; all other references carry ``type``, ``value`` and
        ``line``. Entries are ordered by line, then by pattern order
        (file, function, link, constant).
    """
    references = []

    for line_num, line in enumerate(content.split('\n'), 1):
        for pattern, ref_type in _REFERENCE_PATTERNS:
            for match in pattern.finditer(line):
                if ref_type == 'link':
                    references.append({
                        'type': ref_type,
                        'text': match.group(1),
                        'target': match.group(2),
                        'line': line_num,
                    })
                else:
                    references.append({
                        'type': ref_type,
                        'value': match.group(1),
                        'line': line_num,
                    })

    return references


def extract_sections(content: str, headings: List[Dict]) -> List[Dict]:
    """Extract content sections based on headings.

    A section starts at its heading and ends on the line before the next
    heading of the same or a higher level (smaller or equal ``level``), or at
    the end of the content when there is no such heading. Deeper sub-headings
    therefore belong to the enclosing section.

    Args:
        content: Full Markdown text.
        headings: Headings as returned by ``extract_headings_with_lines``.

    Returns:
        One dict per heading with keys ``heading``, ``level``, ``start_line``,
        ``end_line``, ``line_count`` and ``content_preview`` (at most 200
        characters followed by ``...`` when truncated).
    """
    sections = []
    lines = content.split('\n')

    for i, heading in enumerate(headings):
        start_line = heading['line']

        # Scan forward past nested sub-headings to the first heading that
        # actually closes this section.
        end_line = len(lines)
        for later in headings[i + 1:]:
            if later['level'] <= heading['level']:
                end_line = later['line'] - 1
                break

        section_content = '\n'.join(lines[start_line - 1:end_line])
        if len(section_content) > 200:
            preview = section_content[:200] + '...'
        else:
            preview = section_content

        sections.append({
            'heading': heading['text'],
            'level': heading['level'],
            'start_line': start_line,
            'end_line': end_line,
            'line_count': end_line - start_line + 1,
            'content_preview': preview,
        })

    return sections


def _categorize(file_path: str) -> str:
    """Return the report category for a '/'-separated repository path."""
    parts = file_path.split('/')
    if len(parts) > 1:
        top = parts[0]
        if top == 'docs':
            return parts[1] if parts[1] in _DOC_CATEGORIES else 'docs'
        if top in ('api', 'portal'):
            return top
    return 'other'


def _resolve_markdown_link(source_file: str, url: str) -> Optional[str]:
    """Return the normalized target of a Markdown-file link, else None.

    The ``#fragment`` part is ignored when deciding whether the link points at
    a ``.md`` file and is re-attached to the result. Targets starting with
    ``./`` or ``../`` are resolved against the directory of ``source_file``
    using POSIX separators so they match the keys of the index.
    """
    path, sep, fragment = url.partition('#')
    if not path.endswith('.md'):
        return None
    if path.startswith(('./', '../')):
        base = Path(source_file).parent.as_posix()
        path = posixpath.normpath(posixpath.join(base, path))
    return path + sep + fragment


def _report_path_for(output_file: str) -> str:
    """Return the Markdown report path that accompanies ``output_file``.

    Only a trailing ``.json`` is swapped for ``.md``; any other name gets
    ``.md`` appended so the report can never overwrite the JSON output.
    """
    if output_file.lower().endswith('.json'):
        return output_file[:-len('.json')] + '.md'
    return output_file + '.md'


def generate_reference_mapping(index_file: str, output_file: str) -> Dict:
    """Generate comprehensive reference mapping.

    Reads the JSON index at ``index_file``, analyses every file listed under
    its ``index`` key, writes the resulting mapping as JSON to ``output_file``
    and writes a human-readable Markdown report next to it.

    Args:
        index_file: Path to the existing JSON index.
        output_file: Path of the JSON reference mapping to write.

    Returns:
        The reference mapping that was written to ``output_file``.

    Raises:
        OSError: If the index cannot be read or an output cannot be written.
        KeyError: If the index or one of its entries lacks a required key
            (``index``, ``line_count``, ``size_bytes``).
    """
    # Load existing index
    with open(index_file, 'r', encoding='utf-8') as f:
        index_data = json.load(f)

    reference_map = {
        'metadata': {
            'total_files': len(index_data['index']),
            # Time of this run, not the modification time of the script.
            'generated_at': datetime.now(timezone.utc).isoformat(
                timespec='seconds'
            ),
        },
        'by_file': {},
        'by_heading': defaultdict(list),
        'by_category': defaultdict(list),
        'cross_references': defaultdict(list),
    }

    # Process each file
    for file_path, file_data in index_data['index'].items():
        # Read full content for detailed analysis; unreadable files are
        # reported and skipped so one bad entry does not abort the run.
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_content = f.read()
        except (OSError, ValueError) as e:
            print(f"Warning: Could not read {file_path}: {e}")
            continue

        # Extract detailed information
        headings = extract_headings_with_lines(full_content)
        code_refs = extract_code_references(full_content)
        sections = extract_sections(full_content, headings)
        category = _categorize(file_path)
        links = file_data.get('links', [])

        # Build file entry
        reference_map['by_file'][file_path] = {
            'path': file_path,
            'title': file_data.get('title', ''),
            'category': category,
            'line_count': file_data['line_count'],
            'size_bytes': file_data['size_bytes'],
            'headings': headings,
            'sections': sections,
            'code_references': code_refs,
            'links': links,
            'code_blocks': file_data.get('code_blocks', 0),
        }

        # Index by heading
        for heading in headings:
            reference_map['by_heading'][heading['text'].lower()].append({
                'file': file_path,
                'line': heading['line'],
                'level': heading['level'],
            })

        # Index by category
        reference_map['by_category'][category].append(file_path)

        # Extract cross-references (links to other markdown files)
        for link in links:
            target = _resolve_markdown_link(file_path, link.get('url') or '')
            if target is None:
                continue
            reference_map['cross_references'][file_path].append({
                'target': target,
                'text': link.get('text', ''),
                'line': link.get('line', 0),
            })

    # Save reference mapping
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reference_map, f, indent=2, ensure_ascii=False)

    # Generate human-readable report
    report_file = _report_path_for(output_file)
    generate_markdown_report(reference_map, report_file)

    print(f"Reference mapping saved to: {output_file}")
    print(f"Human-readable report saved to: {report_file}")

    return reference_map


def generate_markdown_report(reference_map: Dict, output_file: str):
    """Generate human-readable Markdown report.

    Writes three sections: files grouped by category (first 20 per category),
    the 50 most frequent headings (first 5 occurrences each) and details for
    the first 30 files in path order (first 10 headings each).

    Args:
        reference_map: Mapping as built by ``generate_reference_mapping``.
        output_file: Path of the Markdown report to write.
    """
    by_file = reference_map['by_file']
    by_heading = reference_map['by_heading']
    by_category = reference_map['by_category']

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Markdown Reference Index\n\n")
        f.write(f"**Generated**: {reference_map['metadata']['generated_at']}\n")
        f.write(f"**Total Files**: {reference_map['metadata']['total_files']}\n\n")
        f.write("---\n\n")

        # Files by category
        f.write("## Files by Category\n\n")
        for category in sorted(by_category):
            files = by_category[category]
            f.write(f"### {category} ({len(files)} files)\n\n")
            for file_path in sorted(files)[:20]:
                file_entry = by_file[file_path]
                f.write(f"- [{file_entry['title'] or file_path}](./{file_path}) - {file_entry['line_count']} lines\n")
            if len(files) > 20:
                f.write(f" *... and {len(files) - 20} more files*\n")
            f.write("\n")

        # Heading index
        f.write("## Heading Index\n\n")
        f.write("*Top 50 most common headings*\n\n")

        # Stable sort: headings with equal counts keep their insertion order.
        heading_counts = sorted(
            ((h, len(refs)) for h, refs in by_heading.items()),
            key=lambda item: item[1],
            reverse=True,
        )
        for heading, count in heading_counts[:50]:
            refs = by_heading[heading]
            f.write(f"### {heading} ({count} occurrences)\n\n")
            for ref in refs[:5]:
                f.write(f"- Line {ref['line']}: [{ref['file']}](./{ref['file']}#{heading.lower().replace(' ', '-')})\n")
            if len(refs) > 5:
                f.write(f" *... and {len(refs) - 5} more occurrences*\n")
            f.write("\n")

        # File details
        f.write("## File Details\n\n")
        f.write("*Files with headings and line numbers*\n\n")

        for file_path in sorted(by_file)[:30]:
            file_entry = by_file[file_path]
            file_headings = file_entry['headings']
            f.write(f"### {file_path}\n\n")
            f.write(f"**Title**: {file_entry['title'] or 'N/A'}\n")
            f.write(f"**Lines**: {file_entry['line_count']}\n")
            f.write(f"**Headings**: {len(file_headings)}\n\n")

            if file_headings:
                f.write("**Headings**:\n")
                for heading in file_headings[:10]:
                    indent = ' ' * (heading['level'] - 1)
                    f.write(f"{indent}- Line {heading['line']}: {heading['text']}\n")
                if len(file_headings) > 10:
                    f.write(f" *... and {len(file_headings) - 10} more headings*\n")
                f.write("\n")


def main(argv: Optional[List[str]] = None) -> None:
    """Run the generator; optional args are the index and output paths."""
    args = sys.argv[1:] if argv is None else argv
    index_file = args[0] if len(args) > 0 else _DEFAULT_INDEX_FILE
    output_file = args[1] if len(args) > 1 else _DEFAULT_OUTPUT_FILE

    generate_reference_mapping(index_file, output_file)
    print("\nReference mapping generation complete!")


if __name__ == '__main__':
    main()