Files
Sankofa/scripts/generate-markdown-reference.py
defiQUG fe0365757a Update documentation structure and enhance .gitignore
- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files.
- Updated README links to reflect new documentation paths for better navigation.
- Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
2025-12-12 21:18:55 -08:00

265 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Generate Markdown Reference Index
Creates a comprehensive reference mapping Markdown content to source files and line numbers.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple
from collections import defaultdict
def extract_headings_with_lines(content: str) -> List[Dict]:
    """Extract all headings with their line numbers.

    Scans *content* line by line for ATX-style markdown headings
    (``#`` through ``######``) and records each one's level, text,
    and 1-based line number, in document order.
    """
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found = []
    for num, raw in enumerate(content.split('\n'), start=1):
        match = heading_re.match(raw.strip())
        if match is None:
            continue
        found.append({
            'level': len(match.group(1)),
            'text': match.group(2).strip(),
            'line': num
        })
    return found
def extract_code_references(content: str) -> List[Dict]:
    """Extract code references (file paths, function names, etc.).

    Recognizes four kinds of references per line: backticked file names
    with known source extensions, backticked function calls, markdown
    links, and backticked ALL_CAPS constants. Link entries carry
    ``text``/``target``; all others carry a single ``value``.
    """
    ref_patterns = (
        (re.compile(r'`([^`]+\.(ts|tsx|js|jsx|go|py|sql|yaml|yml|json))`'), 'file'),
        (re.compile(r'`([a-zA-Z_][a-zA-Z0-9_]*\([^)]*\))`'), 'function'),
        (re.compile(r'\[([^\]]+)\]\(([^\)]+)\)'), 'link'),
        (re.compile(r'`([A-Z_][A-Z0-9_]+)`'), 'constant'),
    )
    found = []
    for num, text in enumerate(content.split('\n'), start=1):
        for regex, kind in ref_patterns:
            for m in regex.finditer(text):
                entry = {'type': kind, 'line': num}
                if kind == 'link':
                    entry['text'] = m.group(1)
                    entry['target'] = m.group(2)
                else:
                    entry['value'] = m.group(1)
                found.append(entry)
    return found
def extract_sections(content: str, headings: List[Dict]) -> List[Dict]:
    """Extract content sections based on headings.

    Each section starts at its heading line and runs until the next
    heading of the same or a higher (smaller-number) level, or the end
    of the file.

    Args:
        content: Full markdown text.
        headings: Heading dicts ({'level', 'text', 'line'}) in document
            order, as produced by extract_headings_with_lines().

    Returns:
        List of section dicts with heading text/level, 1-based
        start/end line numbers, line count, and a content preview
        capped at 200 characters.
    """
    sections = []
    lines = content.split('\n')
    for i, heading in enumerate(headings):
        start_line = heading['line']
        # Default: the section runs to the end of the file.
        end_line = len(lines)
        # Scan *all* following headings, not just the immediate next one:
        # a section that contains deeper subsections still ends at the
        # next heading of the same or higher level, wherever it appears.
        # (The previous code checked only headings[i + 1], so a section
        # followed by a subsection wrongly extended to end of file.)
        for next_heading in headings[i + 1:]:
            if next_heading['level'] <= heading['level']:
                end_line = next_heading['line'] - 1
                break
        section_content = '\n'.join(lines[start_line - 1:end_line])
        sections.append({
            'heading': heading['text'],
            'level': heading['level'],
            'start_line': start_line,
            'end_line': end_line,
            'line_count': end_line - start_line + 1,
            'content_preview': section_content[:200] + '...' if len(section_content) > 200 else section_content
        })
    return sections
def generate_reference_mapping(index_file: str, output_file: str) -> Dict:
    """Generate comprehensive reference mapping.

    Reads a previously built markdown index (JSON with an 'index' key
    mapping file paths to per-file metadata), re-reads each markdown
    file from disk for detailed heading/section/code-reference
    analysis, and writes the combined mapping to *output_file* (JSON)
    plus a human-readable Markdown report alongside it.

    Args:
        index_file: Path to the existing index JSON (must contain 'index').
        output_file: Destination path for the reference-map JSON; the
            Markdown report uses the same path with '.json' -> '.md'.

    Returns:
        The in-memory reference map dict.
    """
    # Load existing index
    with open(index_file, 'r', encoding='utf-8') as f:
        index_data = json.load(f)
    reference_map = {
        'metadata': {
            'total_files': len(index_data['index']),
            # NOTE(review): this is the *script's* mtime, not the current
            # time — confirm that is intentional before relying on it.
            'generated_at': str(Path(__file__).stat().st_mtime)
        },
        'by_file': {},
        # defaultdicts serialize as plain JSON objects via json.dump below.
        'by_heading': defaultdict(list),
        'by_category': defaultdict(list),
        'cross_references': defaultdict(list)
    }
    # Process each file
    for file_path, file_data in index_data['index'].items():
        file_path_obj = Path(file_path)
        # Read full content for detailed analysis
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                full_content = f.read()
        except Exception as e:
            # Best-effort: skip unreadable files but keep processing the rest.
            print(f"Warning: Could not read {file_path}: {e}")
            continue
        # Extract detailed information
        headings = extract_headings_with_lines(full_content)
        code_refs = extract_code_references(full_content)
        sections = extract_sections(full_content, headings)
        # Categorize file by its top-level (and for docs/, second-level)
        # path component; anything unrecognized falls into 'other'.
        category = 'other'
        if file_path.startswith('docs/'):
            parts = file_path.split('/')
            if len(parts) > 1:
                if parts[1] in ['api', 'architecture', 'proxmox', 'runbooks', 'status', 'archive']:
                    category = parts[1]
                else:
                    category = 'docs'
            else:
                category = 'docs'
        elif file_path.startswith('api/'):
            category = 'api'
        elif file_path.startswith('portal/'):
            category = 'portal'
        # Build file entry combining the stored index metadata with the
        # freshly extracted headings/sections/code references.
        file_entry = {
            'path': file_path,
            'title': file_data.get('title', ''),
            'category': category,
            'line_count': file_data['line_count'],
            'size_bytes': file_data['size_bytes'],
            'headings': headings,
            'sections': sections,
            'code_references': code_refs,
            'links': file_data.get('links', []),
            'code_blocks': file_data.get('code_blocks', 0)
        }
        reference_map['by_file'][file_path] = file_entry
        # Index by heading (lower-cased so lookups are case-insensitive)
        for heading in headings:
            reference_map['by_heading'][heading['text'].lower()].append({
                'file': file_path,
                'line': heading['line'],
                'level': heading['level']
            })
        # Index by category
        reference_map['by_category'][category].append(file_path)
        # Extract cross-references (links to other markdown files)
        for link in file_data.get('links', []):
            link_target = link.get('url', '')
            # NOTE(review): '.md#' only matches a trailing '#' with no
            # fragment text; links like 'x.md#section' are NOT caught here
            # — confirm whether that is intended.
            if link_target.endswith('.md') or link_target.endswith('.md#'):
                # Normalize relative link targets against this file's directory.
                if link_target.startswith('./'):
                    link_target = str(file_path_obj.parent / link_target[2:])
                elif link_target.startswith('../'):
                    link_target = str(file_path_obj.parent.parent / link_target[3:])
                reference_map['cross_references'][file_path].append({
                    'target': link_target,
                    'text': link.get('text', ''),
                    'line': link.get('line', 0)
                })
    # Save reference mapping
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reference_map, f, indent=2, ensure_ascii=False)
    # Generate human-readable report next to the JSON output
    report_file = output_file.replace('.json', '.md')
    generate_markdown_report(reference_map, report_file)
    print(f"Reference mapping saved to: {output_file}")
    print(f"Human-readable report saved to: {report_file}")
    return reference_map
def generate_markdown_report(reference_map: Dict, output_file: str):
    """Write a human-readable Markdown report for *reference_map*.

    Produces three sections: files grouped by category (first 20 per
    category), the 50 most common headings (first 5 occurrences each),
    and per-file heading outlines for the first 30 files.
    """
    meta = reference_map['metadata']
    by_category = reference_map['by_category']
    by_file = reference_map['by_file']
    by_heading = reference_map['by_heading']
    with open(output_file, 'w', encoding='utf-8') as report:
        emit = report.write
        emit("# Markdown Reference Index\n\n")
        emit(f"**Generated**: {meta['generated_at']}\n")
        emit(f"**Total Files**: {meta['total_files']}\n\n")
        emit("---\n\n")
        # Section 1: files grouped by category.
        emit("## Files by Category\n\n")
        for cat in sorted(by_category.keys()):
            members = by_category[cat]
            emit(f"### {cat} ({len(members)} files)\n\n")
            for path in sorted(members)[:20]:
                entry = by_file[path]
                emit(f"- [{entry['title'] or path}](./{path}) - {entry['line_count']} lines\n")
            if len(members) > 20:
                emit(f" *... and {len(members) - 20} more files*\n")
            emit("\n")
        # Section 2: most common headings, ranked by occurrence count.
        emit("## Heading Index\n\n")
        emit("*Top 50 most common headings*\n\n")
        ranked = sorted(
            ((head, len(refs)) for head, refs in by_heading.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for head, count in ranked[:50]:
            occurrences = by_heading[head]
            emit(f"### {head} ({count} occurrences)\n\n")
            for occ in occurrences[:5]:
                emit(f"- Line {occ['line']}: [{occ['file']}](./{occ['file']}#{head.lower().replace(' ', '-')})\n")
            if len(occurrences) > 5:
                emit(f" *... and {len(occurrences) - 5} more occurrences*\n")
            emit("\n")
        # Section 3: per-file heading outlines.
        emit("## File Details\n\n")
        emit("*Files with headings and line numbers*\n\n")
        for path in sorted(by_file.keys())[:30]:
            entry = by_file[path]
            emit(f"### {path}\n\n")
            emit(f"**Title**: {entry['title'] or 'N/A'}\n")
            emit(f"**Lines**: {entry['line_count']}\n")
            emit(f"**Headings**: {len(entry['headings'])}\n\n")
            if entry['headings']:
                emit("**Headings**:\n")
                for head in entry['headings'][:10]:
                    pad = ' ' * (head['level'] - 1)
                    emit(f"{pad}- Line {head['line']}: {head['text']}\n")
                if len(entry['headings']) > 10:
                    emit(f" *... and {len(entry['headings']) - 10} more headings*\n")
            emit("\n")
if __name__ == '__main__':
    import sys

    # CLI: optional positional args override the default index and
    # output paths: generate-markdown-reference.py [index] [output].
    args = sys.argv[1:]
    index_file = args[0] if len(args) > 0 else 'docs/MARKDOWN_INDEX.json'
    output_file = args[1] if len(args) > 1 else 'docs/MARKDOWN_REFERENCE.json'
    reference_map = generate_reference_mapping(index_file, output_file)
    print("\nReference mapping generation complete!")