- Added generated index files and report directories to .gitignore to prevent unnecessary tracking of transient files. - Updated README links to reflect new documentation paths for better navigation. - Improved documentation organization by ensuring all links point to the correct locations, enhancing user experience and accessibility.
214 lines
7.7 KiB
Python
214 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Markdown Analysis Script
|
|
Analyzes all Markdown files for duplicates and generates an index mapping content to files and line numbers.
|
|
"""
|
|
|
|
import os
|
|
import hashlib
|
|
import re
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple, Set
|
|
import json
|
|
|
|
class MarkdownAnalyzer:
    """Scan a directory tree for Markdown files and index their content.

    Builds three views of the tree:
      * ``duplicates``     -- groups of byte-identical files (keyed by MD5).
      * ``content_index``  -- per-file metadata: title, headings, links,
                              code-fence count, size and line count.
      * ``file_structure`` -- files bucketed by top-level directory.
    """

    # Directory names excluded from the scan wherever they appear in a path.
    IGNORED_PARTS = ('node_modules', '.git', 'dist', 'build', '.next')

    # [text](url) inline links — compiled once instead of per line.
    LINK_RE = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')

    # One-to-six '#' characters followed by heading text.
    HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')

    def __init__(self, root_dir: str = '.'):
        self.root_dir = Path(root_dir)
        self.md_files: List[Path] = []
        self.content_index: Dict[str, Dict] = {}
        self.duplicates: Dict[str, List[str]] = defaultdict(list)
        self.file_structure: Dict[str, List[str]] = defaultdict(list)

    def find_all_markdown(self):
        """Collect every *.md file under root, skipping ignored directories."""
        for md_file in self.root_dir.rglob('*.md'):
            # Skip node_modules, .git, and other generated/vendored trees.
            if any(part in self.IGNORED_PARTS for part in md_file.parts):
                continue
            self.md_files.append(md_file)

    def analyze_duplicates(self):
        """Group byte-identical files into self.duplicates by content hash."""
        content_hashes = defaultdict(list)

        for md_file in self.md_files:
            try:
                with open(md_file, 'rb') as f:
                    content = f.read()
            except Exception as e:
                # Best-effort: report and keep scanning the remaining files.
                print(f"Error reading {md_file}: {e}")
                continue
            # MD5 is used only as a grouping key, not for security.
            content_hash = hashlib.md5(content).hexdigest()
            rel_path = str(md_file.relative_to(self.root_dir))
            content_hashes[content_hash].append(rel_path)

        # Keep only the hashes shared by two or more files.
        for content_hash, files in content_hashes.items():
            if len(files) > 1:
                self.duplicates[content_hash] = files

    def index_content(self):
        """Index each file: title (first H1), headings, links, code fences."""
        for md_file in self.md_files:
            rel_path = str(md_file.relative_to(self.root_dir))

            try:
                with open(md_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                title = None       # text of the first H1, if any
                headings = []      # [{'level', 'text', 'line'}, ...]
                fence_lines = []   # every ``` delimiter line seen
                links = []         # [{'text', 'url', 'line'}, ...]

                for line_num, line in enumerate(lines, 1):
                    stripped = line.strip()

                    # Title = first level-1 heading encountered.
                    if not title and stripped.startswith('# '):
                        title = stripped[2:].strip()

                    heading_match = self.HEADING_RE.match(stripped)
                    if heading_match:
                        headings.append({
                            'level': len(heading_match.group(1)),
                            'text': heading_match.group(2).strip(),
                            'line': line_num,
                        })

                    # NOTE(review): this counts fence *delimiter* lines, so a
                    # complete fenced block contributes 2 to 'code_blocks'.
                    # Preserved as-is since downstream consumers may rely on it.
                    if stripped.startswith('```'):
                        fence_lines.append({'line': line_num, 'type': 'code_block'})

                    for match in self.LINK_RE.finditer(line):
                        links.append({
                            'text': match.group(1),
                            'url': match.group(2),
                            'line': line_num,
                        })

                self.content_index[rel_path] = {
                    'path': rel_path,
                    'title': title,
                    'line_count': len(lines),
                    'headings': headings,
                    'code_blocks': len(fence_lines),
                    'links': links,
                    'size_bytes': md_file.stat().st_size,
                }

            except Exception as e:
                print(f"Error indexing {md_file}: {e}")

    def categorize_files(self):
        """Bucket files into self.file_structure by top-level directory."""
        for md_file in self.md_files:
            rel = md_file.relative_to(self.root_dir)
            rel_path = str(rel)
            # Use pathlib's .parts instead of splitting the string on '/':
            # str(Path) uses the OS separator, so a '/' split misclassified
            # every nested file on Windows.
            parts = rel.parts

            if len(parts) == 1:
                category = 'root'
            elif parts[0] == 'docs':
                # Files nested under docs/<sub>/ get a per-subdirectory
                # bucket; files directly in docs/ now fall under plain
                # 'docs' (previously docs/README.md became its own
                # 'docs/README.md' category).
                category = f"docs/{parts[1]}" if len(parts) > 2 else 'docs'
            elif parts[0] in ('api', 'portal', 'scripts', 'crossplane-provider-proxmox'):
                category = parts[0]
            else:
                category = 'other'

            self.file_structure[category].append(rel_path)

    def generate_report(self) -> Dict:
        """Return the full analysis as a JSON-serializable dict."""
        return {
            'total_files': len(self.md_files),
            'unique_files': len(self.content_index),
            'duplicate_groups': len(self.duplicates),
            'duplicates': dict(self.duplicates),
            'categories': {k: len(v) for k, v in self.file_structure.items()},
            'index': self.content_index,
        }

    def find_similar_content(self) -> Dict[str, List[str]]:
        """Map normalized titles to files sharing them (potential duplicates).

        Only titles shared by two or more files are returned. Requires
        index_content() to have run first.
        """
        similar = defaultdict(list)

        for rel_path, data in self.content_index.items():
            if data['title']:
                # Case-insensitive comparison catches retitled copies.
                similar[data['title'].lower().strip()].append(rel_path)

        return {k: v for k, v in similar.items() if len(v) > 1}
|
|
|
|
def main():
    """Run the full Markdown analysis over the current directory.

    Prints a human-readable summary (duplicates, similar titles, category
    counts) and writes the detailed JSON index to docs/MARKDOWN_INDEX.json.

    Returns:
        (analyzer, report): the MarkdownAnalyzer instance and the report
        dict, so callers and interactive sessions can inspect the raw data.
    """
    analyzer = MarkdownAnalyzer('.')

    print("Finding all Markdown files...")
    analyzer.find_all_markdown()
    print(f"Found {len(analyzer.md_files)} Markdown files\n")

    print("Analyzing duplicates...")
    analyzer.analyze_duplicates()
    print(f"Found {len(analyzer.duplicates)} duplicate groups\n")

    print("Indexing content...")
    analyzer.index_content()
    print(f"Indexed {len(analyzer.content_index)} files\n")

    print("Categorizing files...")
    analyzer.categorize_files()

    print("Finding similar content...")
    similar = analyzer.find_similar_content()

    # Generate report
    report = analyzer.generate_report()

    # Print summary
    print("\n" + "=" * 60)
    print("MARKDOWN ANALYSIS SUMMARY")
    print("=" * 60)
    print(f"Total Markdown files: {report['total_files']}")
    print(f"Unique files: {report['unique_files']}")
    print(f"Duplicate groups: {report['duplicate_groups']}")

    if report['duplicate_groups'] > 0:
        print("\nDuplicate files:")
        # Cap output at the first 10 groups to keep the summary readable.
        for hash_val, files in list(report['duplicates'].items())[:10]:
            print(f"\n  Hash: {hash_val[:16]}... ({len(files)} files)")
            for f in files:
                print(f"    - {f}")

    print(f"\nSimilar titles (potential duplicates): {len(similar)}")
    for title, files in list(similar.items())[:10]:
        print(f"\n  '{title}':")
        for f in files:
            print(f"    - {f}")

    print("\nFiles by category:")
    for category, count in sorted(report['categories'].items()):
        print(f"  {category}: {count} files")

    # Save detailed report. Ensure the output directory exists first:
    # previously this raised FileNotFoundError on a checkout without docs/.
    output_file = 'docs/MARKDOWN_INDEX.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nDetailed index saved to: {output_file}")

    return analyzer, report
|
|
|
|
# Script entry point: keep the analyzer and report bound at module level so
# an interactive run (e.g. `python -i`) can inspect them after completion.
if __name__ == '__main__':
    analyzer, report = main()
|
|
|