#!/usr/bin/env python3
"""
Content Inconsistency Checker

Compares related markdown files for inconsistencies in:
- Dates
- Status information
- Configuration values
- References to other files
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Set, Tuple
|
|
from datetime import datetime
|
|
|
|
class ContentInconsistencyChecker:
    """Compare related markdown files under ``root_dir`` for inconsistencies.

    Issue types collected in ``self.inconsistencies``:
      * ``old_date``           -- a date more than one year in the past
      * ``conflicting_status`` -- related status files reporting different statuses
      * ``broken_reference``   -- a relative markdown link to a missing file
      * ``too_many_ips``       -- one component referencing many distinct IPs
      * ``duplicate_intro``    -- two files with identical first 10 lines
    """

    # Directory names that are never scanned for markdown files.
    EXCLUDE_DIRS = {'.git', 'node_modules', '__pycache__', '.next', 'dist', 'build', 'venv'}

    def __init__(self, root_dir: str):
        # Resolve the root up front: _check_references resolves link targets to
        # absolute paths, and Path.relative_to() only succeeds when both sides
        # are anchored the same way. The original unresolved root made every
        # relative-link check raise ValueError and get silently skipped
        # whenever the script was handed a relative root or run from another CWD.
        self.root_dir = Path(root_dir).resolve()
        self.inconsistencies: List[Dict] = []
        self.file_contents: Dict[str, Dict] = {}

    def check(self) -> Dict:
        """Run all consistency checks and return the report dict."""
        print("🔍 Checking content inconsistencies...")

        # Load file contents
        self._load_files()

        # Check for inconsistencies
        print("\n📅 Checking date inconsistencies...")
        self._check_dates()

        print("\n📊 Checking status inconsistencies...")
        self._check_status()

        print("\n🔗 Checking cross-references...")
        self._check_references()

        print("\n⚙️ Checking configuration values...")
        self._check_config_values()

        print("\n📝 Checking duplicate content...")
        self._check_duplicate_content()

        return self._generate_report()

    def _load_files(self) -> None:
        """Load every markdown file under the root into ``self.file_contents``.

        Best effort by design: unreadable files (permissions, broken symlinks)
        are skipped rather than aborting the whole scan.
        """
        for md_file in self.root_dir.rglob('*.md'):
            if any(part in self.EXCLUDE_DIRS for part in md_file.parts):
                continue
            try:
                content = md_file.read_text(encoding='utf-8', errors='ignore')
            except OSError:
                continue  # narrow: only I/O failures are expected here
            rel_path = str(md_file.relative_to(self.root_dir))
            self.file_contents[rel_path] = {
                'content': content,
                'path': rel_path,
                'lines': content.split('\n'),
            }

    def _check_dates(self) -> None:
        """Flag dates more than one year in the past (severity: medium)."""
        date_patterns = [
            r'(\d{4}-\d{2}-\d{2})',          # YYYY-MM-DD
            r'(\d{1,2}/\d{1,2}/\d{4})',      # MM/DD/YYYY
            r'Date[:\s]+(\d{4}-\d{2}-\d{2})',
            r'Generated[:\s]+(\d{4}-\d{2}-\d{2})',
            r'Last Updated[:\s]+(\d{4}-\d{2}-\d{2})',
        ]

        # Group files by project/component; files outside any group are not
        # date-checked (preserves the original filtering behavior).
        project_files: Dict[str, List[str]] = defaultdict(list)
        for path in self.file_contents:
            if 'rpc-translator-138' in path:
                project_files['rpc-translator-138'].append(path)
            elif path.startswith('docs/'):
                project_files['docs'].append(path)
            elif path.startswith('reports/'):
                project_files['reports'].append(path)
            elif '/' not in path:  # top-level files
                project_files['root'].append(path)

        now = datetime.now()  # hoisted: one reference point for the whole scan
        for files in project_files.values():
            # Collect into a set: the generic YYYY-MM-DD pattern overlaps the
            # labeled "Date:" / "Generated:" patterns, which previously made
            # every labeled date be reported twice.
            dates_found: Set[Tuple[str, str]] = set()
            for file_path in files:
                content = self.file_contents[file_path]['content']
                for pattern in date_patterns:
                    for match in re.findall(pattern, content):
                        dates_found.add((file_path, match))

            for file_path, date_str in sorted(dates_found):
                # Only two shapes can match the patterns above.
                fmt = '%Y-%m-%d' if '-' in date_str else '%m/%d/%Y'
                try:
                    date_obj = datetime.strptime(date_str, fmt)
                except ValueError:
                    continue  # e.g. "2024-13-45": matched the regex, not a real date
                days_diff = (now - date_obj).days
                if days_diff > 365:
                    self.inconsistencies.append({
                        'type': 'old_date',
                        'file': file_path,
                        'issue': f'Date {date_str} is {days_diff} days old',
                        'severity': 'medium',
                    })

    def _check_status(self) -> None:
        """Flag groups of related status files whose statuses disagree."""
        status_patterns = [
            r'Status[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING|ACTIVE|INACTIVE)',
            r'\*\*Status\*\*[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING)',
        ]

        # Group files that look like status reports for the same base name.
        status_groups: Dict[str, List[str]] = defaultdict(list)
        for path in self.file_contents:
            filename = Path(path).name
            if any(tag in filename for tag in ('COMPLETE', 'STATUS', 'FINAL')):
                # Strip the status-ish suffix to get the shared base name.
                base = re.sub(r'_(COMPLETE|FINAL|STATUS).*', '', filename)
                base = re.sub(r'COMPLETE|FINAL|STATUS', '', base)
                status_groups[base].append(path)

        for base, files in status_groups.items():
            if len(files) < 2:
                continue  # a single file cannot conflict with itself
            statuses = []
            for file_path in files:
                content = self.file_contents[file_path]['content']
                for pattern in status_patterns:
                    statuses.extend(
                        (file_path, m)
                        for m in re.findall(pattern, content, re.IGNORECASE)
                    )
            if len({status for _, status in statuses}) > 1:
                self.inconsistencies.append({
                    'type': 'conflicting_status',
                    'files': files,
                    'issue': f'Multiple status files for {base} with different statuses',
                    'severity': 'high',
                })

    def _check_references(self) -> None:
        """Flag markdown links whose target file does not exist."""
        reference_pattern = r'\[([^\]]+)\]\(([^)]+)\)'

        for path, data in self.file_contents.items():
            for _link_text, link_path in re.findall(reference_pattern, data['content']):
                if link_path.startswith('http'):
                    continue  # external links are out of scope

                # Drop any "#anchor" fragment; only the file part is checked.
                file_part = link_path.split('#', 1)[0]
                if not file_part:
                    continue  # same-document anchor link, e.g. (#section)

                if file_part.startswith('/'):
                    # Treat absolute-style links as project-root-relative.
                    candidate = self.root_dir / file_part.lstrip('/')
                else:
                    # Relative links resolve against the referencing file's
                    # directory, anchored at root_dir — NOT the process CWD,
                    # which the original code resolved against by mistake.
                    candidate = (self.root_dir / Path(path).parent / file_part).resolve()
                    try:
                        candidate.relative_to(self.root_dir)
                    except ValueError:
                        continue  # path escapes the project root; skip

                if not candidate.exists():
                    self.inconsistencies.append({
                        'type': 'broken_reference',
                        'file': path,
                        'issue': f'Broken link to {link_path}',
                        'severity': 'medium',
                    })

    def _check_config_values(self) -> None:
        """Flag components that reference suspiciously many distinct IPs."""
        # Look for IP addresses and VMIDs (site convention: 192.168.11.x).
        ip_pattern = r'192\.168\.11\.(\d+)'
        vmid_pattern = r'VMID[:\s]+(\d+)'

        configs_by_component: Dict[str, Dict[str, Set[str]]] = defaultdict(
            lambda: defaultdict(set)
        )

        for path, data in self.file_contents.items():
            content = data['content']
            component = self._identify_component(path)  # hoisted out of the match loops
            for last_octet in re.findall(ip_pattern, content):
                configs_by_component[component]['ips'].add(f'192.168.11.{last_octet}')
            for vmid in re.findall(vmid_pattern, content, re.IGNORECASE):
                configs_by_component[component]['vmids'].add(vmid)

        for component, configs in configs_by_component.items():
            # Heuristic threshold: a single component normally uses few IPs.
            if len(configs['ips']) > 10:
                self.inconsistencies.append({
                    'type': 'too_many_ips',
                    'component': component,
                    'issue': f'Component {component} references {len(configs["ips"])} different IPs',
                    'severity': 'low',
                })

    def _check_duplicate_content(self) -> None:
        """Flag pairs of files whose first 10 lines are identical."""
        seen_intros: Dict[str, str] = {}

        for path, data in self.file_contents.items():
            # Key on the intro text itself rather than hash(): a hash
            # collision in the original produced a false duplicate report.
            intro = '\n'.join(data['lines'][:10])
            if intro in seen_intros:
                self.inconsistencies.append({
                    'type': 'duplicate_intro',
                    'files': [seen_intros[intro], path],
                    'issue': 'Files have identical first 10 lines',
                    'severity': 'low',
                })
            else:
                seen_intros[intro] = path

    def _identify_component(self, path: str) -> str:
        """Map a file path to the component it documents ('other' if unknown)."""
        lowered = path.lower()
        if 'rpc-translator' in path:
            return 'rpc-translator-138'
        if 'besu' in lowered:
            return 'besu'
        if 'dbis' in lowered:
            return 'dbis'
        if 'firefly' in lowered:
            return 'firefly'
        return 'other'

    def _generate_report(self) -> Dict:
        """Build the report dict: summary counts plus the raw issue list."""
        by_type: Dict[str, int] = defaultdict(int)
        by_severity: Dict[str, int] = defaultdict(int)
        for inc in self.inconsistencies:
            by_type[inc['type']] += 1
            by_severity[inc['severity']] += 1

        return {
            'summary': {
                'total_inconsistencies': len(self.inconsistencies),
                # Plain dicts serialize cleanly and index like the originals.
                'by_type': dict(by_type),
                'by_severity': dict(by_severity),
            },
            'inconsistencies': list(self.inconsistencies),
        }
|
|
|
|
def main():
    """Run the checker over the repository root, save a JSON report, print a summary.

    Returns the report dict so the function is also usable programmatically.
    """
    # The script is expected to live one level below the project root
    # (e.g. scripts/check.py), hence parent.parent.
    root_dir = Path(__file__).parent.parent
    checker = ContentInconsistencyChecker(root_dir)
    report = checker.check()

    # Save report. Explicit UTF-8: the report may contain non-ASCII text and
    # the platform default codec is not guaranteed to handle it.
    # default=str covers any non-JSON-native values (e.g. Path objects).
    json_file = root_dir / 'CONTENT_INCONSISTENCIES.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\n✅ Report saved to: {json_file}")

    # Print summary
    print("\n📊 Summary:")
    print(f" Total inconsistencies: {report['summary']['total_inconsistencies']}")
    print(f" By type: {dict(report['summary']['by_type'])}")
    print(f" By severity: {dict(report['summary']['by_severity'])}")

    return report
|
|
|
|
# Entry-point guard: run the checker only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|