#!/usr/bin/env python3
"""
Content Inconsistency Checker
Compares related markdown files for inconsistencies in:
- Dates
- Status information
- Configuration values
- References to other files
"""

import os
import re
import sys
import json
from pathlib import Path
from collections import Counter, defaultdict
from typing import Dict, List, Optional, Set, Tuple, Union
from datetime import datetime


# The labelled forms ("Date: ...", "Generated: ...", "Last Updated: ...") are
# strict subsets of the plain YYYY-MM-DD pattern, so two patterns are enough;
# keeping the labelled ones would only report every labelled date twice.
DATE_PATTERNS = (
    re.compile(r'(\d{4}-\d{2}-\d{2})'),        # YYYY-MM-DD
    re.compile(r'(\d{1,2}/\d{1,2}/\d{4})'),    # MM/DD/YYYY
)

# NOTE(review): the first run of characters looks like mis-encoded status
# emoji. It is kept verbatim so existing matches are unchanged, and the
# presumably intended code points (check mark, cross mark, arrows, warning
# sign + variation selector) are accepted as well -- confirm the file encoding.
_STATUS_MARKS = 'āœ…āŒšŸ”„āš ļø' + '\u2705\u274c\U0001F504\u26a0\ufe0f'

STATUS_PATTERNS = (
    re.compile(
        r'Status[:\s]+([' + _STATUS_MARKS
        + r']+|COMPLETE|INCOMPLETE|PENDING|ACTIVE|INACTIVE)',
        re.IGNORECASE,
    ),
    re.compile(
        r'\*\*Status\*\*[:\s]+([' + _STATUS_MARKS
        + r']+|COMPLETE|INCOMPLETE|PENDING)',
        re.IGNORECASE,
    ),
)

STATUS_FILENAME_MARKERS = ('COMPLETE', 'STATUS', 'FINAL')

LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# Anything starting with "scheme:" (http:, https:, mailto:, ftp:, ...) is external.
URI_SCHEME_PATTERN = re.compile(r'[a-zA-Z][a-zA-Z0-9+.-]*:')
# Optional markdown link title: [text](target "Title") or [text](target 'Title')
LINK_TITLE_PATTERN = re.compile(r'''\s+(?:"[^"]*"|'[^']*')\s*$''')

IP_PATTERN = re.compile(r'192\.168\.11\.(\d+)')
VMID_PATTERN = re.compile(r'VMID[:\s]+(\d+)', re.IGNORECASE)


class ContentInconsistencyChecker:
    """Scan the markdown files below a root directory for inconsistencies.

    Findings are appended to ``self.inconsistencies`` as dicts. Every finding
    has ``type``, ``issue`` and ``severity`` keys, plus ``file``, ``files`` or
    ``component`` depending on the check that produced it.
    """

    # Directory names that are never scanned (matched against path components
    # relative to the root directory).
    EXCLUDE_DIRS = frozenset(
        {'.git', 'node_modules', '__pycache__', '.next', 'dist', 'build', 'venv'}
    )
    # A date older than this many days is reported as ``old_date``.
    MAX_DATE_AGE_DAYS = 365
    # A component referencing more distinct IPs than this is reported.
    MAX_IPS_PER_COMPONENT = 10

    def __init__(self, root_dir: Union[str, Path]):
        """Remember the directory to scan; no file system access happens here."""
        self.root_dir = Path(root_dir)
        self.inconsistencies = []
        # Maps a root-relative POSIX path to {'content', 'path', 'lines'}.
        self.file_contents = {}

    def check(self):
        """Run all consistency checks and return the report dict."""
        print("šŸ” Checking content inconsistencies...")

        # Load file contents
        self._load_files()

        # Check for inconsistencies
        print("\nšŸ“… Checking date inconsistencies...")
        self._check_dates()

        print("\nšŸ“Š Checking status inconsistencies...")
        self._check_status()

        print("\nšŸ”— Checking cross-references...")
        self._check_references()

        print("\nāš™ļø Checking configuration values...")
        self._check_config_values()

        print("\nšŸ“ Checking duplicate content...")
        self._check_duplicate_content()

        return self._generate_report()

    def _load_files(self):
        """Read every ``*.md`` file below the root into ``self.file_contents``.

        Files are visited in sorted order so repeated runs produce identical
        reports. Unreadable files are skipped with a warning on stderr.
        """
        for md_file in sorted(self.root_dir.rglob('*.md')):
            rel = md_file.relative_to(self.root_dir)
            # Only look at components below the root: a checkout that itself
            # lives under e.g. ".../build/" must not exclude everything.
            if any(part in self.EXCLUDE_DIRS for part in rel.parts):
                continue
            if not md_file.is_file():
                continue
            try:
                content = md_file.read_text(encoding='utf-8', errors='ignore')
            except OSError as exc:
                print(f"Warning: could not read {md_file}: {exc}", file=sys.stderr)
                continue
            # POSIX separators keep the 'docs/' style prefix tests below
            # working on every platform.
            rel_path = rel.as_posix()
            self.file_contents[rel_path] = {
                'content': content,
                'path': rel_path,
                'lines': content.split('\n'),
            }

    @staticmethod
    def _project_for(path: str) -> Optional[str]:
        """Return the date-check group for *path*, or None if it has none."""
        if 'rpc-translator-138' in path:
            return 'rpc-translator-138'
        if path.startswith('docs/'):
            return 'docs'
        if path.startswith('reports/'):
            return 'reports'
        if '/' not in path:
            return 'root'
        return None

    @staticmethod
    def _parse_date(date_str: str) -> Optional[datetime]:
        """Parse a YYYY-MM-DD or MM/DD/YYYY string; None if it is not a date."""
        fmt = '%Y-%m-%d' if '-' in date_str else '%m/%d/%Y'
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # The regex also matches impossible dates such as 2023-13-45.
            return None

    def _check_dates(self):
        """Report dates older than ``MAX_DATE_AGE_DAYS``.

        Only files that belong to a known group (see ``_project_for``) are
        inspected. Each distinct date string is reported once per file.
        """
        # Group files by project/component
        project_files = defaultdict(list)
        for path in self.file_contents:
            project = self._project_for(path)
            if project is not None:
                project_files[project].append(path)

        now = datetime.now()
        for files in project_files.values():
            for file_path in files:
                content = self.file_contents[file_path]['content']
                # dict.fromkeys de-duplicates while keeping first-seen order.
                date_strings = dict.fromkeys(
                    match
                    for pattern in DATE_PATTERNS
                    for match in pattern.findall(content)
                )
                for date_str in date_strings:
                    date_obj = self._parse_date(date_str)
                    if date_obj is None:
                        continue
                    days_diff = (now - date_obj).days
                    if days_diff > self.MAX_DATE_AGE_DAYS:
                        self.inconsistencies.append({
                            'type': 'old_date',
                            'file': file_path,
                            'issue': f'Date {date_str} is {days_diff} days old',
                            'severity': 'medium'
                        })

    def _check_status(self):
        """Report groups of status files that declare different statuses.

        Files whose name contains COMPLETE, STATUS or FINAL are grouped by the
        remainder of the name. Status values are compared case-insensitively,
        matching the case-insensitive way they are extracted.
        """
        # Group related status files
        status_groups = defaultdict(list)
        for path in self.file_contents:
            filename = Path(path).name
            if any(marker in filename for marker in STATUS_FILENAME_MARKERS):
                # Extract base name
                base = re.sub(r'_(COMPLETE|FINAL|STATUS).*', '', filename)
                base = re.sub(r'COMPLETE|FINAL|STATUS', '', base)
                status_groups[base].append(path)

        # Check for conflicting statuses
        for base, files in status_groups.items():
            if len(files) < 2:
                continue
            distinct = {
                match.upper()
                for file_path in files
                for pattern in STATUS_PATTERNS
                for match in pattern.findall(self.file_contents[file_path]['content'])
            }
            if len(distinct) > 1:
                self.inconsistencies.append({
                    'type': 'conflicting_status',
                    'files': files,
                    'issue': f'Multiple status files for {base} with different statuses',
                    'severity': 'high'
                })

    def _check_references(self):
        """Report markdown links whose local target does not exist.

        External links (anything with a URI scheme, or protocol-relative
        ``//host`` links), same-document anchors, and targets that resolve
        outside the root directory are skipped.
        """
        # Resolve the root once so comparisons are not confused by symlinks
        # or by a relative root directory.
        root = self.root_dir.resolve()

        for path, data in self.file_contents.items():
            source_dir = Path(path).parent
            for _link_text, link_path in LINK_PATTERN.findall(data['content']):
                target = LINK_TITLE_PATTERN.sub('', link_path.strip())

                # Skip external links
                if URI_SCHEME_PATTERN.match(target) or target.startswith('//'):
                    continue

                # Drop the anchor; an empty remainder is a same-document link.
                file_part = target.split('#', 1)[0]
                if not file_part:
                    continue

                if file_part.startswith('/'):
                    # Leading slash means "relative to the project root".
                    full_path = root / file_part.lstrip('/')
                else:
                    # Anchor the link at the linking file's directory *inside
                    # the root*; resolving the bare relative path would use
                    # the process working directory instead.
                    full_path = (root / source_dir / file_part).resolve()
                    try:
                        full_path.relative_to(root)
                    except ValueError:
                        # Path is outside project root, skip
                        continue

                if not full_path.exists():
                    self.inconsistencies.append({
                        'type': 'broken_reference',
                        'file': path,
                        'issue': f'Broken link to {link_path}',
                        'severity': 'medium'
                    })

    def _check_config_values(self):
        """Report components that reference suspiciously many distinct IPs.

        VMIDs are collected per component as well, but no check currently
        evaluates them.
        """
        configs_by_component = defaultdict(lambda: defaultdict(set))

        for path, data in self.file_contents.items():
            content = data['content']
            component = self._identify_component(path)

            last_octets = IP_PATTERN.findall(content)
            if last_octets:
                configs_by_component[component]['ips'].update(
                    f'192.168.11.{octet}' for octet in last_octets
                )

            vmids = VMID_PATTERN.findall(content)
            if vmids:
                configs_by_component[component]['vmids'].update(vmids)

        # Check for inconsistencies (same component, different values)
        for component, configs in configs_by_component.items():
            ip_count = len(configs['ips'])
            if ip_count > self.MAX_IPS_PER_COMPONENT:
                # Too many IPs might indicate inconsistency
                self.inconsistencies.append({
                    'type': 'too_many_ips',
                    'component': component,
                    'issue': f'Component {component} references {ip_count} different IPs',
                    'severity': 'low'
                })

    def _check_duplicate_content(self):
        """Report files whose first 10 lines equal those of an earlier file."""
        # Keyed on the text itself rather than hash(text), so a hash
        # collision can never be reported as a duplicate.
        first_seen = {}
        for path, data in self.file_contents.items():
            intro = '\n'.join(data['lines'][:10])
            if intro in first_seen:
                self.inconsistencies.append({
                    'type': 'duplicate_intro',
                    'files': [first_seen[intro], path],
                    'issue': 'Files have identical first 10 lines',
                    'severity': 'low'
                })
            else:
                first_seen[intro] = path

    def _identify_component(self, path: str) -> str:
        """Identify component from file path"""
        # The 'rpc-translator' test is case-sensitive; the others are not.
        if 'rpc-translator' in path:
            return 'rpc-translator-138'
        lowered = path.lower()
        for name in ('besu', 'dbis', 'firefly'):
            if name in lowered:
                return name
        return 'other'

    def _generate_report(self) -> Dict:
        """Build the report: a summary with counts plus the list of findings."""
        by_type = Counter(inc['type'] for inc in self.inconsistencies)
        by_severity = Counter(inc['severity'] for inc in self.inconsistencies)
        return {
            'summary': {
                'total_inconsistencies': len(self.inconsistencies),
                'by_type': dict(by_type),
                'by_severity': dict(by_severity),
            },
            'inconsistencies': list(self.inconsistencies),
        }


def main():
    """Check the directory above this script's directory and write the JSON report there."""
    root_dir = Path(__file__).parent.parent

    checker = ContentInconsistencyChecker(root_dir)
    report = checker.check()

    # Save report
    json_file = root_dir / 'CONTENT_INCONSISTENCIES.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)

    print(f"\nāœ… Report saved to: {json_file}")

    # Print summary
    print("\nšŸ“Š Summary:")
    print(f" Total inconsistencies: {report['summary']['total_inconsistencies']}")
    print(f" By type: {dict(report['summary']['by_type'])}")
    print(f" By severity: {dict(report['summary']['by_severity'])}")

    return report


if __name__ == '__main__':
    main()