#!/usr/bin/env python3
"""
Content Inconsistency Checker

Compares related markdown files for inconsistencies in:
- Dates
- Status information
- Configuration values
- References to other files
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Set, Tuple
|
|
from datetime import datetime
|
|
|
|
class ContentInconsistencyChecker:
    """Compare related markdown files under ``root_dir`` for inconsistencies.

    Issue types collected in ``self.inconsistencies``:
      * ``old_date``           -- a date more than one year in the past
      * ``conflicting_status`` -- related status files reporting different statuses
      * ``broken_reference``   -- a relative markdown link to a missing file
      * ``too_many_ips``       -- one component referencing many distinct IPs
      * ``duplicate_intro``    -- two files with identical first 10 lines
    """

    # Directory names that are never scanned for markdown files.
    EXCLUDE_DIRS = {'.git', 'node_modules', '__pycache__', '.next', 'dist', 'build', 'venv'}

    def __init__(self, root_dir: str):
        # Resolve the root up front: _check_references resolves link targets to
        # absolute paths, and Path.relative_to() only succeeds when both sides
        # are anchored the same way. The original unresolved root made every
        # relative-link check raise ValueError and get silently skipped
        # whenever the script was handed a relative root or run from another CWD.
        self.root_dir = Path(root_dir).resolve()
        self.inconsistencies: List[Dict] = []
        self.file_contents: Dict[str, Dict] = {}

    def check(self) -> Dict:
        """Run all consistency checks and return the report dict."""
        print("🔍 Checking content inconsistencies...")

        # Load file contents
        self._load_files()

        # Check for inconsistencies
        print("\n📅 Checking date inconsistencies...")
        self._check_dates()

        print("\n📊 Checking status inconsistencies...")
        self._check_status()

        print("\n🔗 Checking cross-references...")
        self._check_references()

        print("\n⚙️ Checking configuration values...")
        self._check_config_values()

        print("\n📝 Checking duplicate content...")
        self._check_duplicate_content()

        return self._generate_report()

    def _load_files(self) -> None:
        """Load every markdown file under the root into ``self.file_contents``.

        Best effort by design: unreadable files (permissions, broken symlinks)
        are skipped rather than aborting the whole scan.
        """
        for md_file in self.root_dir.rglob('*.md'):
            if any(part in self.EXCLUDE_DIRS for part in md_file.parts):
                continue
            try:
                content = md_file.read_text(encoding='utf-8', errors='ignore')
            except OSError:
                continue  # narrow: only I/O failures are expected here
            rel_path = str(md_file.relative_to(self.root_dir))
            self.file_contents[rel_path] = {
                'content': content,
                'path': rel_path,
                'lines': content.split('\n'),
            }

    def _check_dates(self) -> None:
        """Flag dates more than one year in the past (severity: medium)."""
        date_patterns = [
            r'(\d{4}-\d{2}-\d{2})',          # YYYY-MM-DD
            r'(\d{1,2}/\d{1,2}/\d{4})',      # MM/DD/YYYY
            r'Date[:\s]+(\d{4}-\d{2}-\d{2})',
            r'Generated[:\s]+(\d{4}-\d{2}-\d{2})',
            r'Last Updated[:\s]+(\d{4}-\d{2}-\d{2})',
        ]

        # Group files by project/component; files outside any group are not
        # date-checked (preserves the original filtering behavior).
        project_files: Dict[str, List[str]] = defaultdict(list)
        for path in self.file_contents:
            if 'rpc-translator-138' in path:
                project_files['rpc-translator-138'].append(path)
            elif path.startswith('docs/'):
                project_files['docs'].append(path)
            elif path.startswith('reports/'):
                project_files['reports'].append(path)
            elif '/' not in path:  # top-level files
                project_files['root'].append(path)

        now = datetime.now()  # hoisted: one reference point for the whole scan
        for files in project_files.values():
            # Collect into a set: the generic YYYY-MM-DD pattern overlaps the
            # labeled "Date:" / "Generated:" patterns, which previously made
            # every labeled date be reported twice.
            dates_found: Set[Tuple[str, str]] = set()
            for file_path in files:
                content = self.file_contents[file_path]['content']
                for pattern in date_patterns:
                    for match in re.findall(pattern, content):
                        dates_found.add((file_path, match))

            for file_path, date_str in sorted(dates_found):
                # Only two shapes can match the patterns above.
                fmt = '%Y-%m-%d' if '-' in date_str else '%m/%d/%Y'
                try:
                    date_obj = datetime.strptime(date_str, fmt)
                except ValueError:
                    continue  # e.g. "2024-13-45": matched the regex, not a real date
                days_diff = (now - date_obj).days
                if days_diff > 365:
                    self.inconsistencies.append({
                        'type': 'old_date',
                        'file': file_path,
                        'issue': f'Date {date_str} is {days_diff} days old',
                        'severity': 'medium',
                    })

    def _check_status(self) -> None:
        """Flag groups of related status files whose statuses disagree."""
        status_patterns = [
            r'Status[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING|ACTIVE|INACTIVE)',
            r'\*\*Status\*\*[:\s]+([✅❌🔄⚠️]+|COMPLETE|INCOMPLETE|PENDING)',
        ]

        # Group files that look like status reports for the same base name.
        status_groups: Dict[str, List[str]] = defaultdict(list)
        for path in self.file_contents:
            filename = Path(path).name
            if any(tag in filename for tag in ('COMPLETE', 'STATUS', 'FINAL')):
                # Strip the status-ish suffix to get the shared base name.
                base = re.sub(r'_(COMPLETE|FINAL|STATUS).*', '', filename)
                base = re.sub(r'COMPLETE|FINAL|STATUS', '', base)
                status_groups[base].append(path)

        for base, files in status_groups.items():
            if len(files) < 2:
                continue  # a single file cannot conflict with itself
            statuses = []
            for file_path in files:
                content = self.file_contents[file_path]['content']
                for pattern in status_patterns:
                    statuses.extend(
                        (file_path, m)
                        for m in re.findall(pattern, content, re.IGNORECASE)
                    )
            if len({status for _, status in statuses}) > 1:
                self.inconsistencies.append({
                    'type': 'conflicting_status',
                    'files': files,
                    'issue': f'Multiple status files for {base} with different statuses',
                    'severity': 'high',
                })

    def _check_references(self) -> None:
        """Flag markdown links whose target file does not exist."""
        reference_pattern = r'\[([^\]]+)\]\(([^)]+)\)'

        for path, data in self.file_contents.items():
            for _link_text, link_path in re.findall(reference_pattern, data['content']):
                if link_path.startswith('http'):
                    continue  # external links are out of scope

                # Drop any "#anchor" fragment; only the file part is checked.
                file_part = link_path.split('#', 1)[0]
                if not file_part:
                    continue  # same-document anchor link, e.g. (#section)

                if file_part.startswith('/'):
                    # Treat absolute-style links as project-root-relative.
                    candidate = self.root_dir / file_part.lstrip('/')
                else:
                    # Relative links resolve against the referencing file's
                    # directory, anchored at root_dir — NOT the process CWD,
                    # which the original code resolved against by mistake.
                    candidate = (self.root_dir / Path(path).parent / file_part).resolve()
                    try:
                        candidate.relative_to(self.root_dir)
                    except ValueError:
                        continue  # path escapes the project root; skip

                if not candidate.exists():
                    self.inconsistencies.append({
                        'type': 'broken_reference',
                        'file': path,
                        'issue': f'Broken link to {link_path}',
                        'severity': 'medium',
                    })

    def _check_config_values(self) -> None:
        """Flag components that reference suspiciously many distinct IPs."""
        # Look for IP addresses and VMIDs (site convention: 192.168.11.x).
        ip_pattern = r'192\.168\.11\.(\d+)'
        vmid_pattern = r'VMID[:\s]+(\d+)'

        configs_by_component: Dict[str, Dict[str, Set[str]]] = defaultdict(
            lambda: defaultdict(set)
        )

        for path, data in self.file_contents.items():
            content = data['content']
            component = self._identify_component(path)  # hoisted out of the match loops
            for last_octet in re.findall(ip_pattern, content):
                configs_by_component[component]['ips'].add(f'192.168.11.{last_octet}')
            for vmid in re.findall(vmid_pattern, content, re.IGNORECASE):
                configs_by_component[component]['vmids'].add(vmid)

        for component, configs in configs_by_component.items():
            # Heuristic threshold: a single component normally uses few IPs.
            if len(configs['ips']) > 10:
                self.inconsistencies.append({
                    'type': 'too_many_ips',
                    'component': component,
                    'issue': f'Component {component} references {len(configs["ips"])} different IPs',
                    'severity': 'low',
                })

    def _check_duplicate_content(self) -> None:
        """Flag pairs of files whose first 10 lines are identical."""
        seen_intros: Dict[str, str] = {}

        for path, data in self.file_contents.items():
            # Key on the intro text itself rather than hash(): a hash
            # collision in the original produced a false duplicate report.
            intro = '\n'.join(data['lines'][:10])
            if intro in seen_intros:
                self.inconsistencies.append({
                    'type': 'duplicate_intro',
                    'files': [seen_intros[intro], path],
                    'issue': 'Files have identical first 10 lines',
                    'severity': 'low',
                })
            else:
                seen_intros[intro] = path

    def _identify_component(self, path: str) -> str:
        """Map a file path to the component it documents ('other' if unknown)."""
        lowered = path.lower()
        if 'rpc-translator' in path:
            return 'rpc-translator-138'
        if 'besu' in lowered:
            return 'besu'
        if 'dbis' in lowered:
            return 'dbis'
        if 'firefly' in lowered:
            return 'firefly'
        return 'other'

    def _generate_report(self) -> Dict:
        """Build the report dict: summary counts plus the raw issue list."""
        by_type: Dict[str, int] = defaultdict(int)
        by_severity: Dict[str, int] = defaultdict(int)
        for inc in self.inconsistencies:
            by_type[inc['type']] += 1
            by_severity[inc['severity']] += 1

        return {
            'summary': {
                'total_inconsistencies': len(self.inconsistencies),
                # Plain dicts serialize cleanly and index like the originals.
                'by_type': dict(by_type),
                'by_severity': dict(by_severity),
            },
            'inconsistencies': list(self.inconsistencies),
        }
|
|
|
|
def main():
    """Run the checker over the repository root, save a JSON report, print a summary.

    Returns the report dict so the function is also usable programmatically.
    """
    # The script is expected to live one level below the project root
    # (e.g. scripts/check.py), hence parent.parent.
    root_dir = Path(__file__).parent.parent
    checker = ContentInconsistencyChecker(root_dir)
    report = checker.check()

    # Save report. Explicit UTF-8: the report may contain non-ASCII text and
    # the platform default codec is not guaranteed to handle it.
    # default=str covers any non-JSON-native values (e.g. Path objects).
    json_file = root_dir / 'CONTENT_INCONSISTENCIES.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\n✅ Report saved to: {json_file}")

    # Print summary
    print("\n📊 Summary:")
    print(f" Total inconsistencies: {report['summary']['total_inconsistencies']}")
    print(f" By type: {dict(report['summary']['by_type'])}")
    print(f" By severity: {dict(report['summary']['by_severity'])}")

    return report
|
|
|
|
# Entry-point guard: run the checker only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|