263 lines
8.6 KiB
Python
263 lines
8.6 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
DBIS Cross-Reference Verification Script
|
||
|
|
Automated link verification and cross-reference checking
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
from collections import defaultdict
|
||
|
|
|
||
|
|
# Colors for terminal output
|
||
|
|
class Colors:
    """ANSI escape sequences used to colorize the script's terminal output."""

    # Foreground colors.
    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'

    # Reset sequence ("No Color"); emitted after colored text to restore defaults.
    NC = '\033[0m'
|
||
|
|
|
||
|
|
def find_markdown_files(root_dir):
    """Collect the paths of all ``.md`` files located beneath *root_dir*.

    The ``.git``, ``node_modules`` and ``__pycache__`` directories are never
    descended into. Paths are returned in the order ``os.walk`` yields them,
    each one joined onto the directory it was found in.
    """
    skipped_dirs = ['.git', 'node_modules', '__pycache__']
    found = []
    for current_dir, subdirs, filenames in os.walk(root_dir):
        # Prune in place: os.walk consults this same list to decide where to go next.
        subdirs[:] = [name for name in subdirs if name not in skipped_dirs]
        found.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.md')
        )
    return found
|
||
|
|
|
||
|
|
def extract_links(content, file_path):
    """Extract every inline markdown link ``[text](target)`` from *content*.

    Args:
        content: Full text of a markdown document.
        file_path: Path of the document; stored with each link as its origin.

    Returns:
        A list of dicts, one per link in document order, with the keys
        ``'text'`` (link label), ``'path'`` (link destination),
        ``'line'`` (1-based line on which the link starts) and
        ``'source_file'`` (*file_path*).

    The destination is cleaned up before it is stored: surrounding
    whitespace is dropped, an optional quoted title
    (``[text](path "Title")``) is removed, and the ``<...>`` wrapper that
    markdown allows around destinations is stripped. Without this, such
    links would later be looked up under a file name that includes the title.
    """
    links = []
    # [text](path) or [text](path#anchor)
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    # destination followed by a "double" or 'single' quoted title
    title_pattern = re.compile(r'''^(<[^>]+>|\S+)\s+(?:"[^"]*"|'[^']*')$''')

    # Line numbers are tracked incrementally so newlines before each match
    # are counted once instead of re-scanning the document from the start.
    line_number = 1
    scanned_to = 0

    for match in link_pattern.finditer(content):
        line_number += content.count('\n', scanned_to, match.start())
        scanned_to = match.start()

        target = match.group(2).strip()
        titled = title_pattern.match(target)
        if titled:
            target = titled.group(1)
        if len(target) > 2 and target.startswith('<') and target.endswith('>'):
            target = target[1:-1]

        links.append({
            'text': match.group(1),
            'path': target,
            'line': line_number,
            'source_file': file_path,
        })

    return links
|
||
|
|
|
||
|
|
def resolve_path(link_path, source_file):
    """Resolve a markdown link destination to a filesystem path.

    Args:
        link_path: Destination as written in the link, optionally with a
            ``#fragment``.
        source_file: Path of the markdown file that contains the link.

    Returns:
        A ``(path, anchor)`` tuple:

        * ``(None, 'external')`` for destinations that start with a URI
          scheme (``http:``, ``https:``, ``mailto:``, ``ftp:``, ``tel:`` ...).
        * ``(source_file, 'anchor')`` for fragment-only links (``#section``).
          The second element is the literal marker string ``'anchor'``,
          not the fragment itself.
        * ``(resolved_path, fragment_or_None)`` otherwise.

    The project root is taken to be the parent of the directory containing
    this script. Destinations starting with ``/`` are resolved against the
    project root; everything else is resolved against the directory of
    *source_file*. A ``../`` destination that escapes the project root is
    retried from the project root with its leading ``../`` segments removed.
    """
    from urllib.parse import unquote

    source_dir = os.path.dirname(source_file)
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Skip external links. Two or more scheme characters are required so a
    # Windows drive prefix such as "C:" is not mistaken for a URI scheme.
    if re.match(r'[A-Za-z][A-Za-z0-9+.\-]+:', link_path):
        return None, 'external'

    # Handle anchor-only links
    if link_path.startswith('#'):
        return source_file, 'anchor'

    # Split path and anchor; a link without '#' has no anchor at all (None).
    file_part, hash_sep, fragment = link_path.partition('#')
    anchor = fragment if hash_sep else None

    # Destinations are URLs, so "My%20Doc.md" refers to the file "My Doc.md".
    file_part = unquote(file_part)

    if file_part.startswith('/'):
        # Absolute path from project root
        return os.path.join(project_root, file_part.lstrip('/')), anchor

    # Relative path, resolved against the directory of the source file.
    full_path = os.path.normpath(os.path.join(source_dir, file_part))

    if file_part.startswith('../'):
        # commonpath() raises ValueError when absolute and relative paths
        # (or different drives) are mixed; compare absolute paths and treat
        # a failed comparison as "outside the project root".
        try:
            inside_root = (
                os.path.commonpath([project_root, os.path.abspath(full_path)])
                == project_root
            )
        except ValueError:
            inside_root = False

        if not inside_root:
            # Path went outside the project root: drop the leading "../"
            # segments and resolve what remains from the project root.
            rel_path = file_part
            while rel_path.startswith('../'):
                rel_path = rel_path[3:]
            full_path = os.path.join(project_root, rel_path)

    return full_path, anchor
|
||
|
|
|
||
|
|
def _document_anchors(content):
    """Return the lower-cased anchor ids that markdown *content* exposes."""
    anchors = set()
    slug_counts = {}

    heading_pattern = r'^[ \t]{0,3}#{1,6}[ \t]+(.+?)[ \t]*#*[ \t]*$'
    for heading in re.finditer(heading_pattern, content, re.MULTILINE):
        text = heading.group(1)

        # Explicit "{#custom-id}" suffix supported by several renderers.
        custom_id = re.search(r'\{#([^}\s]+)\}\s*$', text)
        if custom_id:
            anchors.add(custom_id.group(1).lower())
            text = text[:custom_id.start()]

        # Reduce the heading to its visible text: keep link labels, drop tags.
        text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)
        text = re.sub(r'<[^>]+>', '', text)

        # GitHub-style slug: lower-case, punctuation removed, spaces -> hyphens.
        slug = re.sub(r'[^\w\- ]', '', text.strip().lower()).replace(' ', '-')

        # Repeated headings get "-1", "-2", ... appended to stay unique.
        repeats = slug_counts.get(slug, 0)
        anchors.add(slug if repeats == 0 else f'{slug}-{repeats}')
        slug_counts[slug] = repeats + 1

    # Anchors declared in inline HTML, e.g. <a name="x"> or <h2 id="x">.
    html_id_pattern = r'<[A-Za-z][^>]*?\b(?:id|name)\s*=\s*["\']([^"\']+)["\']'
    for html_id in re.finditer(html_id_pattern, content):
        anchors.add(html_id.group(1).lower())

    return anchors


def _anchor_exists(content, anchor):
    """Tell whether *anchor* can be resolved inside markdown *content*."""
    from urllib.parse import unquote

    if unquote(anchor).lower() in _document_anchors(content):
        return True
    # Fallback: the looser substring match on heading lines, so any anchor
    # accepted by the previous check is still accepted.
    anchor_pattern = rf'#+\s+.*{re.escape(anchor)}'
    return re.search(anchor_pattern, content, re.IGNORECASE) is not None


def verify_link(link, project_root):
    """Verify that a single extracted markdown link points somewhere valid.

    Args:
        link: Dict with at least the keys ``'path'`` (link destination) and
            ``'source_file'`` (file containing the link).
        project_root: Accepted for interface compatibility; not used here.

    Returns:
        An ``(is_valid, link_type, error)`` tuple. ``link_type`` is one of
        ``'anchor'`` (fragment-only link, accepted without further checks),
        ``'external'`` (accepted without further checks), ``'valid'``,
        ``'missing_file'``, ``'invalid_anchor'`` or ``'error'`` (the target
        file could not be read). ``error`` is ``None`` for valid links and a
        message string otherwise.
    """
    link_path = link['path']
    source_file = link['source_file']

    # Fragment-only links are decided here, before path resolution. The
    # resolver reports them with its marker in the *second* tuple element,
    # so testing the resolved path for that marker can never succeed.
    if link_path.startswith('#'):
        return True, 'anchor', None  # Anchor-only links are valid

    resolved_path, anchor = resolve_path(link_path, source_file)

    if resolved_path is None:
        return True, 'external', None  # External links are considered valid

    if not os.path.isfile(resolved_path):
        return False, 'missing_file', f"File not found: {resolved_path}"

    if not anchor:
        return True, 'valid', None

    try:
        with open(resolved_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        return False, 'error', str(e)

    if _anchor_exists(content, anchor):
        return True, 'valid', None
    return False, 'invalid_anchor', f"Anchor '{anchor}' not found"
|
||
|
|
|
||
|
|
def _render_report(stats, broken_links, project_root, success_rate):
    """Build the markdown text of the verification report."""
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware UTC timestamp formats identically.
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    parts = [f"""# CROSS-REFERENCE VERIFICATION REPORT
## Automated Link Verification Results

**Generated:** {generated}
**Project Root:** {project_root}

---

## SUMMARY

- **Total Links Scanned:** {stats['total']}
- **Valid Links:** {stats['valid']}
- **External Links:** {stats['external']}
- **Broken Links:** {stats['broken']}
- **Invalid Anchors:** {stats['invalid_anchor']}
- **Success Rate:** {success_rate:.2f}%

---

## BROKEN LINKS

"""]

    if broken_links:
        for broken in broken_links:
            link = broken['link']
            rel_file = os.path.relpath(link['source_file'], project_root)
            parts.append(f"- **{rel_file}:{link['line']}** -> `{link['path']}`\n")
            if broken['error']:
                # Two-space indent so the error renders nested under its link.
                parts.append(f"  - Error: {broken['error']}\n")
        # Blank line separating the list from the next section.
        parts.append("\n")
    else:
        parts.append("✅ No broken links found!\n\n")

    parts.append("""---

## RECOMMENDATIONS

1. Fix all broken links identified above
2. Verify and correct invalid anchors
3. Update cross-references in affected documents
4. Re-run verification after fixes

---

**END OF VERIFICATION REPORT**
""")

    return "".join(parts)


def main():
    """Scan the project's markdown files, verify their links and write a report.

    The project root is the parent of the directory containing this script.
    Each broken link is printed as it is found; a markdown report is written
    to ``CROSS_REFERENCE_VERIFICATION_REPORT.md`` in the project root and a
    summary is printed at the end.

    Returns:
        ``0`` when no broken links were found, ``1`` otherwise.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    report_file = os.path.join(project_root, 'CROSS_REFERENCE_VERIFICATION_REPORT.md')

    print("=" * 50)
    print("DBIS Cross-Reference Verification")
    print("=" * 50)
    print()
    print(f"{Colors.BLUE}Project Root:{Colors.NC} {project_root}")
    print()

    # Find all markdown files
    print(f"{Colors.BLUE}Scanning markdown files...{Colors.NC}")
    md_files = find_markdown_files(project_root)
    print(f"Found {len(md_files)} markdown files")
    print()

    # Extract and verify links
    broken_links = []
    stats = {
        'total': 0,
        'valid': 0,
        'external': 0,
        'broken': 0,
        'invalid_anchor': 0,
    }

    for md_file in md_files:
        # Best effort: a failure while handling one file is reported and the
        # scan moves on to the next file.
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            for link in extract_links(content, md_file):
                stats['total'] += 1
                is_valid, link_type, error = verify_link(link, project_root)

                if link_type == 'external':
                    # External links count as valid and are also tallied separately.
                    stats['external'] += 1
                    stats['valid'] += 1
                elif is_valid:
                    stats['valid'] += 1
                else:
                    # 'broken' covers every failure kind; invalid anchors are
                    # additionally counted on their own.
                    stats['broken'] += 1
                    if link_type == 'invalid_anchor':
                        stats['invalid_anchor'] += 1
                    broken_links.append({
                        'link': link,
                        'type': link_type,
                        'error': error,
                    })
                    rel_file = os.path.relpath(link['source_file'], project_root)
                    print(f"{Colors.RED}✗{Colors.NC} {rel_file}:{link['line']} -> {link['path']}")
                    if error:
                        print(f" {Colors.YELLOW}Error:{Colors.NC} {error}")
        except Exception as e:
            print(f"{Colors.RED}Error reading {md_file}:{Colors.NC} {e}")

    # Generate report
    success_rate = (stats['valid'] / stats['total'] * 100) if stats['total'] > 0 else 0
    report_content = _render_report(stats, broken_links, project_root, success_rate)

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report_content)

    # Print summary
    print()
    print("=" * 50)
    print("Verification Summary")
    print("=" * 50)
    print()
    print(f"{Colors.GREEN}Total Links Scanned:{Colors.NC} {stats['total']}")
    print(f"{Colors.GREEN}Valid Links:{Colors.NC} {stats['valid']}")
    print(f"{Colors.BLUE}External Links:{Colors.NC} {stats['external']}")
    print(f"{Colors.RED}Broken Links:{Colors.NC} {stats['broken']}")
    if stats['invalid_anchor'] > 0:
        print(f"{Colors.YELLOW}Invalid Anchors:{Colors.NC} {stats['invalid_anchor']}")
    print(f"{Colors.BLUE}Success Rate:{Colors.NC} {success_rate:.2f}%")
    print()
    print(f"{Colors.BLUE}Report generated:{Colors.NC} {report_file}")
    print()
    print(f"{Colors.GREEN}✓ Verification complete!{Colors.NC}")

    return 0 if stats['broken'] == 0 else 1
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Run the verification and hand main()'s return value to the interpreter
    # as the process exit status.
    raise SystemExit(main())
|
||
|
|
|