Files
dbis_docs/scripts/verify_links.py

263 lines
8.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DBIS Cross-Reference Verification Script
Automated link verification and cross-reference checking
"""
import os
import re
import sys
from pathlib import Path
from datetime import datetime
from collections import defaultdict
# Colors for terminal output
class Colors:
    """ANSI escape sequences used to colorize terminal output."""

    # Reset sequence: switches the terminal back to its default rendering.
    NC = '\033[0m'

    # Foreground colors.
    BLUE = '\033[0;34m'
    YELLOW = '\033[1;33m'
    GREEN = '\033[0;32m'
    RED = '\033[0;31m'
def find_markdown_files(root_dir):
    """Collect the paths of every ``.md`` file found below *root_dir*.

    The ``.git``, ``node_modules`` and ``__pycache__`` directories are pruned
    from the walk, so nothing beneath them is returned.
    """
    skipped = ('.git', 'node_modules', '__pycache__')
    collected = []
    for current_dir, subdirs, filenames in os.walk(root_dir):
        # Prune in place so os.walk does not descend into skipped directories.
        subdirs[:] = [name for name in subdirs if name not in skipped]
        collected.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.md')
        )
    return collected
def extract_links(content, file_path):
    """Extract all inline Markdown links from *content*.

    Args:
        content: Full text of a Markdown document.
        file_path: Path of the document; stored with every link so the
            link can later be resolved relative to its source.

    Returns:
        A list of dicts, in document order, with the keys ``text`` (link
        label), ``path`` (link destination, without an optional quoted
        title), ``line`` (1-based line number of the link) and
        ``source_file`` (*file_path*).
    """
    # Pattern: [text](path) or [text](path#anchor)
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    # Pattern: destination followed by a quoted title, e.g. (file.md "Title").
    # The title is not part of the destination and must not be resolved as a path.
    titled_pattern = re.compile(r'''^\s*(\S+)\s+(?:"[^"]*"|'[^']*')\s*$''')

    links = []
    line_number = 1
    counted_up_to = 0
    for match in link_pattern.finditer(content):
        # Count only the newlines since the previous match instead of
        # re-scanning the document from the start for every link.
        line_number += content.count('\n', counted_up_to, match.start())
        counted_up_to = match.start()

        link_path = match.group(2)
        titled = titled_pattern.match(link_path)
        if titled:
            link_path = titled.group(1)

        links.append({
            'text': match.group(1),
            'path': link_path,
            'line': line_number,
            'source_file': file_path,
        })
    return links
def _is_within(root, path):
    """Return True when *path* (made absolute) lies inside directory *root*."""
    try:
        return os.path.commonpath([root, os.path.abspath(path)]) == root
    except ValueError:
        # commonpath raises for paths on different drives; such a path is
        # by definition not inside *root*.
        return False


def resolve_path(link_path, source_file):
    """Resolve a Markdown link destination to a file system path.

    Args:
        link_path: The link destination as written in the document.
        source_file: Path of the document that contains the link.

    Returns:
        A ``(path, info)`` tuple:

        * ``(None, 'external')`` for external targets (http, https, ftp,
          ftps, mailto, tel);
        * ``(source_file, 'anchor')`` for anchor-only links (``#section``);
        * ``(resolved_file_path, anchor)`` otherwise, where ``anchor`` is the
          text after the first ``#`` or ``None`` when the link has no ``#``.
    """
    # Function-scope import: the module's import block does not provide it.
    from urllib.parse import unquote

    # Skip external links
    external_prefixes = ('http://', 'https://', 'mailto:', 'ftp://', 'ftps://', 'tel:')
    if link_path.lower().startswith(external_prefixes):
        return None, 'external'

    # Handle anchor-only links
    if link_path.startswith('#'):
        return source_file, 'anchor'

    # Split path and anchor
    if '#' in link_path:
        file_part, anchor = link_path.split('#', 1)
    else:
        file_part, anchor = link_path, None

    # Link destinations may be percent-encoded (e.g. "my%20file.md").
    file_part = unquote(file_part)

    source_dir = os.path.dirname(source_file)
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    if file_part.startswith('/'):
        # Absolute path from project root
        return os.path.join(project_root, file_part.lstrip('/')), anchor

    # Relative path: resolve against the directory of the source document
    full_path = os.path.normpath(os.path.join(source_dir, file_part))

    if file_part.startswith('../') and not _is_within(project_root, full_path):
        # Path went outside project root: drop the leading "../" segments
        # and retry from the project root instead.
        rel_path = file_part
        while rel_path.startswith('../'):
            rel_path = rel_path[3:]
        full_path = os.path.join(project_root, rel_path)

    return full_path, anchor
def _heading_anchors(content):
    """Return the anchors defined by *content*.

    Covers GitHub-style slugs of ATX headings (lower-cased, punctuation
    removed, spaces turned into hyphens, ``-N`` suffix for repeated
    headings) and explicit HTML ``id=``/``name=`` attributes.
    """
    anchors = set()
    occurrences = {}
    for match in re.finditer(r'^[ ]{0,3}#{1,6}[ \t]+(.+?)[ \t#]*$', content, re.MULTILINE):
        # A link inside a heading contributes only its label to the slug.
        title = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', match.group(1))
        slug = re.sub(r'[^\w\- ]', '', title.strip().lower()).replace(' ', '-')
        seen = occurrences.get(slug, 0)
        occurrences[slug] = seen + 1
        anchors.add(slug if seen == 0 else f'{slug}-{seen}')
    for explicit in re.findall(r'<[^>]*\b(?:id|name)=["\']([^"\']+)["\']', content):
        anchors.add(explicit.lower())
    return anchors


def verify_link(link, project_root):
    """Verify if a link is valid.

    Args:
        link: Link dict with at least the keys ``path`` and ``source_file``.
        project_root: Accepted for interface compatibility; not used here.

    Returns:
        An ``(is_valid, link_type, error)`` tuple. ``link_type`` is one of
        ``'external'``, ``'anchor'``, ``'valid'``, ``'invalid_anchor'``,
        ``'missing_file'`` or ``'error'``; ``error`` is a message string for
        invalid links and ``None`` otherwise.
    """
    link_path = link['path']
    source_file = link['source_file']

    # Anchor-only links are valid. This is decided from the link text itself:
    # resolve_path reports them as (source_file, 'anchor'), which cannot be
    # told apart from a file link whose anchor is literally "anchor".
    if link_path.startswith('#'):
        return True, 'anchor', None

    resolved_path, anchor = resolve_path(link_path, source_file)
    if resolved_path is None:
        return True, 'external', None  # External links are considered valid

    # Check if file exists
    if not os.path.isfile(resolved_path):
        return False, 'missing_file', f"File not found: {resolved_path}"
    if not anchor:
        return True, 'valid', None

    try:
        with open(resolved_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        return False, 'error', str(e)

    # Anchors are slugs ("my-section") while headings are free text
    # ("## My Section"), so compare against the slugified headings first.
    if anchor.strip().lower() in _heading_anchors(content):
        return True, 'valid', None
    # Fallback: anchor text appearing verbatim after a heading marker.
    if re.search(rf'#+\s+.*{re.escape(anchor)}', content, re.IGNORECASE):
        return True, 'valid', None
    return False, 'invalid_anchor', f"Anchor '{anchor}' not found"
def _render_report(stats, broken_links, project_root, success_rate):
    """Build the Markdown text of the verification report."""
    # Function-scope import: the module imports only the datetime class.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    # Blank lines separate the blocks: a "---" placed directly under a
    # paragraph line is a setext heading underline in Markdown, not a rule.
    lines = [
        "# CROSS-REFERENCE VERIFICATION REPORT",
        "## Automated Link Verification Results",
        "",
        f"**Generated:** {generated}",
        "",
        f"**Project Root:** {project_root}",
        "",
        "---",
        "",
        "## SUMMARY",
        "",
        f"- **Total Links Scanned:** {stats['total']}",
        f"- **Valid Links:** {stats['valid']}",
        f"- **External Links:** {stats['external']}",
        f"- **Broken Links:** {stats['broken']}",
        f"- **Invalid Anchors:** {stats['invalid_anchor']}",
        f"- **Success Rate:** {success_rate:.2f}%",
        "",
        "---",
        "",
        "## BROKEN LINKS",
        "",
    ]

    if broken_links:
        for broken in broken_links:
            link = broken['link']
            rel_file = os.path.relpath(link['source_file'], project_root)
            lines.append(f"- **{rel_file}:{link['line']}** -> `{link['path']}`")
            if broken['error']:
                # Two-space indent so the error nests under its link entry.
                lines.append(f"  - Error: {broken['error']}")
    else:
        lines.append("✅ No broken links found!")

    lines += [
        "",
        "---",
        "",
        "## RECOMMENDATIONS",
        "",
        "1. Fix all broken links identified above",
        "2. Verify and correct invalid anchors",
        "3. Update cross-references in affected documents",
        "4. Re-run verification after fixes",
        "",
        "---",
        "",
        "**END OF VERIFICATION REPORT**",
        "",
    ]
    return "\n".join(lines)


def main():
    """Main verification function.

    Scans every Markdown file below the project root (the parent of this
    script's directory), verifies each link, prints progress and a summary,
    and writes ``CROSS_REFERENCE_VERIFICATION_REPORT.md`` to the project root.

    Returns:
        0 when no broken links were found, 1 otherwise.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    report_file = os.path.join(project_root, 'CROSS_REFERENCE_VERIFICATION_REPORT.md')

    print("=" * 50)
    print("DBIS Cross-Reference Verification")
    print("=" * 50)
    print()
    print(f"{Colors.BLUE}Project Root:{Colors.NC} {project_root}")
    print()

    # Find all markdown files
    print(f"{Colors.BLUE}Scanning markdown files...{Colors.NC}")
    md_files = find_markdown_files(project_root)
    print(f"Found {len(md_files)} markdown files")
    print()

    # Extract and verify links
    broken_links = []
    stats = {
        'total': 0,
        'valid': 0,
        'external': 0,
        'broken': 0,
        'invalid_anchor': 0,
    }

    for md_file in md_files:
        # Only the read is guarded here, so "Error reading" is reported
        # solely for genuine read/decode failures.
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"{Colors.RED}Error reading {md_file}:{Colors.NC} {e}")
            continue

        for link in extract_links(content, md_file):
            stats['total'] += 1
            try:
                is_valid, link_type, error = verify_link(link, project_root)
            except (OSError, ValueError) as e:
                # A link that cannot be checked is recorded as broken instead
                # of aborting the remaining links of this file.
                is_valid, link_type, error = False, 'error', str(e)

            if link_type == 'external':
                stats['external'] += 1
                stats['valid'] += 1
                continue
            if is_valid:
                stats['valid'] += 1
                continue

            stats['broken'] += 1
            if link_type == 'invalid_anchor':
                stats['invalid_anchor'] += 1
            broken_links.append({
                'link': link,
                'type': link_type,
                'error': error,
            })
            rel_file = os.path.relpath(link['source_file'], project_root)
            print(f"{Colors.RED}{Colors.NC} {rel_file}:{link['line']} -> {link['path']}")
            if error:
                print(f" {Colors.YELLOW}Error:{Colors.NC} {error}")

    # Generate report
    success_rate = (stats['valid'] / stats['total'] * 100) if stats['total'] > 0 else 0
    report_content = _render_report(stats, broken_links, project_root, success_rate)
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report_content)

    # Print summary
    print()
    print("=" * 50)
    print("Verification Summary")
    print("=" * 50)
    print()
    print(f"{Colors.GREEN}Total Links Scanned:{Colors.NC} {stats['total']}")
    print(f"{Colors.GREEN}Valid Links:{Colors.NC} {stats['valid']}")
    print(f"{Colors.BLUE}External Links:{Colors.NC} {stats['external']}")
    print(f"{Colors.RED}Broken Links:{Colors.NC} {stats['broken']}")
    if stats['invalid_anchor'] > 0:
        print(f"{Colors.YELLOW}Invalid Anchors:{Colors.NC} {stats['invalid_anchor']}")
    print(f"{Colors.BLUE}Success Rate:{Colors.NC} {success_rate:.2f}%")
    print()
    print(f"{Colors.BLUE}Report generated:{Colors.NC} {report_file}")
    print()
    print(f"{Colors.GREEN}✓ Verification complete!{Colors.NC}")

    return 0 if stats['broken'] == 0 else 1
# Script entry point: run the verification and use its result as the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main())