#!/usr/bin/env python3
"""
DBIS Cross-Reference Verification Script
Automated link verification and cross-reference checking
"""

import os
import re
import sys
from pathlib import Path
from datetime import datetime
from collections import defaultdict


# Colors for terminal output
class Colors:
    """ANSI escape sequences used to colorize terminal output."""

    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    NC = '\033[0m'  # No Color


# Directory names that are never descended into while searching for markdown.
_SKIPPED_DIRS = ('.git', 'node_modules', '__pycache__')

# Pattern: [text](path) or [text](path#anchor)
_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

# Fenced code blocks (``` ... ```) or inline code spans (` ... `).  The fenced
# alternative is lazy and DOTALL so a fence may contain single backticks.
_CODE_RE = re.compile(r'```.*?```|`[^`]+`', re.DOTALL)

_NON_NEWLINE_RE = re.compile(r'[^\n]')

# Link targets containing any of these fragments are documentation examples,
# not real cross-references.
_PLACEHOLDER_FRAGMENTS = (
    'relative/path/to/',
    'document.md',
    'path/to/directory/',
    'path/to/file.md',
)


def find_markdown_files(root_dir):
    """Find all markdown files in the project.

    Walks ``root_dir`` recursively, skipping the directories listed in
    ``_SKIPPED_DIRS``, and returns the paths of all files ending in ``.md``.
    Directories and files are visited in sorted order so the result (and any
    report built from it) is reproducible across filesystems.
    """
    md_files = []
    for root, dirs, files in os.walk(root_dir):
        # Assigning to dirs[:] prunes (and orders) the walk in place.
        dirs[:] = sorted(d for d in dirs if d not in _SKIPPED_DIRS)
        md_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith('.md')
        )
    return md_files


def _blank_code(match):
    """Replace a code span with spaces, keeping its newlines.

    Keeping the length and the newlines means offsets into the masked text
    are also valid offsets into the original text.
    """
    return _NON_NEWLINE_RE.sub(' ', match.group(0))


def extract_links(content, file_path):
    """Extract all markdown links from content.

    Links that appear inside fenced code blocks or inline code spans are
    ignored, as are links whose target contains a known placeholder fragment.

    Returns a list of dicts with the keys ``text`` and ``path`` (both as
    written in ``content``), ``line`` (1-based line number of the link in
    ``content``) and ``source_file`` (``file_path``, passed through).
    """
    # Mask code instead of deleting it: match offsets then line up with the
    # original content, so line numbers are exact, and a link whose text is
    # inline code (e.g. [`name`](file.md)) is still recognised.
    masked = _CODE_RE.sub(_blank_code, content)

    links = []
    for match in _LINK_RE.finditer(masked):
        link_text = content[match.start(1):match.end(1)]
        link_path = content[match.start(2):match.end(2)]

        # Skip example/placeholder links
        if any(fragment in link_path for fragment in _PLACEHOLDER_FRAGMENTS):
            continue

        links.append({
            'text': link_text,
            'path': link_path,
            'line': content.count('\n', 0, match.start()) + 1,
            'source_file': file_path,
        })
    return links


def resolve_path(link_path, source_file):
    """Resolve relative path to absolute path."""
    source_dir = os.path.dirname(source_file)
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Skip external links
    if link_path.startswith(('http://', 'https://', 'mailto:')):
        return None, 'external'

    # Handle anchor-only links
    if link_path.startswith('#'):
        anchor = link_path[1:]  # Remove the #
        # Verify anchor exists in source
file try: with open(source_file, 'r', encoding='utf-8') as f: content = f.read() # Check for HTML anchor html_pattern = rf' {link['path']}") if error: print(f" {Colors.YELLOW}Error:{Colors.NC} {error}") except Exception as e: print(f"{Colors.RED}Error reading {md_file}:{Colors.NC} {e}") # Generate report success_rate = (stats['valid'] / stats['total'] * 100) if stats['total'] > 0 else 0 report_content = f"""# CROSS-REFERENCE VERIFICATION REPORT ## Automated Link Verification Results **Generated:** {datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")} **Project Root:** {project_root} --- ## SUMMARY - **Total Links Scanned:** {stats['total']} - **Valid Links:** {stats['valid']} - **External Links:** {stats['external']} - **Broken Links:** {stats['broken']} - **Invalid Anchors:** {stats['invalid_anchor']} - **Success Rate:** {success_rate:.2f}% --- ## BROKEN LINKS """ if broken_links: for broken in broken_links: link = broken['link'] rel_file = os.path.relpath(link['source_file'], project_root) report_content += f"- **{rel_file}:{link['line']}** -> `{link['path']}`\n" if broken['error']: report_content += f" - Error: {broken['error']}\n" report_content += "\n" else: report_content += "✅ No broken links found!\n\n" report_content += """--- ## RECOMMENDATIONS 1. Fix all broken links identified above 2. Verify and correct invalid anchors 3. Update cross-references in affected documents 4. 
Re-run verification after fixes --- **END OF VERIFICATION REPORT** """ with open(report_file, 'w', encoding='utf-8') as f: f.write(report_content) # Print summary print() print("=" * 50) print("Verification Summary") print("=" * 50) print() print(f"{Colors.GREEN}Total Links Scanned:{Colors.NC} {stats['total']}") print(f"{Colors.GREEN}Valid Links:{Colors.NC} {stats['valid']}") print(f"{Colors.BLUE}External Links:{Colors.NC} {stats['external']}") print(f"{Colors.RED}Broken Links:{Colors.NC} {stats['broken']}") if stats['invalid_anchor'] > 0: print(f"{Colors.YELLOW}Invalid Anchors:{Colors.NC} {stats['invalid_anchor']}") print(f"{Colors.BLUE}Success Rate:{Colors.NC} {success_rate:.2f}%") print() print(f"{Colors.BLUE}Report generated:{Colors.NC} {report_file}") print() print(f"{Colors.GREEN}✓ Verification complete!{Colors.NC}") return 0 if stats['broken'] == 0 else 1 if __name__ == '__main__': sys.exit(main())