#!/usr/bin/env python3
"""
DBIS Cross-Reference Verification Script
Automated link verification and cross-reference checking
"""

import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path


# Colors for terminal output
class Colors:
    """ANSI escape sequences used to colorize console output."""

    RED = '\033[0;31m'
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    BLUE = '\033[0;34m'
    NC = '\033[0m'  # No Color


# Directories that never contain project documentation.
_SKIPPED_DIRS = frozenset({'.git', 'node_modules', '__pycache__'})

# Inline markdown link: [text](path) or [text](path#anchor)
_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')

# Link targets with these prefixes point outside the repository and are
# not checked against the filesystem.
_EXTERNAL_PREFIXES = ('http://', 'https://', 'mailto:', 'ftp://', 'ftps://', 'tel:')

# Optional link title, as in [text](path "Title").
_LINK_TITLE_RE = re.compile(r'''\s+(?:"[^"]*"|'[^']*')\s*$''')

# ATX heading ("## Title"), with an optional closing run of '#'.
_HEADING_RE = re.compile(r'^ {0,3}#{1,6}[ \t]+(.+?)(?:[ \t]+#+)?[ \t]*$', re.MULTILINE)

# Explicit HTML anchors such as <a name="x"> or <h2 id="x">.
_EXPLICIT_ANCHOR_RE = re.compile(r'''<[^>]*\b(?:id|name)\s*=\s*["']([^"']+)["']''')

# Markdown link inside a heading; only its text contributes to the slug.
_INLINE_LINK_RE = re.compile(r'\[([^\]]*)\]\([^)]*\)')


def _default_project_root():
    """Return the directory one level above the directory holding this script."""
    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def find_markdown_files(root_dir):
    """Find all markdown files in the project.

    Walks ``root_dir`` recursively, pruning ``.git``, ``node_modules`` and
    ``__pycache__``. Directories and files are visited in sorted order so
    the console output and the report are stable between runs.

    Returns:
        A list of paths (``root_dir`` joined with the relative location)
        for every file whose name ends in ``.md``.
    """
    md_files = []
    for root, dirs, files in os.walk(root_dir):
        # Assign in place so os.walk does not descend into skipped dirs.
        dirs[:] = sorted(d for d in dirs if d not in _SKIPPED_DIRS)
        md_files.extend(
            os.path.join(root, name) for name in sorted(files) if name.endswith('.md')
        )
    return md_files


def extract_links(content, file_path):
    """Extract all markdown links from content.

    Args:
        content: Full text of a markdown document.
        file_path: Path of that document; stored with every link.

    Returns:
        A list of dicts with keys ``text``, ``path``, ``line`` (1-based line
        of the link's opening bracket) and ``source_file``.
    """
    links = []
    line_no = 1
    scanned_to = 0
    for match in _LINK_RE.finditer(content):
        # Count only the newlines since the previous match instead of
        # re-scanning the whole prefix of the file for every link.
        line_no += content.count('\n', scanned_to, match.start())
        scanned_to = match.start()
        links.append({
            'text': match.group(1),
            'path': match.group(2),
            'line': line_no,
            'source_file': file_path,
        })
    return links


def _is_within(root, path):
    """Return True when ``path`` is ``root`` itself or lies below it."""
    try:
        return os.path.commonpath([root, path]) == root
    except ValueError:
        # Raised for paths on different drives or a mix of absolute and
        # relative paths; either way the path is not inside root.
        return False


def resolve_path(link_path, source_file, project_root=None):
    """Resolve relative path to absolute path.

    Args:
        link_path: Raw link target as written in the markdown source.
        source_file: Path of the document containing the link.
        project_root: Directory that root-relative links (``/docs/x.md``)
            are resolved against. Defaults to the parent of this script's
            directory.

    Returns:
        A ``(path, anchor)`` tuple:

        * ``(None, 'external')`` for external links (http, mailto, ...);
        * ``(source_file, 'anchor')`` for anchor-only links (``#section``);
        * ``(full_path, anchor)`` otherwise, where ``anchor`` is the text
          after ``#`` or ``None`` when the link has no ``#``.
    """
    if project_root is None:
        project_root = _default_project_root()
    else:
        project_root = os.path.abspath(project_root)
    source_dir = os.path.dirname(os.path.abspath(source_file))

    # Drop surrounding whitespace and an optional "title" after the target.
    link_path = _LINK_TITLE_RE.sub('', link_path.strip())

    # Skip external links
    if link_path.lower().startswith(_EXTERNAL_PREFIXES):
        return None, 'external'

    # Handle anchor-only links
    if link_path.startswith('#'):
        return source_file, 'anchor'

    # Split path and anchor
    file_part, separator, fragment = link_path.partition('#')
    anchor = fragment if separator else None

    if file_part.startswith('/'):
        # Absolute path from project root
        full_path = os.path.normpath(os.path.join(project_root, file_part.lstrip('/')))
    elif file_part.startswith('../'):
        # Relative path going up - resolve relative to source directory
        full_path = os.path.normpath(os.path.join(source_dir, file_part))
        if not _is_within(project_root, full_path):
            # The path climbed out of the project; drop the leading ../
            # segments and retry from the project root instead.
            rel_path = file_part
            while rel_path.startswith('../'):
                rel_path = rel_path[3:]
            full_path = os.path.normpath(os.path.join(project_root, rel_path))
    else:
        # Relative path in same directory or subdirectory
        full_path = os.path.normpath(os.path.join(source_dir, file_part))

    return full_path, anchor


def _slugify_heading(heading):
    """Convert heading text to a GitHub-style anchor slug."""
    text = _INLINE_LINK_RE.sub(r'\1', heading).strip().lower()
    # Keep word characters, hyphens and spaces; drop all other punctuation.
    text = re.sub(r'[^\w\- ]', '', text)
    return text.replace(' ', '-')


def _collect_anchors(content):
    """Return the set of anchors a markdown document exposes.

    Includes the slug of every ATX heading (repeated headings receive the
    ``-1``, ``-2``, ... suffixes GitHub generates) and the lower-cased
    values of explicit HTML ``id=``/``name=`` attributes.
    """
    anchors = set()
    seen = Counter()
    for match in _HEADING_RE.finditer(content):
        slug = _slugify_heading(match.group(1))
        occurrence = seen[slug]
        seen[slug] += 1
        anchors.add(slug if occurrence == 0 else f"{slug}-{occurrence}")
    anchors.update(value.lower() for value in _EXPLICIT_ANCHOR_RE.findall(content))
    return anchors


def verify_link(link, project_root):
    """Verify if a link is valid.

    Args:
        link: Link dict as produced by ``extract_links``.
        project_root: Directory that root-relative links resolve against.

    Returns:
        A ``(is_valid, link_type, error)`` tuple. ``link_type`` is one of
        ``'external'``, ``'anchor'``, ``'valid'``, ``'invalid_anchor'``,
        ``'missing_file'`` or ``'error'``; ``error`` is a message for
        invalid links and ``None`` otherwise.
    """
    link_path = link['path']
    source_file = link['source_file']

    # Anchor-only links are valid. Decide this from the raw link, because
    # resolve_path reports them as (source_file, 'anchor'), which cannot
    # be told apart from a real link to "file.md#anchor".
    if link_path.strip().startswith('#'):
        return True, 'anchor', None

    resolved_path, anchor = resolve_path(link_path, source_file, project_root)

    if resolved_path is None:
        return True, 'external', None  # External links are considered valid

    if not os.path.isfile(resolved_path):
        return False, 'missing_file', f"File not found: {resolved_path}"

    if not anchor:
        return True, 'valid', None

    try:
        with open(resolved_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        return False, 'error', str(e)

    if anchor.lower() in _collect_anchors(content):
        return True, 'valid', None

    # Fallback: loose match of the anchor text inside any heading line, so
    # links accepted before slug matching was added remain accepted.
    anchor_pattern = rf'#+\s+.*{re.escape(anchor)}'
    if re.search(anchor_pattern, content, re.IGNORECASE):
        return True, 'valid', None

    return False, 'invalid_anchor', f"Anchor '{anchor}' not found"


def main():
    """Main verification function.

    Scans every markdown file under the project root, verifies each link,
    prints broken links to the console and writes
    ``CROSS_REFERENCE_VERIFICATION_REPORT.md`` into the project root.

    Returns:
        0 when no broken links were found, 1 otherwise.
    """
    project_root = _default_project_root()
    report_file = os.path.join(project_root, 'CROSS_REFERENCE_VERIFICATION_REPORT.md')

    print("=" * 50)
    print("DBIS Cross-Reference Verification")
    print("=" * 50)
    print()
    print(f"{Colors.BLUE}Project Root:{Colors.NC} {project_root}")
    print()

    # Find all markdown files
    print(f"{Colors.BLUE}Scanning markdown files...{Colors.NC}")
    md_files = find_markdown_files(project_root)
    print(f"Found {len(md_files)} markdown files")
    print()

    # Extract and verify links
    broken_links = []
    stats = {
        'total': 0,
        'valid': 0,
        'external': 0,
        'broken': 0,
        'invalid_anchor': 0,
    }

    for md_file in md_files:
        # Only the read is guarded: an unreadable file is reported and
        # skipped without hiding errors raised by the verification logic.
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"{Colors.RED}Error reading {md_file}:{Colors.NC} {e}")
            continue

        for link in extract_links(content, md_file):
            stats['total'] += 1
            is_valid, link_type, error = verify_link(link, project_root)

            if link_type == 'external':
                stats['external'] += 1
                stats['valid'] += 1
                continue
            if is_valid:
                stats['valid'] += 1
                continue

            stats['broken'] += 1
            if link_type == 'invalid_anchor':
                stats['invalid_anchor'] += 1
            broken_links.append({
                'link': link,
                'type': link_type,
                'error': error,
            })

            rel_file = os.path.relpath(link['source_file'], project_root)
            print(f"{Colors.RED}✗{Colors.NC} {rel_file}:{link['line']} -> {link['path']}")
            if error:
                print(f" {Colors.YELLOW}Error:{Colors.NC} {error}")

    # Generate report
    success_rate = (stats['valid'] / stats['total'] * 100) if stats['total'] > 0 else 0
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    report_parts = [f"""# CROSS-REFERENCE VERIFICATION REPORT
## Automated Link Verification Results

**Generated:** {generated_at}
**Project Root:** {project_root}

---

## SUMMARY

- **Total Links Scanned:** {stats['total']}
- **Valid Links:** {stats['valid']}
- **External Links:** {stats['external']}
- **Broken Links:** {stats['broken']}
- **Invalid Anchors:** {stats['invalid_anchor']}
- **Success Rate:** {success_rate:.2f}%

---

## BROKEN LINKS

"""]

    if broken_links:
        for broken in broken_links:
            link = broken['link']
            rel_file = os.path.relpath(link['source_file'], project_root)
            report_parts.append(f"- **{rel_file}:{link['line']}** -> `{link['path']}`\n")
            if broken['error']:
                # Two-space indent so markdown nests this under the link.
                report_parts.append(f"  - Error: {broken['error']}\n")
        report_parts.append("\n")
    else:
        report_parts.append("✅ No broken links found!\n\n")

    report_parts.append("""---

## RECOMMENDATIONS

1. Fix all broken links identified above
2. Verify and correct invalid anchors
3. Update cross-references in affected documents
4. Re-run verification after fixes

---

**END OF VERIFICATION REPORT**
""")

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("".join(report_parts))

    # Print summary
    print()
    print("=" * 50)
    print("Verification Summary")
    print("=" * 50)
    print()
    print(f"{Colors.GREEN}Total Links Scanned:{Colors.NC} {stats['total']}")
    print(f"{Colors.GREEN}Valid Links:{Colors.NC} {stats['valid']}")
    print(f"{Colors.BLUE}External Links:{Colors.NC} {stats['external']}")
    print(f"{Colors.RED}Broken Links:{Colors.NC} {stats['broken']}")
    if stats['invalid_anchor'] > 0:
        print(f"{Colors.YELLOW}Invalid Anchors:{Colors.NC} {stats['invalid_anchor']}")
    print(f"{Colors.BLUE}Success Rate:{Colors.NC} {success_rate:.2f}%")
    print()
    print(f"{Colors.BLUE}Report generated:{Colors.NC} {report_file}")
    print()
    print(f"{Colors.GREEN}✓ Verification complete!{Colors.NC}")

    return 0 if stats['broken'] == 0 else 1


if __name__ == '__main__':
    sys.exit(main())