---
# Prometheus Alert Rules
# Defines alerting conditions for The Order services
#
# Layout: four rule groups (service_health, infrastructure, database, azure),
# each evaluated every 30s. Every rule sets a `severity` label and
# `summary`/`description` annotations.

groups:
  # Availability, error-rate and latency alerts for the HTTP services.
  - name: service_health
    interval: 30s
    rules:
      # Fires when the `up` series of one of the listed jobs has been 0 for
      # 5 minutes.
      - alert: ServiceDown
        expr: >-
          up{job=~"identity-service|intake-service|finance-service|dataroom-service|legal-documents-service"}
          == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 5 minutes"

      # Threshold is an absolute per-series rate of 5xx responses (0.05/s),
      # not a ratio of errors to total requests.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate for {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second"

      # 95th-percentile request duration, estimated from histogram buckets
      # over a 5-minute window, above 2 for 10 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High response time for {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }} seconds"

  # Process- and host-level resource alerts.
  - name: infrastructure
    interval: 30s
    rules:
      # The expression yields CPU-seconds consumed per second (a ratio where
      # 1.0 is one full core), so the annotation renders it with
      # humanizePercentage instead of appending a literal "%".
      - alert: HighCPUUsage
        expr: rate(process_cpu_user_seconds_total[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage for {{ $labels.job }}"
          description: "CPU usage is {{ $value | humanizePercentage }}"

      # NOTE(review): this compares resident memory to *virtual* memory, which
      # is not utilisation against a memory limit - confirm this is the
      # intended signal. The value is a 0-1 ratio, hence humanizePercentage.
      - alert: HighMemoryUsage
        expr: (process_resident_memory_bytes / process_virtual_memory_bytes) > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage for {{ $labels.job }}"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      # Fires when less than 10% of a filesystem is available. The value is a
      # 0-1 ratio, hence humanizePercentage.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk space is {{ $value | humanizePercentage }} available"

  # PostgreSQL alerts.
  - name: database
    interval: 30s
    rules:
      # Division is a one-to-one vector match, so both sides must carry the
      # same label set. NOTE(review): with postgres_exporter the backend count
      # is reported per database (datid/datname labels) while max_connections
      # is not, so the per-database labels are summed away here; the
      # aggregation is a no-op if those labels are absent - confirm against
      # the exporter in use.
      - alert: DatabaseConnectionPoolExhausted
        expr: >-
          sum without (datid, datname) (pg_stat_database_numbackends)
          / pg_settings_max_connections > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "{{ $value | humanizePercentage }} of connections in use"

      # A mean execution time is a gauge, so it is averaged over the window
      # rather than passed to rate(), which is meant for counters.
      # NOTE(review): the threshold and annotation assume the metric is in
      # seconds; pg_stat_statements itself reports milliseconds - confirm the
      # unit the exporter emits.
      - alert: SlowQueries
        expr: avg_over_time(pg_stat_statements_mean_exec_time[5m]) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries detected"
          description: "Average query time is {{ $value }} seconds"

  # Alerts for Azure-hosted dependencies.
  - name: azure
    interval: 30s
    rules:
      # Any sustained rate of HTTP 429 responses over one minute.
      - alert: EntraAPIRateLimit
        expr: rate(entra_api_requests_total{status="429"}[5m]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Entra API rate limit hit"
          description: "Rate limit errors detected for Entra VerifiedID API"

      # Storage error rate above 0.01 errors per second for 5 minutes.
      - alert: AzureStorageErrors
        expr: rate(azure_storage_errors_total[5m]) > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Azure Storage errors detected"
          description: "Storage error rate is {{ $value }} errors per second"