Files
the_order/infra/monitoring/alert-rules.yml
defiQUG 3d43155312 feat: expand test coverage and configure comprehensive alerting
- Add unit tests for all core services (identity, intake, finance, dataroom)
- Create integration test framework with shared setup utilities
- Add E2E test suite for complete user workflows
- Add test utilities package (server factory)
- Configure Prometheus alert rules (service health, infrastructure, database, Azure)
- Add alert rules ConfigMap for Kubernetes
- Update Prometheus deployment with alert rules
- Fix tsconfig.json to include test files
- Add tests/tsconfig.json for integration/E2E tests
- Fix server-factory.ts linting issues
2025-11-13 10:04:32 -08:00

106 lines
3.4 KiB
YAML

# Prometheus Alert Rules
# Defines alerting conditions for The Order services
groups:
# Availability, error and latency alerts for the HTTP services.
# Every rule in this group is evaluated once per 30 seconds.
- name: service_health
  interval: 30s
  rules:
  # Fires when a scrape target belonging to one of the listed jobs has
  # reported up == 0 for 5 consecutive minutes.
  - alert: ServiceDown
    expr: 'up{job=~"identity-service|intake-service|finance-service|dataroom-service|legal-documents-service"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Service {{ $labels.job }} is down'
      description: 'Service {{ $labels.job }} has been down for more than 5 minutes'
  # Fires when a series of 5xx responses exceeds 0.05 requests per second
  # (5-minute rate) for 5 minutes. This is an absolute per-series rate, not
  # a fraction of total traffic.
  - alert: HighErrorRate
    expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.05'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High error rate for {{ $labels.job }}'
      description: 'Error rate is {{ $value }} errors per second'
  # Fires when the 95th-percentile request duration, estimated from the
  # histogram buckets over a 5-minute window, stays above 2 seconds for
  # 10 minutes.
  - alert: HighResponseTime
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High response time for {{ $labels.job }}'
      description: '95th percentile response time is {{ $value }} seconds'
# Process- and host-level resource alerts.
# Every rule in this group is evaluated once per 30 seconds.
#
# All three expressions yield a ratio in the 0-1 range, so the descriptions
# pipe $value through humanizePercentage; printing the raw value followed by
# a literal "%" would report e.g. "0.85%" for 85%.
- name: infrastructure
  interval: 30s
  rules:
  # Fires when a process consumes more than 0.8 CPU-seconds of user time per
  # second (5-minute rate), i.e. 80% of one core, for 10 minutes.
  - alert: HighCPUUsage
    expr: 'rate(process_cpu_user_seconds_total[5m]) > 0.8'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage for {{ $labels.job }}'
      description: 'CPU usage is {{ $value | humanizePercentage }}'
  # Fires when resident memory exceeds 90% of the process's virtual memory
  # size for 10 minutes.
  # NOTE(review): virtual memory size is address space, not a memory limit,
  # so this ratio is unlikely to track real memory pressure. Consider
  # dividing by the container/host memory limit instead - confirm which
  # limit metric is scraped in this environment before changing it.
  - alert: HighMemoryUsage
    expr: '(process_resident_memory_bytes / process_virtual_memory_bytes) > 0.9'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage for {{ $labels.job }}'
      description: 'Memory usage is {{ $value | humanizePercentage }}'
  # Fires when a filesystem has had less than 10% of its size available for
  # 5 minutes. Each filesystem series is evaluated separately, so the
  # description names the mount point that triggered the alert.
  # NOTE(review): assumes node_exporter filesystem metrics, which carry a
  # `mountpoint` label - confirm.
  - alert: DiskSpaceLow
    expr: '(node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Low disk space on {{ $labels.instance }}'
      description: 'Disk space is {{ $value | humanizePercentage }} available on {{ $labels.mountpoint }}'
# PostgreSQL alerts.
# Every rule in this group is evaluated once per 30 seconds.
- name: database
  interval: 30s
  rules:
  # Fires when the backends summed over all databases of an instance exceed
  # 80% of that instance's max_connections for 5 minutes.
  #
  # Both operands are aggregated down to the `instance` label before the
  # division. Binary operators only pair series whose label sets are
  # identical; a per-database backend count and an instance-wide setting do
  # not share label sets, so dividing the raw vectors returns nothing and
  # the alert could never fire.
  # NOTE(review): assumes postgres_exporter, where
  # pg_stat_database_numbackends carries per-database labels (datname) that
  # pg_settings_max_connections does not - confirm.
  - alert: DatabaseConnectionPoolExhausted
    expr: >-
      sum by (instance) (pg_stat_database_numbackends)
      / max by (instance) (pg_settings_max_connections) > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Database connection pool nearly exhausted'
      # The expression is a 0-1 ratio; humanizePercentage renders it as "NN%".
      description: '{{ $value | humanizePercentage }} of connections in use'
  # Fires when the mean statement execution time, averaged over 5 minutes,
  # stays above 1 for 10 minutes.
  #
  # avg_over_time is used because the metric is a mean (a gauge); rate() is
  # only meaningful for monotonically increasing counters and would report
  # how fast the mean is changing, not how slow queries are.
  # NOTE(review): PostgreSQL reports pg_stat_statements.mean_exec_time in
  # milliseconds. Confirm the exporter query converts it to seconds;
  # otherwise the threshold and the description are off by a factor of 1000.
  - alert: SlowQueries
    expr: 'avg_over_time(pg_stat_statements_mean_exec_time[5m]) > 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Slow database queries detected'
      description: 'Average query time is {{ $value }} seconds'
# Alerts on calls to Azure-hosted dependencies.
# Every rule in this group is evaluated once per 30 seconds.
- name: azure
  interval: 30s
  rules:
  # Fires when any Entra API request has been answered with HTTP 429 within
  # the 5-minute rate window, sustained for 1 minute.
  - alert: EntraAPIRateLimit
    expr: 'rate(entra_api_requests_total{status="429"}[5m]) > 0'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Entra API rate limit hit'
      description: 'Rate limit errors detected for Entra VerifiedID API'
  # Fires when storage errors exceed 0.01 per second (5-minute rate) for
  # 5 minutes.
  - alert: AzureStorageErrors
    expr: 'rate(azure_storage_errors_total[5m]) > 0.01'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Azure Storage errors detected'
      description: 'Storage error rate is {{ $value }} errors per second'