---
# Prometheus alerting rules for the Sankofa platform, consumed by the
# Prometheus Operator (kube-prometheus) via the `prometheus: kube-prometheus`
# and `role: alert-rules` selector labels.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: sankofa-alerts
  namespace: monitoring
  labels:
    app: sankofa
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:
    # --- API service alerts ---
    - name: api
      interval: 30s
      rules:
        # Fires when >5% of API requests return 5xx over a 5-minute window.
        - alert: APIHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="api",status=~"5.."}[5m])) / sum(rate(http_requests_total{job="api"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API error rate is above 5%"
            description: "API error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

        # Fires when the p95 request latency exceeds 500ms.
        - alert: APIHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job="api"}[5m])) by (le) ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API p95 latency is above 500ms"
            description: "API p95 latency is {{ $value }}s"

        # Fires when the API scrape target reports down for 1 minute.
        - alert: APIDown
          expr: up{job="api"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "API is down"
            description: "API service has been down for more than 1 minute"

    # --- Portal (frontend) alerts ---
    - name: portal
      interval: 30s
      rules:
        # Fires when >5% of portal requests return 5xx over a 5-minute window.
        - alert: PortalHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="portal",status=~"5.."}[5m])) / sum(rate(http_requests_total{job="portal"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Portal error rate is above 5%"
            description: "Portal error rate is {{ $value | humanizePercentage }}"

        # Fires when the portal scrape target reports down for 1 minute.
        - alert: PortalDown
          expr: up{job="portal"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Portal is down"
            description: "Portal service has been down for more than 1 minute"

    # --- PostgreSQL alerts (postgres_exporter metrics) ---
    - name: database
      interval: 30s
      rules:
        # Fires when backend connections to the sankofa database exceed 80.
        - alert: DatabaseHighConnections
          expr: |
            pg_stat_database_numbackends{datname="sankofa"} > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database connection count is high"
            description: "Database has {{ $value }} active connections"

        # NOTE(review): this expr counts concurrently *active* queries (>10
        # for 5m); it does not measure per-query duration, so the description's
        # "running for more than 5 minutes" overstates what is checked —
        # confirm intent or switch to a max-query-duration metric.
        - alert: DatabaseSlowQueries
          expr: |
            pg_stat_activity_count{state="active"} > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database has slow queries"
            description: "Database has {{ $value }} active queries running for more than 5 minutes"

        # Fires when the exporter reports PostgreSQL unreachable for 1 minute.
        - alert: DatabaseDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Database is down"
            description: "PostgreSQL database is not responding"

    # --- Keycloak (authentication) alerts ---
    - name: keycloak
      interval: 30s
      rules:
        # Fires when the Keycloak scrape target reports down for 1 minute.
        - alert: KeycloakDown
          expr: up{job="keycloak"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Keycloak is down"
            description: "Keycloak authentication service is down"

        # Fires when login failures exceed 10/s averaged over 5 minutes.
        - alert: KeycloakHighAuthFailures
          expr: |
            sum(rate(keycloak_login_failures_total[5m])) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High authentication failure rate"
            description: "Keycloak has {{ $value }} authentication failures per second"

    # --- Cluster infrastructure alerts (cAdvisor / kube-state-metrics / node-exporter) ---
    - name: infrastructure
      interval: 30s
      rules:
        # NOTE(review): rate(container_cpu_usage_seconds_total) is CPU cores
        # consumed, not a 0–1 utilization fraction, so `1 - avg(...)` is not
        # "fraction idle" as the description implies — verify against the
        # intended per-node or per-limit utilization formula.
        - alert: HighCPUUsage
          expr: |
            (1 - avg(rate(container_cpu_usage_seconds_total{container!="POD"}[5m]))) < 0.1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage"
            description: "CPU usage is above 90% for 10 minutes"

        # Fires when a container's working set leaves <10% headroom against
        # its memory limit for 10 minutes.
        - alert: HighMemoryUsage
          expr: |
            (1 - (container_memory_working_set_bytes{container!="POD"} / container_spec_memory_limit_bytes)) < 0.1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High memory usage"
            description: "Memory usage is above 90% for 10 minutes"

        # Fires when any container restart is observed within a 15m window,
        # sustained for 5 minutes.
        - alert: PodCrashLooping
          expr: |
            rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod is crash looping"
            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping"

        # Fires when the root filesystem has <10% free space for 5 minutes.
        - alert: DiskSpaceLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space is low"
            description: "Disk space is below 10% on {{ $labels.instance }}"

    # --- Backup freshness alerts (evaluated hourly) ---
    - name: backups
      interval: 1h
      rules:
        # Fires when the last successful backup is older than 24h (86400s).
        - alert: BackupFailed
          expr: |
            time() - backup_last_success_timestamp > 86400
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup has not run in 24 hours"
            description: "Last successful backup was more than 24 hours ago"

        # Escalation: last successful backup older than 48h (172800s).
        - alert: BackupTooOld
          expr: |
            time() - backup_last_success_timestamp > 172800
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup is more than 48 hours old"
            description: "Last successful backup was {{ $value }} seconds ago"