Files
dbis_core/monitoring/as4-alerts.yml

86 lines
2.6 KiB
YAML
Raw Permalink Normal View History

# Prometheus Alerting Rules for AS4 Settlement
#
# One rule group, evaluated every 30s. Each rule fires only after its `expr`
# has held continuously for the rule's `for` duration. `severity` is the only
# label attached; routing on it is configured outside this file.
groups:
  - name: as4_settlement
    interval: 30s
    rules:
      # High Latency Alert
      # Fires when the p99 latency gauge stays above 5 for 5 minutes
      # (the description reports the value in seconds).
      - alert: AS4HighLatency
        expr: as4_message_latency_p99 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 message processing latency is high"
          description: "P99 latency is {{ $value }}s (threshold: 5s)"

      # High Failure Rate Alert
      # rate() returns the per-second average increase over the 5m window, so
      # the 0.01 threshold is "failures per second", not a percentage. The
      # description states that unit explicitly.
      # NOTE(review): if a 1% failure *ratio* was intended, divide by the rate
      # of the total-instructions counter (metric name not visible here).
      - alert: AS4HighFailureRate
        expr: rate(as4_instructions_failed[5m]) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "AS4 instruction failure rate is high"
          description: "Failure rate is {{ $value }} failures/s (threshold: 0.01/s)"

      # Certificate Expiring Alert
      # Fires when fewer than 30 days remain and that has held for 1 hour.
      - alert: AS4CertificateExpiring
        expr: as4_certificate_days_until_expiry < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "AS4 certificate expiring soon"
          description: "Certificate expires in {{ $value }} days"

      # System Unavailable Alert
      # Fires when the scrape of job "as4-settlement" has failed for 1 minute.
      # NOTE(review): `up == 0` matches only while the target still exists; a
      # target that disappears entirely yields no series — confirm whether an
      # absent() check is wanted.
      - alert: AS4SystemUnavailable
        expr: up{job="as4-settlement"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AS4 Settlement system is down"
          description: "AS4 service is not responding"

      # Database Connection Alert
      # Fires when the connection-status gauge has been 0 for 1 minute.
      - alert: AS4DatabaseConnectionFailed
        expr: as4_database_connection_status == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AS4 database connection failed"
          description: "Cannot connect to database"

      # Redis Connection Alert
      # Fires when the connection-status gauge has been 0 for 1 minute.
      - alert: AS4RedisConnectionFailed
        expr: as4_redis_connection_status == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "AS4 Redis connection failed"
          description: "Cannot connect to Redis (nonce tracking may be affected)"

      # High Memory Usage Alert
      # Fires when memory usage stays above 80 (percent) for 5 minutes.
      - alert: AS4HighMemoryUsage
        expr: as4_memory_usage_percent > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 system memory usage is high"
          description: "Memory usage is {{ $value }}%"

      # Queue Backlog Alert
      # Fires when the queue holds more than 1000 instructions for 5 minutes.
      - alert: AS4QueueBacklog
        expr: as4_instruction_queue_length > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AS4 instruction queue backlog"
          description: "Queue length is {{ $value }} instructions"