Files

74 lines
2.3 KiB
YAML
Raw Permalink Normal View History

# Prometheus alerting rules for SolaceNet
groups:
# Capability-state, kill-switch and policy-latency alerts.
# Rules in this group are evaluated every 30s.
- name: solacenet_capabilities
  interval: 30s
  rules:
  # Fires once a series labelled state="disabled" has stayed above 0 for
  # 5 minutes.
  - alert: CapabilityDisabled
    expr: solacenet_capability_state{state="disabled"} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Capability {{ $labels.capability_id }} is disabled"
      description: "Capability {{ $labels.capability_id }} has been disabled for {{ $labels.tenant_id }}"
  # No `for:` clause: fires on the first evaluation that sees the activation
  # counter increase within the trailing 5-minute window.
  - alert: KillSwitchActivated
    expr: increase(solacenet_kill_switch_activations_total[5m]) > 0
    labels:
      severity: critical
    annotations:
      summary: "Kill switch activated for {{ $labels.capability_id }}"
      description: "Emergency kill switch was activated for capability {{ $labels.capability_id }}"
  # histogram_quantile() must be fed per-second bucket rates. Applied to the
  # raw cumulative `_bucket` counters it yields the p95 over the whole process
  # lifetime, which barely reacts to a recent slowdown. rate() over 5m makes
  # the quantile reflect recent decisions only.
  - alert: HighPolicyDecisionLatency
    expr: histogram_quantile(0.95, rate(solacenet_policy_decision_duration_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High policy decision latency"
      description: "95th percentile policy decision latency is {{ $value }}s"
# Risk-score and risk-engine availability alerts.
# Rules in this group are evaluated every 30s.
- name: solacenet_risk
  interval: 30s
  rules:
  # Fires once a risk score series has stayed above 80 for 2 minutes.
  # NOTE(review): the description reads a `transaction_id` label; a
  # per-transaction label is potentially high-cardinality - confirm intended.
  - alert: HighRiskScore
    expr: solacenet_risk_score > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High risk score detected"
      description: "Risk score of {{ $value }} detected for transaction {{ $labels.transaction_id }}"
  # `up == 0` only matches targets Prometheus is still trying to scrape.
  # `absent()` additionally covers the case where no `up` series exists for
  # the job at all, which would otherwise leave this alert silent.
  - alert: RiskEngineDown
    expr: up{job="risk-engine"} == 0 or absent(up{job="risk-engine"})
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Risk engine is down"
      description: "Risk rules engine is not responding"
# Availability alerts for the Redis and gateway scrape jobs.
# Rules in this group are evaluated every 30s.
- name: solacenet_infrastructure
  interval: 30s
  rules:
  # `up == 0` only matches targets Prometheus is still trying to scrape.
  # `absent()` additionally covers the case where no `up` series exists for
  # the job at all, which would otherwise leave this alert silent.
  - alert: RedisDown
    expr: up{job="redis"} == 0 or absent(up{job="redis"})
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis cache is not available, policy decisions will not be cached"
  # Same pattern as RedisDown: failed scrapes or a missing `up` series for
  # the gateway job, sustained for 1 minute.
  - alert: GatewayDown
    expr: up{job="solacenet-gateway"} == 0 or absent(up{job="solacenet-gateway"})
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "SolaceNet Gateway is down"
      description: "The SolaceNet API Gateway is not responding"