---
# Prometheus alerting rules for SolaceNet.
#
# Three rule groups, each evaluated every 30s:
#   - solacenet_capabilities: capability state, kill switch, policy latency
#   - solacenet_risk: risk score threshold and risk engine availability
#   - solacenet_infrastructure: availability of Redis and the API gateway
#
# Alerts carry a `severity` label of either `warning` or `critical`.
groups:
  - name: solacenet_capabilities
    interval: 30s
    rules:
      # Fires when a capability has reported the "disabled" state for 5m.
      - alert: CapabilityDisabled
        expr: solacenet_capability_state{state="disabled"} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Capability {{ $labels.capability_id }} is disabled"
          description: "Capability {{ $labels.capability_id }} has been disabled for {{ $labels.tenant_id }}"

      # No `for:` clause: fires on the first evaluation that sees the
      # activation counter increase within the trailing 5m window.
      - alert: KillSwitchActivated
        expr: increase(solacenet_kill_switch_activations_total[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Kill switch activated for {{ $labels.capability_id }}"
          description: "Emergency kill switch was activated for capability {{ $labels.capability_id }}"

      # Histogram buckets are cumulative counters, so they are wrapped in
      # rate() to estimate the quantile over recent observations rather than
      # over the whole process lifetime. The quantile is computed per label
      # set (everything except `le`).
      # NOTE(review): the 5m rate window was chosen to match `for: 5m`;
      # confirm it suits the scrape interval.
      - alert: HighPolicyDecisionLatency
        expr: >-
          histogram_quantile(0.95,
          rate(solacenet_policy_decision_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High policy decision latency"
          description: "95th percentile policy decision latency is {{ $value }}s"

  - name: solacenet_risk
    interval: 30s
    rules:
      # Fires when a risk score series stays above 80 for 2m.
      # NOTE(review): the description reads a `transaction_id` label; a
      # per-transaction label is presumably high-cardinality — confirm the
      # exporter really exposes it.
      - alert: HighRiskScore
        expr: solacenet_risk_score > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High risk score detected"
          description: "Risk score of {{ $value }} detected for transaction {{ $labels.transaction_id }}"

      # Fires when the scrape of job "risk-engine" has failed for 1m.
      - alert: RiskEngineDown
        expr: up{job="risk-engine"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Risk engine is down"
          description: "Risk rules engine is not responding"

  - name: solacenet_infrastructure
    interval: 30s
    rules:
      # Fires when the scrape of job "redis" has failed for 1m.
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis cache is not available, policy decisions will not be cached"

      # Fires when the scrape of job "solacenet-gateway" has failed for 1m.
      - alert: GatewayDown
        expr: up{job="solacenet-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "SolaceNet Gateway is down"
          description: "The SolaceNet API Gateway is not responding"