Apply Composer changes: comprehensive API updates, migrations, middleware, and infrastructure improvements
- Add comprehensive database migrations (001-024) for schema evolution - Enhance API schema with expanded type definitions and resolvers - Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth - Implement new services: AI optimization, billing, blockchain, compliance, marketplace - Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage) - Update Crossplane provider with enhanced VM management capabilities - Add comprehensive test suite for API endpoints and services - Update frontend components with improved GraphQL subscriptions and real-time updates - Enhance security configurations and headers (CSP, CORS, etc.) - Update documentation and configuration files - Add new CI/CD workflows and validation scripts - Implement design system improvements and UI enhancements
This commit is contained in:
207
gitops/apps/monitoring/alert-rules.yaml
Normal file
207
gitops/apps/monitoring/alert-rules.yaml
Normal file
@@ -0,0 +1,207 @@
|
||||
# Alerting rules for the Sankofa stack, expressed as a Prometheus Operator
# PrometheusRule resource.
# NOTE(review): the metadata labels below are presumably what the Prometheus
# ruleSelector matches on - confirm against the Prometheus custom resource.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: sankofa-alerts
  namespace: monitoring
  labels:
    app: sankofa
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:
    - name: api
      interval: 30s
      rules:
        # API High Error Rate: share of 5xx responses over all responses.
        - alert: APIHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="api",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="api"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API error rate is above 5%"
            description: "API error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

        # API High Latency: p95 of the request-duration histogram.
        - alert: APIHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{job="api"}[5m])) by (le)
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API p95 latency is above 500ms"
            description: "API p95 latency is {{ $value }}s"

        # API Down
        - alert: APIDown
          expr: up{job="api"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "API is down"
            description: "API service has been down for more than 1 minute"

    - name: portal
      interval: 30s
      rules:
        # Portal High Error Rate: share of 5xx responses over all responses.
        - alert: PortalHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="portal",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="portal"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Portal error rate is above 5%"
            description: "Portal error rate is {{ $value | humanizePercentage }}"

        # Portal Down
        - alert: PortalDown
          expr: up{job="portal"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Portal is down"
            description: "Portal service has been down for more than 1 minute"

    - name: database
      interval: 30s
      rules:
        # Database High Connection Count
        - alert: DatabaseHighConnections
          expr: |
            pg_stat_database_numbackends{datname="sankofa"} > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database connection count is high"
            description: "Database has {{ $value }} active connections"

        # Database Slow Queries
        # The expression counts backends in the "active" state; it does not
        # measure how long any individual query has been running, so the
        # annotations describe sustained query concurrency instead.
        - alert: DatabaseSlowQueries
          expr: |
            pg_stat_activity_count{state="active"} > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database has a sustained high number of active queries"
            description: "Database has had {{ $value }} active queries (threshold 10) for 5 minutes; check for slow or blocked queries"

        # Database Down
        - alert: DatabaseDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Database is down"
            description: "PostgreSQL database is not responding"

    - name: keycloak
      interval: 30s
      rules:
        # Keycloak Down
        - alert: KeycloakDown
          expr: up{job="keycloak"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Keycloak is down"
            description: "Keycloak authentication service is down"

        # Keycloak High Authentication Failures
        - alert: KeycloakHighAuthFailures
          expr: |
            sum(rate(keycloak_login_failures_total[5m])) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High authentication failure rate"
            description: "Keycloak has {{ $value }} authentication failures per second"

    - name: infrastructure
      interval: 30s
      rules:
        # High CPU Usage
        # Per-container CPU usage (cores) divided by the container's CPU limit
        # (quota / period, in cores). Containers without a CPU limit export no
        # quota series and therefore drop out of the division instead of
        # producing a bogus ratio.
        - alert: HighCPUUsage
          expr: |
            sum by (namespace, pod, container) (
              rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m])
            )
            /
            sum by (namespace, pod, container) (
              container_spec_cpu_quota{container!="",container!="POD"}
              /
              container_spec_cpu_period{container!="",container!="POD"}
            )
            > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage"
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has used {{ $value | humanizePercentage }} of its CPU limit for 10 minutes"

        # High Memory Usage
        # The "> 0" filter removes containers with no memory limit (reported
        # as 0); dividing by 0 would yield +Inf and fire the alert permanently.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{container!="",container!="POD"}
              /
              (container_spec_memory_limit_bytes{container!="",container!="POD"} > 0)
            ) > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High memory usage"
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has used {{ $value | humanizePercentage }} of its memory limit for 10 minutes"

        # Pod CrashLooping
        - alert: PodCrashLooping
          expr: |
            rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod is crash looping"
            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping"

        # Disk Space Low: less than 10% of the root filesystem is available.
        - alert: DiskSpaceLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space is low"
            description: "Disk space is below 10% on {{ $labels.instance }}"

    # NOTE(review): both backup alerts compare against the
    # backup_last_success_timestamp series. If no exporter publishes that
    # series the expressions return no data and the alerts can never fire -
    # confirm that the backup job (or a sidecar/pushgateway) exports it.
    - name: backups
      interval: 1h
      rules:
        # Backup Failed: no successful backup within the last 24 hours.
        - alert: BackupFailed
          expr: |
            time() - backup_last_success_timestamp > 86400
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup has not run in 24 hours"
            description: "Last successful backup was more than 24 hours ago"

        # Backup Too Old: no successful backup within the last 48 hours.
        - alert: BackupTooOld
          expr: |
            time() - backup_last_success_timestamp > 172800
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup is more than 48 hours old"
            description: "Last successful backup was {{ $value }} seconds ago"
|
||||
|
||||
@@ -40,7 +40,7 @@ spec:
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
||||
hosts:
|
||||
- grafana.yourdomain.com
|
||||
- grafana.sankofa.nexus
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 10Gi
|
||||
|
||||
81
gitops/apps/monitoring/backup-cronjob.yaml
Normal file
81
gitops/apps/monitoring/backup-cronjob.yaml
Normal file
@@ -0,0 +1,81 @@
|
||||
# Nightly logical backup of the PostgreSQL database to a persistent volume.
# The job dumps the database with pg_dump, gzips the dump, and prunes
# compressed dumps older than 7 days.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
  namespace: api
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  # The backup volume is ReadWriteOnce, so never let two runs overlap.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: postgres-backup
              image: postgres:14-alpine
              command:
                - /bin/bash
                - -c
                - |
                  # Abort on any error, on unset variables, and on pipeline failures.
                  set -euo pipefail
                  BACKUP_DIR="/backups/postgres"
                  DB_NAME="${DB_NAME:-sankofa}"
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.sql"

                  # pg_dump only reads the password from PGPASSWORD (or ~/.pgpass);
                  # DB_PASSWORD on its own is ignored and authentication would fail.
                  export PGPASSWORD="${DB_PASSWORD}"

                  # Remove a partial uncompressed dump if a step fails. After a
                  # successful gzip the .sql file no longer exists, so this is a no-op.
                  trap 'rm -f "$BACKUP_FILE"' EXIT

                  mkdir -p "$BACKUP_DIR"

                  echo "Starting backup..."
                  pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
                    -F p -f "$BACKUP_FILE"

                  echo "Compressing backup..."
                  gzip "$BACKUP_FILE"

                  echo "Cleaning up backups older than 7 days..."
                  find "$BACKUP_DIR" -name "${DB_NAME}_*.sql.gz" -type f -mtime +7 -delete

                  echo "Backup completed: ${BACKUP_FILE}.gz"
              env:
                # Connection settings; host, username and password come from
                # the db-credentials Secret.
                - name: DB_HOST
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: host
                - name: DB_PORT
                  value: "5432"
                - name: DB_USER
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: username
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: password
                - name: DB_NAME
                  value: "sankofa"
              volumeMounts:
                - name: backup-storage
                  mountPath: /backups
          restartPolicy: OnFailure
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: postgres-backup-pvc
|
||||
---
|
||||
# Persistent storage claimed by the postgres-backup CronJob for its dumps.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: api
  name: postgres-backup-pvc
spec:
  storageClassName: standard
  # Single-node read/write access.
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Gi
|
||||
|
||||
35
gitops/apps/monitoring/grafana-dashboards.yaml
Normal file
35
gitops/apps/monitoring/grafana-dashboards.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
# Grafana dashboard shipped as a ConfigMap.
# NOTE(review): the grafana_dashboard label is presumably what the Grafana
# dashboard sidecar watches for - confirm against the Grafana Helm values.
#
# File/sidecar-provisioned dashboards must contain the bare dashboard model.
# The {"dashboard": {...}} envelope is only the HTTP API payload shape and
# leaves a provisioned dashboard without a title, so the model is stored
# unwrapped here. Each panel also carries the id, type and gridPos Grafana
# needs to render it.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
data:
  sankofa-overview.json: |
    {
      "uid": "sankofa-overview",
      "title": "Sankofa Phoenix Overview",
      "panels": [
        {
          "id": 1,
          "type": "timeseries",
          "title": "API Request Rate",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum by (method, status) (rate(http_requests_total[5m]))",
              "legendFormat": "{{method}} {{status}}"
            }
          ]
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "Database Connections",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "pg_stat_database_numbackends",
              "legendFormat": "{{datname}}"
            }
          ]
        }
      ]
    }
|
||||
|
||||
50
gitops/apps/monitoring/prometheus-config.yaml
Normal file
50
gitops/apps/monitoring/prometheus-config.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# Prometheus server configuration delivered as a ConfigMap.
# NOTE(review): the first job is named 'sankofa-api', while the alert rules in
# alert-rules.yaml select job="api" - confirm which job label is intended.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    scrape_configs:
      # Sankofa API pods, discovered in the "sankofa" namespace.
      - job_name: 'sankofa-api'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - sankofa
        relabel_configs:
          # Keep only pods whose "app" label equals sankofa-api.
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: 'sankofa-api'
          # Scrape each pod on its own IP, port 4000.
          - action: replace
            source_labels: [__meta_kubernetes_pod_ip]
            target_label: __address__
            replacement: '$1:4000'
          # Copy every pod label onto the scraped target.
          - action: labelmap
            regex: '__meta_kubernetes_pod_label_(.+)'

      # Any pod that opts in through prometheus.io/* annotations.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Keep only pods annotated prometheus.io/scrape: "true".
          - action: keep
            source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            regex: 'true'
          # Use the prometheus.io/path annotation as the metrics path when set.
          - action: replace
            source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            regex: '(.+)'
            target_label: __metrics_path__
          # Rewrite the target address to use the prometheus.io/port annotation.
          - action: replace
            source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
|
||||
|
||||
Reference in New Issue
Block a user