Apply Composer changes: comprehensive API updates, migrations, middleware, and infrastructure improvements

- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
2025-12-12 18:01:35 -08:00
parent e01131efaf
commit 9daf1fd378
968 changed files with 160890 additions and 1092 deletions
--- a/gitops/apps/monitoring/alert-rules.yaml
+++ b/gitops/apps/monitoring/alert-rules.yaml
@@ -0,0 +1,207 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # alerting rules, organised into the groups listed under spec.groups
+metadata:
+  name: sankofa-alerts
+  namespace: monitoring
+  labels:
+    app: sankofa
+    prometheus: kube-prometheus  # NOTE(review): presumably matched by the Prometheus CR's ruleSelector — confirm
+    role: alert-rules  # NOTE(review): presumably also part of the ruleSelector — confirm
+spec:
+  groups:  # one group per subsystem: api, portal, database, keycloak, infrastructure, backups
+    - name: api
+      interval: 30s
+      rules:
+        # Ratio of 5xx responses to all responses for job "api" over 5 minutes, threshold 5%
+        - alert: APIHighErrorRate
+          expr: |
+            sum(rate(http_requests_total{job="api",status=~"5.."}[5m])) 
+            / 
+            sum(rate(http_requests_total{job="api"}[5m])) > 0.05
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'API error rate is above 5%'
+            description: 'API error rate is {{ $value | humanizePercentage }} for the last 5 minutes'
+
+        # 95th-percentile request duration for job "api", threshold 0.5 seconds
+        - alert: APIHighLatency
+          expr: |
+            histogram_quantile(0.95, 
+              sum(rate(http_request_duration_seconds_bucket{job="api"}[5m])) by (le)
+            ) > 0.5
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'API p95 latency is above 500ms'
+            description: 'API p95 latency is {{ $value }}s'
+
+        # A scrape target of job "api" has reported up == 0 for one minute
+        - alert: APIDown
+          expr: up{job="api"} == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'API is down'
+            description: 'API service has been down for more than 1 minute'
+
+    - name: portal
+      interval: 30s
+      rules:
+        # Ratio of 5xx responses to all responses for job "portal" over 5 minutes, threshold 5%
+        - alert: PortalHighErrorRate
+          expr: |
+            sum(rate(http_requests_total{job="portal",status=~"5.."}[5m])) 
+            / 
+            sum(rate(http_requests_total{job="portal"}[5m])) > 0.05
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Portal error rate is above 5%'
+            description: 'Portal error rate is {{ $value | humanizePercentage }}'
+
+        # A scrape target of job "portal" has reported up == 0 for one minute
+        - alert: PortalDown
+          expr: up{job="portal"} == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Portal is down'
+            description: 'Portal service has been down for more than 1 minute'
+
+    - name: database
+      interval: 30s
+      rules:
+        # Backends connected to the sankofa database; 80 is a fixed threshold (NOTE(review): presumably sized against max_connections — confirm)
+        - alert: DatabaseHighConnections
+          expr: |
+            pg_stat_database_numbackends{datname="sankofa"} > 80
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Database connection count is high"
+            description: "Database has {{ $value }} open connections"
+
+        # Longest active transaction exceeds 5 minutes (300s); a count of active sessions says nothing about query duration
+        - alert: DatabaseSlowQueries
+          expr: |
+            pg_stat_activity_max_tx_duration{state="active"} > 300
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Database has slow queries"
+            description: "Longest active transaction on {{ $labels.datname }} has been running for {{ $value | humanizeDuration }}"
+
+        # The exporter reports pg_up == 0, i.e. it cannot reach PostgreSQL
+        - alert: DatabaseDown
+          expr: pg_up == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Database is down"
+            description: "PostgreSQL database is not responding"
+
+    - name: keycloak
+      interval: 30s
+      rules:
+        # A scrape target of job "keycloak" has reported up == 0 for one minute
+        - alert: KeycloakDown
+          expr: up{job="keycloak"} == 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Keycloak is down'
+            description: 'Keycloak authentication service is down'
+
+        # Per-second login failure rate over 5 minutes, summed across all series, threshold 10/s
+        - alert: KeycloakHighAuthFailures
+          expr: |
+            sum(rate(keycloak_login_failures_total[5m])) > 10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'High authentication failure rate'
+            description: 'Keycloak has {{ $value }} authentication failures per second'
+
+    - name: infrastructure
+      interval: 30s
+      rules:
+        # Per-node CPU utilisation = 1 - idle fraction; container CPU-seconds rates are cores, not a fraction, so they cannot express "90%"
+        - alert: HighCPUUsage
+          expr: |
+            (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.9
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "High CPU usage"
+            description: "CPU usage on {{ $labels.instance }} has been above 90% for 10 minutes (currently {{ $value | humanizePercentage }})"
+
+        # Working set as a share of the memory limit; "> 0" drops containers without a limit, whose zero limit would otherwise divide to +Inf and always fire
+        - alert: HighMemoryUsage
+          expr: |
+            (container_memory_working_set_bytes{container!="POD"} / (container_spec_memory_limit_bytes{container!="POD"} > 0)) > 0.9
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "High memory usage"
+            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has used {{ $value | humanizePercentage }} of its memory limit for 10 minutes"
+
+        # Any container restart within the 15-minute window, sustained for 5 minutes
+        - alert: PodCrashLooping
+          expr: |
+            rate(kube_pod_container_status_restarts_total[15m]) > 0
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod is crash looping"
+            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping"
+
+        # Less than 10% of the root filesystem ("/") is available
+        - alert: DiskSpaceLow
+          expr: |
+            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Disk space is low"
+            description: "Disk space is below 10% on {{ $labels.instance }}"
+
+    - name: backups
+      interval: 1h
+      rules:
+        # Last success older than 86400s (24h). NOTE(review): if backup_last_success_timestamp is never exported this cannot fire — confirm a producer exists
+        - alert: BackupFailed
+          expr: |
+            time() - backup_last_success_timestamp > 86400
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Backup has not run in 24 hours'
+            description: 'Last successful backup was more than 24 hours ago'
+
+        # Last success older than 172800s (48h); $value is the age in seconds
+        - alert: BackupTooOld
+          expr: |
+            time() - backup_last_success_timestamp > 172800
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Backup is more than 48 hours old'
+            description: 'Last successful backup was {{ $value }} seconds ago'
+
--- a/gitops/apps/monitoring/application.yaml
+++ b/gitops/apps/monitoring/application.yaml
@@ -40,7 +40,7 @@ spec:
            annotations:
              cert-manager.io/cluster-issuer: letsencrypt-prod
            hosts:
-              - grafana.yourdomain.com
+              - grafana.sankofa.nexus
          persistence:
            enabled: true
            size: 10Gi
--- a/gitops/apps/monitoring/backup-cronjob.yaml
+++ b/gitops/apps/monitoring/backup-cronjob.yaml
@@ -0,0 +1,81 @@
+apiVersion: batch/v1
+kind: CronJob  # runs pg_dump on a schedule, gzips the dump onto a PVC and prunes old archives
+metadata:
+  name: postgres-backup
+  namespace: api
+spec:
+  schedule: "0 2 * * *"  # Daily at 2 AM
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: postgres-backup
+            image: postgres:14-alpine
+            command:
+            - /bin/bash
+            - -c
+            - |
+              set -e
+              BACKUP_DIR="/backups/postgres"
+              DB_NAME="${DB_NAME:-sankofa}"
+              TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+              BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.sql"
+
+              mkdir -p "$BACKUP_DIR"
+
+              echo "Starting backup..."
+              pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
+                -F p -f "$BACKUP_FILE"
+
+              echo "Compressing backup..."
+              gzip "$BACKUP_FILE"
+
+              echo "Cleaning up backups older than 7 days..."
+              find "$BACKUP_DIR" -name "${DB_NAME}_*.sql.gz" -type f -mtime +7 -delete
+
+              echo "Backup completed: ${BACKUP_FILE}.gz"
+            env:
+            - name: DB_HOST
+              valueFrom:
+                secretKeyRef:
+                  name: db-credentials
+                  key: host
+            - name: DB_PORT
+              value: "5432"
+            - name: DB_USER
+              valueFrom:
+                secretKeyRef:
+                  name: db-credentials
+                  key: username
+            - name: PGPASSWORD  # pg_dump takes the password from libpq's PGPASSWORD; the script never passed DB_PASSWORD on
+              valueFrom:
+                secretKeyRef:
+                  name: db-credentials
+                  key: password
+            - name: DB_NAME
+              value: "sankofa"
+            volumeMounts:
+            - name: backup-storage
+              mountPath: /backups
+          restartPolicy: OnFailure
+          volumes:
+          - name: backup-storage
+            persistentVolumeClaim:
+              claimName: postgres-backup-pvc
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # the claim mounted by the postgres-backup CronJob at /backups
+metadata:
+  name: postgres-backup-pvc
+  namespace: api
+spec:
+  storageClassName: standard
+  resources:
+    requests:
+      storage: 100Gi
+  accessModes:
+  - ReadWriteOnce
+
--- a/gitops/apps/monitoring/grafana-dashboards.yaml
+++ b/gitops/apps/monitoring/grafana-dashboards.yaml
@@ -0,0 +1,33 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"  # NOTE(review): presumably the label the Grafana dashboard sidecar selects on — confirm
+data:  # each value is a bare dashboard model; the {"dashboard": ...} envelope belongs to the HTTP API and leaves a provisioned dashboard without a title
+  sankofa-overview.json: |
+    {
+      "title": "Sankofa Phoenix Overview",
+      "panels": [
+        {
+          "title": "API Request Rate",
+          "targets": [
+            {
+              "expr": "rate(http_requests_total[5m])",
+              "legendFormat": "{{method}} {{status}}"
+            }
+          ]
+        },
+        {
+          "title": "Database Connections",
+          "targets": [
+            {
+              "expr": "pg_stat_database_numbackends",
+              "legendFormat": "{{datname}}"
+            }
+          ]
+        }
+      ]
+    }
+
--- a/gitops/apps/monitoring/prometheus-config.yaml
+++ b/gitops/apps/monitoring/prometheus-config.yaml
@@ -0,0 +1,50 @@
+apiVersion: v1
+kind: ConfigMap  # carries a complete prometheus.yml as a single literal string
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:  # defines scrape jobs 'sankofa-api' (pods labelled app=sankofa-api in namespace sankofa, port 4000) and 'kubernetes-pods' (annotation-driven). NOTE(review): alert rules that select job="api" will not match these job names — confirm
+  prometheus.yml: |
+    global:
+      scrape_interval: 15s
+      evaluation_interval: 15s
+    
+    scrape_configs:
+      - job_name: 'sankofa-api'
+        kubernetes_sd_configs:
+          - role: pod
+            namespaces:
+              names:
+                - sankofa
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app]
+            action: keep
+            regex: sankofa-api
+          - source_labels: [__meta_kubernetes_pod_ip]
+            action: replace
+            target_label: __address__
+            replacement: $1:4000
+          - action: labelmap
+            regex: __meta_kubernetes_pod_label_(.+)
+      
+      - job_name: 'kubernetes-pods'
+        kubernetes_sd_configs:
+          - role: pod
+        relabel_configs:
+          - action: keep
+            regex: true
+            source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+          - action: replace
+            regex: (.+)
+            source_labels:
+              - __meta_kubernetes_pod_annotation_prometheus_io_path
+            target_label: __metrics_path__
+          - action: replace
+            regex: ([^:]+)(?::\d+)?;(\d+)
+            replacement: $1:$2
+            source_labels:
+              - __address__
+              - __meta_kubernetes_pod_annotation_prometheus_io_port
+            target_label: __address__
+