Apply Composer changes: comprehensive API updates, migrations, middleware, and infrastructure improvements

- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
This commit is contained in:
defiQUG
2025-12-12 18:01:35 -08:00
parent e01131efaf
commit 9daf1fd378
968 changed files with 160890 additions and 1092 deletions

View File

@@ -0,0 +1,207 @@
# Alerting rules for the Sankofa platform, packaged as a Prometheus Operator
# PrometheusRule. Groups: api, portal, database, keycloak, infrastructure,
# backups.
# NOTE(review): the `prometheus` / `role` labels are presumably what the
# operator's ruleSelector matches on - confirm against the Prometheus CR.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: sankofa-alerts
  namespace: monitoring
  labels:
    app: sankofa
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:
    - name: api
      interval: 30s
      rules:
        # API High Error Rate: share of 5xx responses over all responses.
        - alert: APIHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="api",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="api"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API error rate is above 5%"
            description: "API error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
        # API High Latency: p95 computed from the request-duration histogram.
        - alert: APIHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{job="api"}[5m])) by (le)
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API p95 latency is above 500ms"
            description: "API p95 latency is {{ $value }}s"
        # API Down
        - alert: APIDown
          expr: up{job="api"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "API is down"
            description: "API service has been down for more than 1 minute"
    - name: portal
      interval: 30s
      rules:
        # Portal High Error Rate
        - alert: PortalHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="portal",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="portal"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Portal error rate is above 5%"
            description: "Portal error rate is {{ $value | humanizePercentage }}"
        # Portal Down
        - alert: PortalDown
          expr: up{job="portal"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Portal is down"
            description: "Portal service has been down for more than 1 minute"
    - name: database
      interval: 30s
      rules:
        # Database High Connection Count
        - alert: DatabaseHighConnections
          expr: |
            pg_stat_database_numbackends{datname="sankofa"} > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database connection count is high"
            description: "Database has {{ $value }} active connections"
        # Database Slow Queries
        # Uses the age (seconds) of the oldest active transaction. The previous
        # expression counted active connections, which says nothing about how
        # long any query has been running.
        - alert: DatabaseSlowQueries
          expr: |
            pg_stat_activity_max_tx_duration{state="active"} > 300
          # The 5-minute threshold lives in the expression; `for` only debounces.
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Database has slow queries"
            description: "Longest active transaction on {{ $labels.datname }} has been running for {{ $value | humanizeDuration }}"
        # Database Down
        - alert: DatabaseDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Database is down"
            description: "PostgreSQL database is not responding"
    - name: keycloak
      interval: 30s
      rules:
        # Keycloak Down
        - alert: KeycloakDown
          expr: up{job="keycloak"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Keycloak is down"
            description: "Keycloak authentication service is down"
        # Keycloak High Authentication Failures
        - alert: KeycloakHighAuthFailures
          expr: |
            sum(rate(keycloak_login_failures_total[5m])) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High authentication failure rate"
            description: "Keycloak has {{ $value }} authentication failures per second"
    - name: infrastructure
      interval: 30s
      rules:
        # High CPU Usage
        # CPU cores consumed per container divided by its CPU limit
        # (quota / period). The previous expression averaged raw core usage
        # across all containers, which is not a utilisation fraction.
        # Containers without a CPU limit are not covered by this alert.
        - alert: HighCPUUsage
          expr: |
            sum by (namespace, pod, container) (
              rate(container_cpu_usage_seconds_total{container!="POD",container!=""}[5m])
            )
            /
            sum by (namespace, pod, container) (
              container_spec_cpu_quota{container!="POD",container!=""}
              / container_spec_cpu_period{container!="POD",container!=""}
            ) > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage"
            description: "Container {{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} has used {{ $value | humanizePercentage }} of its CPU limit for 10 minutes"
        # High Memory Usage
        # `> 0` drops containers without a memory limit: their limit metric is
        # 0, so the ratio would be +Inf and the alert would fire permanently.
        - alert: HighMemoryUsage
          expr: |
            container_memory_working_set_bytes{container!="POD",container!=""}
            /
            (container_spec_memory_limit_bytes{container!="POD",container!=""} > 0)
            > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High memory usage"
            description: "Container {{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} has used {{ $value | humanizePercentage }} of its memory limit for 10 minutes"
        # Pod CrashLooping: any restart within the 15m window keeps this true.
        - alert: PodCrashLooping
          expr: |
            rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod is crash looping"
            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping"
        # Disk Space Low: free fraction of the root filesystem.
        - alert: DiskSpaceLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space is low"
            description: "Disk space is below 10% on {{ $labels.instance }}"
    - name: backups
      interval: 1h
      rules:
        # NOTE(review): both rules stay silent if backup_last_success_timestamp
        # is never exported - confirm something actually publishes it.
        # Backup Failed: 86400s = 24h since the last successful backup.
        - alert: BackupFailed
          expr: |
            time() - backup_last_success_timestamp > 86400
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup has not run in 24 hours"
            description: "Last successful backup was more than 24 hours ago"
        # Backup Too Old: 172800s = 48h since the last successful backup.
        - alert: BackupTooOld
          expr: |
            time() - backup_last_success_timestamp > 172800
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup is more than 48 hours old"
            description: "Last successful backup was {{ $value }} seconds ago"

View File

@@ -40,7 +40,7 @@ spec:
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- grafana.yourdomain.com
- grafana.sankofa.nexus
persistence:
enabled: true
size: 10Gi

View File

@@ -0,0 +1,81 @@
# Nightly logical backup of the PostgreSQL database: pg_dump to a plain SQL
# file on the backup volume, gzip it, then prune archives older than 7 days.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
  namespace: api
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: postgres-backup
              image: postgres:14-alpine
              command:
                - /bin/bash
                - -c
                - |
                  # Fail on errors, unset variables and broken pipelines.
                  set -euo pipefail

                  # pg_dump takes its password from PGPASSWORD (or ~/.pgpass),
                  # not from DB_PASSWORD; without this export the job cannot
                  # authenticate non-interactively.
                  export PGPASSWORD="$DB_PASSWORD"

                  BACKUP_DIR="/backups/postgres"
                  DB_NAME="${DB_NAME:-sankofa}"
                  TIMESTAMP=$(date +%Y%m%d_%H%M%S)
                  BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${TIMESTAMP}.sql"

                  # Remove a partial, uncompressed dump if anything fails. After
                  # a successful gzip the .sql file no longer exists, so this
                  # is a no-op. The retention sweep below only matches
                  # *.sql.gz and would never clean such leftovers.
                  trap 'rm -f "$BACKUP_FILE"' EXIT

                  mkdir -p "$BACKUP_DIR"

                  echo "Starting backup..."
                  pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
                    -F p -f "$BACKUP_FILE"

                  echo "Compressing backup..."
                  gzip "$BACKUP_FILE"

                  echo "Cleaning up backups older than 7 days..."
                  find "$BACKUP_DIR" -name "${DB_NAME}_*.sql.gz" -type f -mtime +7 -delete

                  echo "Backup completed: ${BACKUP_FILE}.gz"
              env:
                # Connection settings; host and credentials come from the
                # db-credentials Secret.
                - name: DB_HOST
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: host
                - name: DB_PORT
                  value: "5432"
                - name: DB_USER
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: username
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: db-credentials
                      key: password
                - name: DB_NAME
                  value: "sankofa"
              volumeMounts:
                - name: backup-storage
                  mountPath: /backups
          restartPolicy: OnFailure
          volumes:
            - name: backup-storage
              persistentVolumeClaim:
                claimName: postgres-backup-pvc
---
# Persistent storage claim named postgres-backup-pvc in the api namespace:
# 100Gi, single-node read/write, provisioned from the "standard" class.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: api
  name: postgres-backup-pvc
spec:
  storageClassName: standard
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 100Gi

View File

@@ -0,0 +1,35 @@
# Grafana dashboard shipped as a ConfigMap; the grafana_dashboard label is
# presumably what the Grafana dashboard sidecar selects on - confirm.
#
# The JSON must be the bare dashboard model. The {"dashboard": {...}} envelope
# is the HTTP API request format; file-based provisioning rejects it because it
# finds no top-level "title". Each panel also carries an id, a type and a
# gridPos so it can be rendered and laid out.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
data:
  sankofa-overview.json: |
    {
      "title": "Sankofa Phoenix Overview",
      "panels": [
        {
          "id": 1,
          "type": "timeseries",
          "title": "API Request Rate",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "rate(http_requests_total[5m])",
              "legendFormat": "{{method}} {{status}}"
            }
          ]
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "Database Connections",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "pg_stat_database_numbackends",
              "legendFormat": "{{datname}}"
            }
          ]
        }
      ]
    }

View File

@@ -0,0 +1,50 @@
# Prometheus server configuration delivered as a ConfigMap. Two scrape jobs:
# the Sankofa API pods on a fixed port, and any pod that opts in through
# prometheus.io/* annotations.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      evaluation_interval: 15s
      scrape_interval: 15s

    scrape_configs:
      # Pods in the sankofa namespace labelled app=sankofa-api, scraped on
      # port 4000 of the pod IP.
      # NOTE(review): the sankofa-alerts rules select job="api", while this
      # job is named sankofa-api - confirm which Prometheus evaluates them.
      - job_name: "sankofa-api"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [sankofa]
        relabel_configs:
          # Keep only pods whose `app` label is sankofa-api.
          - action: keep
            source_labels:
              - __meta_kubernetes_pod_label_app
            regex: sankofa-api
          # Point the scrape address at <pod IP>:4000.
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_ip
            target_label: __address__
            replacement: '$1:4000'
          # Copy every pod label onto the scraped series.
          - action: labelmap
            regex: '__meta_kubernetes_pod_label_(.+)'

      # Annotation-driven discovery across all namespaces.
      - job_name: "kubernetes-pods"
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Keep only pods annotated prometheus.io/scrape: "true".
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            regex: 'true'
            action: keep
          # Use prometheus.io/path as the metrics path when it is set.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            regex: '(.+)'
            target_label: __metrics_path__
            action: replace
          # Rewrite the address to host:<prometheus.io/port>, dropping any
          # port already present on the discovered address.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
            action: replace