- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
212 lines
5.5 KiB
Go
212 lines
5.5 KiB
Go
package gpu
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os/exec"
|
|
"strings"
|
|
)
|
|
|
|
// Manager manages GPU resources
|
|
type Manager struct {
|
|
// GPU tracking would be implemented here
|
|
gpuInventory map[string]GPUInfo
|
|
}
|
|
|
|
// GPUInfo represents GPU information
|
|
type GPUInfo struct {
|
|
ID string
|
|
Type string
|
|
Status string
|
|
VMID *int
|
|
Health string
|
|
Memory int64
|
|
Utilization float64
|
|
}
|
|
|
|
// NewManager creates a new GPU manager
|
|
func NewManager() *Manager {
|
|
return &Manager{
|
|
gpuInventory: make(map[string]GPUInfo),
|
|
}
|
|
}
|
|
|
|
// AllocateGPU allocates a GPU for a VM
|
|
func (m *Manager) AllocateGPU(ctx context.Context, vmID int, gpuType string) error {
|
|
// Find available GPU of the specified type
|
|
var availableGPU *GPUInfo
|
|
for id, gpu := range m.gpuInventory {
|
|
if gpu.Type == gpuType && gpu.Status == "AVAILABLE" && gpu.VMID == nil {
|
|
availableGPU = &gpu
|
|
availableGPU.ID = id
|
|
break
|
|
}
|
|
}
|
|
|
|
if availableGPU == nil {
|
|
return fmt.Errorf("no available GPU of type %s", gpuType)
|
|
}
|
|
|
|
// Allocate the GPU
|
|
availableGPU.VMID = &vmID
|
|
availableGPU.Status = "ALLOCATED"
|
|
m.gpuInventory[availableGPU.ID] = *availableGPU
|
|
|
|
// In a real implementation, this would:
|
|
// 1. Update Proxmox VM configuration to attach GPU
|
|
// 2. Use PCI passthrough or vGPU depending on GPU type
|
|
// 3. Update resource tracking
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetGPUHealth gets the health status of a GPU
|
|
func (m *Manager) GetGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Check if GPU exists in inventory
|
|
gpu, exists := m.gpuInventory[gpuID]
|
|
if !exists {
|
|
// Try to discover GPU using nvidia-smi or similar tools
|
|
health, err := m.discoverGPUHealth(ctx, gpuID)
|
|
if err != nil {
|
|
return "UNKNOWN", err
|
|
}
|
|
return health, nil
|
|
}
|
|
|
|
// If GPU is allocated, check actual health via monitoring
|
|
if gpu.Status == "ALLOCATED" {
|
|
health, err := m.checkAllocatedGPUHealth(ctx, gpuID)
|
|
if err != nil {
|
|
return gpu.Health, err
|
|
}
|
|
|
|
// Update health in inventory
|
|
gpu.Health = health
|
|
m.gpuInventory[gpuID] = gpu
|
|
|
|
return health, nil
|
|
}
|
|
|
|
return gpu.Health, nil
|
|
}
|
|
|
|
// discoverGPUHealth discovers GPU health using system tools
|
|
func (m *Manager) discoverGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Try nvidia-smi first (for NVIDIA GPUs)
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=health", "--format=csv,noheader", "--id="+gpuID)
|
|
output, err := cmd.Output()
|
|
if err == nil {
|
|
health := strings.TrimSpace(string(output))
|
|
if health == "Ok" || health == "Healthy" {
|
|
return "HEALTHY", nil
|
|
}
|
|
return "DEGRADED", nil
|
|
}
|
|
|
|
// Try AMD GPU monitoring (rocm-smi)
|
|
cmd = exec.CommandContext(ctx, "rocm-smi", "--showtemp", "--id", gpuID)
|
|
output, err = cmd.Output()
|
|
if err == nil {
|
|
// Parse AMD GPU temperature
|
|
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
|
|
for _, line := range lines {
|
|
if strings.Contains(line, "Temperature") {
|
|
var temp int
|
|
if _, err := fmt.Sscanf(line, "%*s %d", &temp); err == nil {
|
|
const maxTemp = 95 // AMD typical max temp
|
|
if temp >= maxTemp {
|
|
return "DEGRADED", nil
|
|
}
|
|
return "HEALTHY", nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try Intel GPU monitoring (intel_gpu_top or similar)
|
|
// Note: Intel GPU monitoring varies by generation
|
|
|
|
return "UNKNOWN", fmt.Errorf("could not determine GPU health - no compatible monitoring tool found")
|
|
}
|
|
|
|
// checkAllocatedGPUHealth checks health of an allocated GPU
|
|
func (m *Manager) checkAllocatedGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Check GPU utilization and temperature
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=utilization.gpu,temperature.gpu", "--format=csv,noheader", "--id="+gpuID)
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
return "UNKNOWN", err
|
|
}
|
|
|
|
// Parse output
|
|
parts := strings.Split(strings.TrimSpace(string(output)), ",")
|
|
if len(parts) >= 2 {
|
|
// Extract utilization and temperature
|
|
_ = strings.TrimSpace(parts[0]) // utilStr - reserved for future use
|
|
tempStr := strings.TrimSpace(parts[1])
|
|
|
|
// Parse temperature (remove % and extract number)
|
|
tempParts := strings.Fields(tempStr)
|
|
if len(tempParts) > 0 {
|
|
var temp int
|
|
if _, err := fmt.Sscanf(tempParts[0], "%d", &temp); err == nil {
|
|
// Temperature thresholds
|
|
const maxTemp = 83 // NVIDIA default max temp
|
|
const warningTemp = 75
|
|
|
|
if temp >= maxTemp {
|
|
return "DEGRADED", nil
|
|
}
|
|
if temp >= warningTemp {
|
|
return "WARNING", nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return "HEALTHY", nil
|
|
}
|
|
|
|
return "UNKNOWN", fmt.Errorf("could not parse GPU metrics")
|
|
}
|
|
|
|
// ListGPUs lists all available GPUs
|
|
func (m *Manager) ListGPUs(ctx context.Context) ([]GPUInfo, error) {
|
|
// Discover GPUs using nvidia-smi or similar
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader")
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
// If nvidia-smi is not available, return inventory
|
|
gpus := make([]GPUInfo, 0, len(m.gpuInventory))
|
|
for _, gpu := range m.gpuInventory {
|
|
gpus = append(gpus, gpu)
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|
|
// Parse nvidia-smi output and update inventory
|
|
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
|
|
for _, line := range lines {
|
|
parts := strings.Split(line, ",")
|
|
if len(parts) >= 3 {
|
|
gpuID := strings.TrimSpace(parts[0])
|
|
gpuType := strings.TrimSpace(parts[1])
|
|
|
|
if _, exists := m.gpuInventory[gpuID]; !exists {
|
|
m.gpuInventory[gpuID] = GPUInfo{
|
|
ID: gpuID,
|
|
Type: gpuType,
|
|
Status: "AVAILABLE",
|
|
Health: "HEALTHY",
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
gpus := make([]GPUInfo, 0, len(m.gpuInventory))
|
|
for _, gpu := range m.gpuInventory {
|
|
gpus = append(gpus, gpu)
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|