- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
212 lines
5.5 KiB
Go
212 lines
5.5 KiB
Go
package gpu
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os/exec"
|
|
"strings"
|
|
)
|
|
|
|
// Manager manages GPU resources
|
|
type Manager struct {
|
|
// GPU tracking would be implemented here
|
|
gpuInventory map[string]GPUInfo
|
|
}
|
|
|
|
// GPUInfo represents GPU information
|
|
type GPUInfo struct {
|
|
ID string
|
|
Type string
|
|
Status string
|
|
VMID *int
|
|
Health string
|
|
Memory int64
|
|
Utilization float64
|
|
}
|
|
|
|
// NewManager creates a new GPU manager
|
|
func NewManager() *Manager {
|
|
return &Manager{
|
|
gpuInventory: make(map[string]GPUInfo),
|
|
}
|
|
}
|
|
|
|
// AllocateGPU allocates a GPU for a VM
|
|
func (m *Manager) AllocateGPU(ctx context.Context, vmID int, gpuType string) error {
|
|
// Find available GPU of the specified type
|
|
var availableGPU *GPUInfo
|
|
for id, gpu := range m.gpuInventory {
|
|
if gpu.Type == gpuType && gpu.Status == "AVAILABLE" && gpu.VMID == nil {
|
|
availableGPU = &gpu
|
|
availableGPU.ID = id
|
|
break
|
|
}
|
|
}
|
|
|
|
if availableGPU == nil {
|
|
return fmt.Errorf("no available GPU of type %s", gpuType)
|
|
}
|
|
|
|
// Allocate the GPU
|
|
availableGPU.VMID = &vmID
|
|
availableGPU.Status = "ALLOCATED"
|
|
m.gpuInventory[availableGPU.ID] = *availableGPU
|
|
|
|
// In a real implementation, this would:
|
|
// 1. Update Proxmox VM configuration to attach GPU
|
|
// 2. Use PCI passthrough or vGPU depending on GPU type
|
|
// 3. Update resource tracking
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetGPUHealth gets the health status of a GPU
|
|
func (m *Manager) GetGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Check if GPU exists in inventory
|
|
gpu, exists := m.gpuInventory[gpuID]
|
|
if !exists {
|
|
// Try to discover GPU using nvidia-smi or similar tools
|
|
health, err := m.discoverGPUHealth(ctx, gpuID)
|
|
if err != nil {
|
|
return "UNKNOWN", err
|
|
}
|
|
return health, nil
|
|
}
|
|
|
|
// If GPU is allocated, check actual health via monitoring
|
|
if gpu.Status == "ALLOCATED" {
|
|
health, err := m.checkAllocatedGPUHealth(ctx, gpuID)
|
|
if err != nil {
|
|
return gpu.Health, err
|
|
}
|
|
|
|
// Update health in inventory
|
|
gpu.Health = health
|
|
m.gpuInventory[gpuID] = gpu
|
|
|
|
return health, nil
|
|
}
|
|
|
|
return gpu.Health, nil
|
|
}
|
|
|
|
// discoverGPUHealth discovers GPU health using system tools
|
|
func (m *Manager) discoverGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Try nvidia-smi first (for NVIDIA GPUs)
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=health", "--format=csv,noheader", "--id="+gpuID)
|
|
output, err := cmd.Output()
|
|
if err == nil {
|
|
health := strings.TrimSpace(string(output))
|
|
if health == "Ok" || health == "Healthy" {
|
|
return "HEALTHY", nil
|
|
}
|
|
return "DEGRADED", nil
|
|
}
|
|
|
|
// Try AMD GPU monitoring (rocm-smi)
|
|
cmd = exec.CommandContext(ctx, "rocm-smi", "--showtemp", "--id", gpuID)
|
|
output, err = cmd.Output()
|
|
if err == nil {
|
|
// Parse AMD GPU temperature
|
|
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
|
|
for _, line := range lines {
|
|
if strings.Contains(line, "Temperature") {
|
|
var temp int
|
|
if _, err := fmt.Sscanf(line, "%*s %d", &temp); err == nil {
|
|
const maxTemp = 95 // AMD typical max temp
|
|
if temp >= maxTemp {
|
|
return "DEGRADED", nil
|
|
}
|
|
return "HEALTHY", nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try Intel GPU monitoring (intel_gpu_top or similar)
|
|
// Note: Intel GPU monitoring varies by generation
|
|
|
|
return "UNKNOWN", fmt.Errorf("could not determine GPU health - no compatible monitoring tool found")
|
|
}
|
|
|
|
// checkAllocatedGPUHealth checks health of an allocated GPU
|
|
func (m *Manager) checkAllocatedGPUHealth(ctx context.Context, gpuID string) (string, error) {
|
|
// Check GPU utilization and temperature
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=utilization.gpu,temperature.gpu", "--format=csv,noheader", "--id="+gpuID)
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
return "UNKNOWN", err
|
|
}
|
|
|
|
// Parse output
|
|
parts := strings.Split(strings.TrimSpace(string(output)), ",")
|
|
if len(parts) >= 2 {
|
|
// Extract utilization and temperature
|
|
_ = strings.TrimSpace(parts[0]) // utilStr - reserved for future use
|
|
tempStr := strings.TrimSpace(parts[1])
|
|
|
|
// Parse temperature (remove % and extract number)
|
|
tempParts := strings.Fields(tempStr)
|
|
if len(tempParts) > 0 {
|
|
var temp int
|
|
if _, err := fmt.Sscanf(tempParts[0], "%d", &temp); err == nil {
|
|
// Temperature thresholds
|
|
const maxTemp = 83 // NVIDIA default max temp
|
|
const warningTemp = 75
|
|
|
|
if temp >= maxTemp {
|
|
return "DEGRADED", nil
|
|
}
|
|
if temp >= warningTemp {
|
|
return "WARNING", nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return "HEALTHY", nil
|
|
}
|
|
|
|
return "UNKNOWN", fmt.Errorf("could not parse GPU metrics")
|
|
}
|
|
|
|
// ListGPUs lists all available GPUs
|
|
func (m *Manager) ListGPUs(ctx context.Context) ([]GPUInfo, error) {
|
|
// Discover GPUs using nvidia-smi or similar
|
|
cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader")
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
// If nvidia-smi is not available, return inventory
|
|
gpus := make([]GPUInfo, 0, len(m.gpuInventory))
|
|
for _, gpu := range m.gpuInventory {
|
|
gpus = append(gpus, gpu)
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|
|
// Parse nvidia-smi output and update inventory
|
|
lines := strings.Split(strings.TrimSpace(string(output)), "\n")
|
|
for _, line := range lines {
|
|
parts := strings.Split(line, ",")
|
|
if len(parts) >= 3 {
|
|
gpuID := strings.TrimSpace(parts[0])
|
|
gpuType := strings.TrimSpace(parts[1])
|
|
|
|
if _, exists := m.gpuInventory[gpuID]; !exists {
|
|
m.gpuInventory[gpuID] = GPUInfo{
|
|
ID: gpuID,
|
|
Type: gpuType,
|
|
Status: "AVAILABLE",
|
|
Health: "HEALTHY",
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
gpus := make([]GPUInfo, 0, len(m.gpuInventory))
|
|
for _, gpu := range m.gpuInventory {
|
|
gpus = append(gpus, gpu)
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|