Files
Sankofa/crossplane-provider-proxmox/pkg/controller/virtualmachine/controller.go
defiQUG 9daf1fd378 Apply Composer changes: comprehensive API updates, migrations, middleware, and infrastructure improvements
- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
2025-12-12 18:01:35 -08:00

555 lines
17 KiB
Go

package virtualmachine
import (
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/manager"

	proxmoxv1alpha1 "github.com/sankofa/crossplane-provider-proxmox/apis/v1alpha1"
	"github.com/sankofa/crossplane-provider-proxmox/pkg/proxmox"
	"github.com/sankofa/crossplane-provider-proxmox/pkg/quota"
)
// ProxmoxVMReconciler reconciles a ProxmoxVM object against a Proxmox
// cluster. The embedded client.Client is used for all Kubernetes API
// reads/writes (resources, secrets, status subresource).
type ProxmoxVMReconciler struct {
	client.Client
	// Scheme is the runtime scheme used by the manager for type resolution.
	Scheme *runtime.Scheme
}
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/finalizers,verbs=update
// Reconcile is part of the main kubernetes reconciliation loop.
//
// It drives a ProxmoxVM toward its desired state: resolves the referenced
// ProviderConfig and credentials, checks node health, creates the VM if it
// does not exist yet (enforcing tenant quota when a tenant ID label or
// annotation is present), and otherwise syncs the observed state (power
// state, IP address) back into the resource status.
//
// Fix: the status update after a failed node health check previously
// discarded its error; it is now logged.
func (r *ProxmoxVMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	var vm proxmoxv1alpha1.ProxmoxVM
	if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
		// Resource deleted between enqueue and reconcile: nothing to do.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	// Validate ProviderConfigReference.
	if vm.Spec.ProviderConfigReference == nil {
		return ctrl.Result{}, errors.New("providerConfigRef is required")
	}
	if vm.Spec.ProviderConfigReference.Name == "" {
		return ctrl.Result{}, errors.New("providerConfigRef.name is required")
	}

	// Get ProviderConfig (looked up by name only, i.e. cluster-scoped).
	var providerConfig proxmoxv1alpha1.ProviderConfig
	providerConfigName := vm.Spec.ProviderConfigReference.Name
	if err := r.Get(ctx, client.ObjectKey{Name: providerConfigName}, &providerConfig); err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "cannot get provider config %s", providerConfigName)
	}

	// Track retry attempts for exponential backoff.
	// NOTE(review): this counts conditions of type "Failed", but failure
	// conditions below are appended with categorizeError's type, which may
	// differ — confirm the counter actually grows on repeated failures.
	attemptCount := 0
	for _, condition := range vm.Status.Conditions {
		if condition.Type == "Failed" {
			attemptCount++
		}
	}

	// Get credentials from the secret referenced by the provider config.
	creds, err := r.getCredentials(ctx, &providerConfig)
	if err != nil {
		logger.Error(err, "cannot get credentials")
		delay := GetRequeueDelay(err, attemptCount)
		return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot get credentials")
	}

	// Find the site configuration matching the VM's requested site.
	site, err := r.findSite(&providerConfig, vm.Spec.ForProvider.Site)
	if err != nil {
		logger.Error(err, "cannot find site", "site", vm.Spec.ForProvider.Site)
		delay := GetRequeueDelay(err, attemptCount)
		return ctrl.Result{RequeueAfter: delay}, errors.Wrapf(err, "cannot find site %s", vm.Spec.ForProvider.Site)
	}

	// Create a Proxmox API client for this site.
	proxmoxClient, err := proxmox.NewClient(
		site.Endpoint,
		creds.Username,
		creds.Password,
		site.InsecureSkipTLSVerify,
	)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot create Proxmox client")
	}

	// Check node health before proceeding.
	if err := proxmoxClient.CheckNodeHealth(ctx, vm.Spec.ForProvider.Node); err != nil {
		logger.Error(err, "node health check failed", "node", vm.Spec.ForProvider.Node)
		// Record the failure on the resource status.
		// NOTE(review): conditions are appended without deduplication, so the
		// list can grow on every reconcile; consider meta.SetStatusCondition.
		vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
			Type:               "NodeUnhealthy",
			Status:             "True",
			Reason:             "HealthCheckFailed",
			Message:            err.Error(),
			LastTransitionTime: metav1.Now(),
		})
		// Previously this error was silently discarded.
		if updateErr := r.Status().Update(ctx, &vm); updateErr != nil {
			logger.Error(updateErr, "failed to update status after node health check failure")
		}
		return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil
	}

	// A zero VMID means the VM has not been created in Proxmox yet.
	if vm.Status.VMID == 0 {
		// Create VM
		logger.Info("Creating VM", "name", vm.Name, "node", vm.Spec.ForProvider.Node)

		// Extract tenant_id from Kubernetes labels or annotations (for
		// multi-tenancy). Labels take precedence over annotations, and the
		// namespaced key takes precedence over the legacy "tenant-id" key.
		tenantID := ""
		if vm.Labels != nil {
			if tid, ok := vm.Labels["tenant.sankofa.nexus/id"]; ok {
				tenantID = tid
			} else if tid, ok := vm.Labels["tenant-id"]; ok {
				tenantID = tid
			}
		}
		if tenantID == "" && vm.Annotations != nil {
			if tid, ok := vm.Annotations["tenant.sankofa.nexus/id"]; ok {
				tenantID = tid
			} else if tid, ok := vm.Annotations["tenant-id"]; ok {
				tenantID = tid
			}
		}

		// Enforce quota if a tenant ID is present and the quota API is
		// configured via environment; otherwise creation proceeds unchecked.
		if tenantID != "" {
			apiURL := os.Getenv("SANKOFA_API_URL")
			apiToken := os.Getenv("SANKOFA_API_TOKEN")
			if apiURL != "" && apiToken != "" {
				quotaClient := quota.NewQuotaClient(apiURL, apiToken)
				// Parse memory/disk quantity strings (e.g. "8Gi") into GB.
				memoryGB := parseMemoryToGB(vm.Spec.ForProvider.Memory)
				diskGB := parseDiskToGB(vm.Spec.ForProvider.Disk)
				resourceRequest := quota.ResourceRequest{
					Compute: &quota.ComputeRequest{
						VCPU:      &vm.Spec.ForProvider.CPU,
						Memory:    &memoryGB,
						Instances: intPtr(1),
					},
					Storage: &quota.StorageRequest{
						Size: &diskGB,
					},
				}
				if err := quotaClient.EnforceQuota(ctx, tenantID, resourceRequest); err != nil {
					logger.Error(err, "Quota check failed", "tenantID", tenantID)
					return ctrl.Result{RequeueAfter: 60 * time.Second}, errors.Wrap(err, "quota exceeded")
				}
				logger.Info("Quota check passed", "tenantID", tenantID)
			} else {
				logger.Info("Skipping quota check - API URL or token not configured")
			}
		}

		vmSpec := proxmox.VMSpec{
			Node:     vm.Spec.ForProvider.Node,
			Name:     vm.Spec.ForProvider.Name,
			CPU:      vm.Spec.ForProvider.CPU,
			Memory:   vm.Spec.ForProvider.Memory,
			Disk:     vm.Spec.ForProvider.Disk,
			Storage:  vm.Spec.ForProvider.Storage,
			Network:  vm.Spec.ForProvider.Network,
			Image:    vm.Spec.ForProvider.Image,
			UserData: vm.Spec.ForProvider.UserData,
			SSHKeys:  vm.Spec.ForProvider.SSHKeys,
			TenantID: tenantID, // Pass tenant_id to Proxmox client
		}
		createdVM, err := proxmoxClient.CreateVM(ctx, vmSpec)
		if err != nil {
			// CRITICAL: Check if VM was partially created (VMID assigned but
			// creation failed). This happens when importdisk fails after the
			// VM is created.
			errorStr := err.Error()
			if strings.Contains(errorStr, "VM") && strings.Contains(errorStr, "has been cleaned up") {
				// VM was created but cleaned up due to error (e.g. importdisk
				// not supported). Categorize the error and update status to
				// prevent an infinite retry loop.
				errorCategory := categorizeError(errorStr)
				vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
					Type:               errorCategory.Type,
					Status:             "True",
					Reason:             errorCategory.Reason,
					Message:            errorStr,
					LastTransitionTime: metav1.Now(),
				})
				if updateErr := r.Status().Update(ctx, &vm); updateErr != nil {
					logger.Error(updateErr, "failed to update status after creation failure")
				}
				// Use exponential backoff based on attempt count.
				delay := GetRequeueDelay(err, attemptCount)
				return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot create VM - VM was cleaned up")
			}
			// For other errors, check if a VM was created but not returned:
			// look for orphaned VMs with our name and try to delete them.
			vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node)
			if listErr == nil {
				for _, existingVM := range vms {
					if existingVM.Name == vm.Spec.ForProvider.Name {
						// Found orphaned VM - attempt cleanup (best effort).
						logger.Info("Found orphaned VM, attempting cleanup", "vmID", existingVM.ID, "name", existingVM.Name)
						cleanupErr := proxmoxClient.DeleteVM(ctx, existingVM.ID)
						if cleanupErr != nil {
							logger.Error(cleanupErr, "Failed to cleanup orphaned VM", "vmID", existingVM.ID)
						} else {
							logger.Info("Successfully cleaned up orphaned VM", "vmID", existingVM.ID)
						}
					}
				}
			}
			// Categorize error and update status with the matching condition.
			errorCategory := categorizeError(errorStr)
			vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
				Type:               errorCategory.Type,
				Status:             "True",
				Reason:             errorCategory.Reason,
				Message:            errorStr,
				LastTransitionTime: metav1.Now(),
			})
			if updateErr := r.Status().Update(ctx, &vm); updateErr != nil {
				logger.Error(updateErr, "failed to update status after creation failure")
			}
			// Use exponential backoff based on attempt count.
			delay := GetRequeueDelay(err, attemptCount)
			return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot create VM")
		}

		vm.Status.VMID = createdVM.ID
		vm.Status.State = createdVM.Status
		vm.Status.IPAddress = createdVM.IP
		// Clear any previous failure conditions (iterate backwards so removal
		// by index stays valid).
		for i := len(vm.Status.Conditions) - 1; i >= 0; i-- {
			if vm.Status.Conditions[i].Type == "Failed" {
				vm.Status.Conditions = append(vm.Status.Conditions[:i], vm.Status.Conditions[i+1:]...)
			}
		}
		// Add success condition.
		vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
			Type:               "Ready",
			Status:             "True",
			Reason:             "Created",
			Message:            fmt.Sprintf("VM %d created successfully", createdVM.ID),
			LastTransitionTime: metav1.Now(),
		})
		if err := r.Status().Update(ctx, &vm); err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
		}
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}

	// VM exists: refresh observed status from Proxmox.
	vmStatus, err := proxmoxClient.GetVMStatus(ctx, vm.Status.VMID)
	if err != nil {
		// If VM status can't be retrieved directly, fall back to ListVMs.
		vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node)
		if listErr != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot get VM status")
		}
		for _, v := range vms {
			if v.ID == vm.Status.VMID {
				vm.Status.State = v.Status
				vm.Status.IPAddress = v.IP
				break
			}
		}
	} else {
		vm.Status.State = vmStatus.State
		vm.Status.IPAddress = vmStatus.IPAddress
	}
	if err := r.Status().Update(ctx, &vm); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
	}

	// Check if the VM needs a spec update.
	// NOTE(review): this heuristic is true for essentially every valid spec
	// (CPU > 0 or Memory set), so UpdateVM runs on each reconcile; a real
	// implementation should diff the live VM config against the spec.
	needsUpdate := false
	if vmStatus != nil {
		needsUpdate = vm.Spec.ForProvider.CPU > 0 || vm.Spec.ForProvider.Memory != ""
	}
	if needsUpdate {
		logger.Info("Updating VM", "name", vm.Name, "vmId", vm.Status.VMID)
		vmSpec := proxmox.VMSpec{
			Node:   vm.Spec.ForProvider.Node,
			Name:   vm.Spec.ForProvider.Name,
			CPU:    vm.Spec.ForProvider.CPU,
			Memory: vm.Spec.ForProvider.Memory,
		}
		_, err := proxmoxClient.UpdateVM(ctx, vm.Status.VMID, vmSpec)
		if err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM")
		}
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}
	// Steady state: re-check periodically.
	return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
// CleanupOrphanedVMs scans for and cleans up orphaned VMs on controller startup.
// Orphaned VMs are VMs in Proxmox that don't have corresponding Kubernetes
// ProxmoxVM resources (matched by VMID). Only stopped VMs are deleted; running
// VMs are logged and skipped as a safety measure.
//
// NOTE(review): any stopped VM on a configured node whose ID is untracked is
// deleted — including VMs created outside this controller. Confirm that is
// the intended ownership model before running against shared nodes.
//
// Fix: the per-site Proxmox client local was previously named `client`,
// shadowing the imported controller-runtime `client` package.
func (r *ProxmoxVMReconciler) CleanupOrphanedVMs(ctx context.Context) error {
	logger := log.FromContext(ctx)
	logger.Info("Starting orphaned VM cleanup on controller startup")

	// List all ProxmoxVM resources so we know which VMIDs are expected.
	var vmList proxmoxv1alpha1.ProxmoxVMList
	if err := r.List(ctx, &vmList, &client.ListOptions{}); err != nil {
		logger.Error(err, "failed to list ProxmoxVM resources")
		return err
	}
	// Build a set of VMIDs owned by existing Kubernetes resources.
	expectedVMIDs := make(map[int]bool)
	for _, vm := range vmList.Items {
		if vm.Status.VMID > 0 {
			expectedVMIDs[vm.Status.VMID] = true
		}
	}

	// Get all ProviderConfigs so VMs on every configured site are checked.
	var configList proxmoxv1alpha1.ProviderConfigList
	if err := r.List(ctx, &configList, &client.ListOptions{}); err != nil {
		logger.Error(err, "failed to list ProviderConfig resources")
		return err
	}

	cleanedCount := 0
	for _, config := range configList.Items {
		// Credentials failures skip the config but don't abort the sweep.
		creds, err := r.getCredentials(ctx, &config)
		if err != nil {
			logger.Error(err, "failed to get credentials for cleanup", "config", config.Name)
			continue
		}
		// Check each site configured for this provider.
		for _, site := range config.Spec.Sites {
			// Renamed from `client` to avoid shadowing the controller-runtime
			// client package imported at the top of the file.
			pxClient, err := proxmox.NewClient(
				site.Endpoint,
				creds.Username,
				creds.Password,
				site.InsecureSkipTLSVerify,
			)
			if err != nil {
				logger.Error(err, "failed to create Proxmox client for cleanup", "site", site.Name)
				continue
			}
			// List VMs on this node.
			vms, err := pxClient.ListVMs(ctx, site.Node)
			if err != nil {
				logger.Error(err, "failed to list VMs for cleanup", "site", site.Name, "node", site.Node)
				continue
			}
			// Check for orphaned VMs (VMs not in the expected set).
			for _, vm := range vms {
				if expectedVMIDs[vm.ID] {
					continue
				}
				logger.Info("Found potential orphaned VM", "vmID", vm.ID, "name", vm.Name, "node", site.Node)
				// Only clean up if the VM is stopped (safer).
				if vm.Status != "stopped" {
					logger.Info("Skipping orphaned VM cleanup - VM is not stopped", "vmID", vm.ID, "status", vm.Status)
					continue
				}
				logger.Info("Cleaning up orphaned stopped VM", "vmID", vm.ID)
				if err := pxClient.DeleteVM(ctx, vm.ID); err != nil {
					logger.Error(err, "failed to cleanup orphaned VM", "vmID", vm.ID)
					continue
				}
				cleanedCount++
				logger.Info("Successfully cleaned up orphaned VM", "vmID", vm.ID)
			}
		}
	}
	if cleanedCount > 0 {
		logger.Info("Orphaned VM cleanup completed", "cleanedCount", cleanedCount)
	} else {
		logger.Info("Orphaned VM cleanup completed - no orphaned VMs found")
	}
	return nil
}
// SetupWithManager sets up the controller with the Manager.
//
// Fix: startup cleanup was previously launched in a raw goroutine from this
// function, i.e. before mgr.Start() — at that point the cache-backed client
// used by CleanupOrphanedVMs cannot serve List calls. It is now registered
// as a manager Runnable, which the manager starts only after its cache has
// been started and synced. Cleanup remains best-effort: its errors are
// logged but never stop the manager.
func (r *ProxmoxVMReconciler) SetupWithManager(mgr ctrl.Manager) error {
	if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
		// Bound the sweep so a hung Proxmox endpoint can't pin the runnable.
		cleanupCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
		defer cancel()
		if err := r.CleanupOrphanedVMs(cleanupCtx); err != nil {
			log.FromContext(cleanupCtx).Error(err, "orphaned VM cleanup failed on startup")
		}
		return nil
	})); err != nil {
		return errors.Wrap(err, "cannot register orphaned VM cleanup runnable")
	}
	return ctrl.NewControllerManagedBy(mgr).
		For(&proxmoxv1alpha1.ProxmoxVM{}).
		Complete(r)
}
// Helper functions
// credentials holds the Proxmox authentication pair read from the provider
// secret. For API-token auth the same two fields carry the token ID and
// token secret (see getCredentials).
type credentials struct {
	Username string
	Password string
}
// getCredentials reads the Proxmox credentials from the Kubernetes secret
// referenced by the given ProviderConfig.
//
// Two secret layouts are supported:
//   - "username" + "password" keys, and
//   - "tokenid" + "token" keys for Proxmox API tokens (tokenid is the
//     "user@realm!tokenid" identifier, token is the secret value).
//
// Token entries take precedence when present. An error is returned if the
// secret reference is missing, the secret cannot be fetched, or neither
// layout yields both a username and a password.
func (r *ProxmoxVMReconciler) getCredentials(ctx context.Context, config *proxmoxv1alpha1.ProviderConfig) (*credentials, error) {
	secretRef := config.Spec.Credentials.SecretRef
	if secretRef == nil {
		return nil, fmt.Errorf("no secret reference in provider config")
	}

	// Fetch the referenced secret from the cluster.
	var secret corev1.Secret
	key := client.ObjectKey{
		Namespace: secretRef.Namespace,
		Name:      secretRef.Name,
	}
	if err := r.Get(ctx, key, &secret); err != nil {
		return nil, errors.Wrap(err, "cannot get secret")
	}

	// Start from the plain username/password layout (missing keys yield "").
	username := string(secret.Data["username"])
	password := string(secret.Data["password"])

	// An API token, when present, overrides the plain layout.
	if token, ok := secret.Data["token"]; ok {
		if tokenID, ok := secret.Data["tokenid"]; ok {
			username = string(tokenID)
		}
		password = string(token)
	}

	if username == "" || password == "" {
		return nil, fmt.Errorf("username/password or token missing in secret")
	}
	return &credentials{
		Username: username,
		Password: password,
	}, nil
}
// findSite looks up the site with the given name in the provider config.
// It returns a pointer to a copy of the matching site (so callers cannot
// mutate the config's slice through it), or an error if no site matches.
func (r *ProxmoxVMReconciler) findSite(config *proxmoxv1alpha1.ProviderConfig, siteName string) (*proxmoxv1alpha1.ProxmoxSite, error) {
	sites := config.Spec.Sites
	for i := range sites {
		if sites[i].Name != siteName {
			continue
		}
		match := sites[i]
		return &match, nil
	}
	return nil, fmt.Errorf("site %s not found", siteName)
}
// Helper functions for quota enforcement
// parseMemoryToGB converts a Kubernetes-style memory quantity string
// (e.g. "8Gi", "8G", "8192Mi", or a bare number assumed to be GB) into
// whole gigabytes for quota accounting. Unparseable input yields 0.
//
// Fix: MiB values are now rounded UP when converting to GiB. The previous
// truncating division made sub-GiB requests (e.g. "512Mi") count as 0 GB,
// silently under-counting usage against the tenant quota.
func parseMemoryToGB(memory string) int {
	// Normalize: trim whitespace, lowercase suffixes.
	memory = strings.TrimSpace(strings.ToLower(memory))
	if memory == "" {
		return 0
	}
	switch {
	case strings.HasSuffix(memory, "gi"), strings.HasSuffix(memory, "g"):
		// "8gi" / "8g" — already gibibytes.
		if value, err := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(memory, "gi"), "g")); err == nil {
			return value
		}
	case strings.HasSuffix(memory, "mi"), strings.HasSuffix(memory, "m"):
		// "8192mi" / "8192m" — mebibytes; round up so fractional GiB
		// requests are not under-counted.
		if value, err := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(memory, "mi"), "m")); err == nil {
			return (value + 1023) / 1024
		}
	default:
		// Bare number: assume GB.
		if value, err := strconv.Atoi(memory); err == nil {
			return value
		}
	}
	return 0
}
// parseDiskToGB converts a disk size string (e.g. "100Gi", "100G", "2Ti",
// or a bare number assumed to be GB) into whole gigabytes for quota
// accounting. TiB suffixes are scaled by 1024. Unrecognized input yields 0.
func parseDiskToGB(disk string) int {
	// Normalize: trim whitespace, lowercase suffixes.
	d := strings.TrimSpace(strings.ToLower(disk))
	if d == "" {
		return 0
	}
	switch {
	case strings.HasSuffix(d, "gi"), strings.HasSuffix(d, "g"):
		// "100gi" / "100g" — already gibibytes.
		if value, err := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(d, "gi"), "g")); err == nil {
			return value
		}
	case strings.HasSuffix(d, "ti"), strings.HasSuffix(d, "t"):
		// "2ti" / "2t" — tebibytes, scaled to GiB.
		if value, err := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(d, "ti"), "t")); err == nil {
			return value * 1024
		}
	default:
		// Bare number: assume GB.
		if value, err := strconv.Atoi(d); err == nil {
			return value
		}
	}
	return 0
}
// intPtr returns a pointer to a copy of i, for use with optional
// pointer-typed quota request fields.
func intPtr(i int) *int {
	v := i
	return &v
}