package virtualmachine

import (
	"context"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"

	proxmoxv1alpha1 "github.com/sankofa/crossplane-provider-proxmox/apis/v1alpha1"
	"github.com/sankofa/crossplane-provider-proxmox/pkg/proxmox"
	"github.com/sankofa/crossplane-provider-proxmox/pkg/quota"
	"github.com/sankofa/crossplane-provider-proxmox/pkg/utils"
)

// ProxmoxVMReconciler reconciles a ProxmoxVM object.
type ProxmoxVMReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/finalizers,verbs=update

// recordValidationFailure appends a ValidationFailed condition with the given
// reason and persists status. Status-update errors are logged, not returned,
// because the caller is about to return the original validation error anyway.
func (r *ProxmoxVMReconciler) recordValidationFailure(ctx context.Context, vm *proxmoxv1alpha1.ProxmoxVM, reason string, err error) {
	logger := log.FromContext(ctx)
	vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
		Type:               "ValidationFailed",
		Status:             "True",
		Reason:             reason,
		Message:            err.Error(),
		LastTransitionTime: metav1.Now(),
	})
	if updateErr := r.Status().Update(ctx, vm); updateErr != nil {
		logger.Error(updateErr, "failed to update status after validation failure", "reason", reason)
	}
}

// recordCreationFailure categorizes a VM-creation error, appends the matching
// condition, and persists status. Status-update errors are logged, not returned.
func (r *ProxmoxVMReconciler) recordCreationFailure(ctx context.Context, vm *proxmoxv1alpha1.ProxmoxVM, errorStr string) {
	logger := log.FromContext(ctx)
	errorCategory := categorizeError(errorStr)
	vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
		Type:               errorCategory.Type,
		Status:             "True",
		Reason:             errorCategory.Reason,
		Message:            errorStr,
		LastTransitionTime: metav1.Now(),
	})
	if updateErr := r.Status().Update(ctx, vm); updateErr != nil {
		logger.Error(updateErr, "failed to update status after creation failure")
	}
}

// Reconcile is part of the main kubernetes reconciliation loop. It drives a
// ProxmoxVM resource toward its desired state: it validates the spec, creates
// the VM on the target Proxmox node when Status.VMID is unset, and otherwise
// refreshes status (and pushes spec changes) for an existing VM.
func (r *ProxmoxVMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	var vm proxmoxv1alpha1.ProxmoxVM
	if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
		// Not-found means the resource was deleted; nothing to do.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	// Validate ProviderConfigReference.
	if vm.Spec.ProviderConfigReference == nil {
		return ctrl.Result{}, errors.New("providerConfigRef is required")
	}
	if vm.Spec.ProviderConfigReference.Name == "" {
		return ctrl.Result{}, errors.New("providerConfigRef.name is required")
	}

	// Get ProviderConfig (cluster-scoped: name only).
	var providerConfig proxmoxv1alpha1.ProviderConfig
	providerConfigName := vm.Spec.ProviderConfigReference.Name
	if err := r.Get(ctx, client.ObjectKey{Name: providerConfigName}, &providerConfig); err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "cannot get provider config %s", providerConfigName)
	}

	// Track retry attempts for exponential backoff.
	// NOTE(review): this counts conditions of type "Failed", while the failure
	// paths below append types produced by categorizeError — confirm
	// categorizeError can emit "Failed", otherwise attemptCount stays 0.
	attemptCount := 0
	for _, condition := range vm.Status.Conditions {
		if condition.Type == "Failed" {
			attemptCount++
		}
	}

	// Get credentials from the secret referenced by the ProviderConfig.
	creds, err := r.getCredentials(ctx, &providerConfig)
	if err != nil {
		logger.Error(err, "cannot get credentials")
		delay := GetRequeueDelay(err, attemptCount)
		return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot get credentials")
	}

	// Find the site configuration matching the spec.
	site, err := r.findSite(&providerConfig, vm.Spec.ForProvider.Site)
	if err != nil {
		logger.Error(err, "cannot find site", "site", vm.Spec.ForProvider.Site)
		delay := GetRequeueDelay(err, attemptCount)
		return ctrl.Result{RequeueAfter: delay}, errors.Wrapf(err, "cannot find site %s", vm.Spec.ForProvider.Site)
	}

	// Create Proxmox client for the resolved site.
	proxmoxClient, err := proxmox.NewClient(
		site.Endpoint,
		creds.Username,
		creds.Password,
		site.InsecureSkipTLSVerify,
	)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot create Proxmox client")
	}

	// Check node health before proceeding; requeue (not fail) on an unhealthy node.
	if err := proxmoxClient.CheckNodeHealth(ctx, vm.Spec.ForProvider.Node); err != nil {
		logger.Error(err, "node health check failed", "node", vm.Spec.ForProvider.Node)
		vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
			Type:               "NodeUnhealthy",
			Status:             "True",
			Reason:             "HealthCheckFailed",
			Message:            err.Error(),
			LastTransitionTime: metav1.Now(),
		})
		if updateErr := r.Status().Update(ctx, &vm); updateErr != nil {
			logger.Error(updateErr, "failed to update status after node health check failure")
		}
		return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil
	}

	// Validate the network bridge exists on the node before any create/update.
	if vm.Spec.ForProvider.Network != "" {
		networkExists, err := proxmoxClient.NetworkExists(ctx, vm.Spec.ForProvider.Node, vm.Spec.ForProvider.Network)
		if err != nil {
			logger.Error(err, "failed to check network bridge", "node", vm.Spec.ForProvider.Node, "network", vm.Spec.ForProvider.Network)
			// Don't fail on check error - network might exist but API call failed.
		} else if !networkExists {
			err := fmt.Errorf("network bridge '%s' does not exist on node '%s'", vm.Spec.ForProvider.Network, vm.Spec.ForProvider.Node)
			logger.Error(err, "network bridge validation failed")
			r.recordValidationFailure(ctx, &vm, "NetworkNotFound", err)
			return ctrl.Result{}, errors.Wrap(err, "network bridge validation failed")
		}
	}

	// Reconcile VM: VMID == 0 means the VM has not been created yet.
	if vm.Status.VMID == 0 {
		// Validate the VM specification before creation; each failure records a
		// ValidationFailed condition and returns without requeueing.
		if err := utils.ValidateVMName(vm.Spec.ForProvider.Name); err != nil {
			logger.Error(err, "invalid VM name")
			r.recordValidationFailure(ctx, &vm, "InvalidVMName", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid VM name")
		}
		if err := utils.ValidateMemory(vm.Spec.ForProvider.Memory); err != nil {
			logger.Error(err, "invalid memory specification")
			r.recordValidationFailure(ctx, &vm, "InvalidMemory", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid memory specification")
		}
		if err := utils.ValidateDisk(vm.Spec.ForProvider.Disk); err != nil {
			logger.Error(err, "invalid disk specification")
			r.recordValidationFailure(ctx, &vm, "InvalidDisk", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid disk specification")
		}
		if err := utils.ValidateCPU(vm.Spec.ForProvider.CPU); err != nil {
			logger.Error(err, "invalid CPU specification")
			r.recordValidationFailure(ctx, &vm, "InvalidCPU", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid CPU specification")
		}
		if err := utils.ValidateNetworkBridge(vm.Spec.ForProvider.Network); err != nil {
			logger.Error(err, "invalid network bridge specification")
			r.recordValidationFailure(ctx, &vm, "InvalidNetwork", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid network bridge specification")
		}
		if err := utils.ValidateImageSpec(vm.Spec.ForProvider.Image); err != nil {
			logger.Error(err, "invalid image specification")
			r.recordValidationFailure(ctx, &vm, "InvalidImage", err)
			return ctrl.Result{}, errors.Wrap(err, "invalid image specification")
		}

		// Create VM.
		logger.Info("Creating VM", "name", vm.Name, "node", vm.Spec.ForProvider.Node)

		// Extract tenant_id from Kubernetes labels or annotations (for
		// multi-tenancy). Labels take precedence over annotations, and the
		// fully-qualified key takes precedence over the short one.
		tenantID := ""
		if vm.Labels != nil {
			if tid, ok := vm.Labels["tenant.sankofa.nexus/id"]; ok {
				tenantID = tid
			} else if tid, ok := vm.Labels["tenant-id"]; ok {
				tenantID = tid
			}
		}
		if tenantID == "" && vm.Annotations != nil {
			if tid, ok := vm.Annotations["tenant.sankofa.nexus/id"]; ok {
				tenantID = tid
			} else if tid, ok := vm.Annotations["tenant-id"]; ok {
				tenantID = tid
			}
		}

		// Enforce quota if tenant ID is present and the quota API is configured.
		if tenantID != "" {
			apiURL := os.Getenv("SANKOFA_API_URL")
			apiToken := os.Getenv("SANKOFA_API_TOKEN")
			if apiURL != "" && apiToken != "" {
				quotaClient := quota.NewQuotaClient(apiURL, apiToken)
				// Parse memory from string (e.g., "8Gi" -> 8).
				memoryGB := utils.ParseMemoryToGB(vm.Spec.ForProvider.Memory)
				diskGB := utils.ParseDiskToGB(vm.Spec.ForProvider.Disk)
				resourceRequest := quota.ResourceRequest{
					Compute: &quota.ComputeRequest{
						VCPU:      &vm.Spec.ForProvider.CPU,
						Memory:    &memoryGB,
						Instances: intPtr(1),
					},
					Storage: &quota.StorageRequest{
						Size: &diskGB,
					},
				}
				if err := quotaClient.EnforceQuota(ctx, tenantID, resourceRequest); err != nil {
					logger.Error(err, "Quota check failed", "tenantID", tenantID)
					return ctrl.Result{RequeueAfter: 60 * time.Second}, errors.Wrap(err, "quota exceeded")
				}
				logger.Info("Quota check passed", "tenantID", tenantID)
			} else {
				logger.Info("Skipping quota check - API URL or token not configured")
			}
		}

		vmSpec := proxmox.VMSpec{
			Node:     vm.Spec.ForProvider.Node,
			Name:     vm.Spec.ForProvider.Name,
			CPU:      vm.Spec.ForProvider.CPU,
			Memory:   vm.Spec.ForProvider.Memory,
			Disk:     vm.Spec.ForProvider.Disk,
			Storage:  vm.Spec.ForProvider.Storage,
			Network:  vm.Spec.ForProvider.Network,
			Image:    vm.Spec.ForProvider.Image,
			UserData: vm.Spec.ForProvider.UserData,
			SSHKeys:  vm.Spec.ForProvider.SSHKeys,
			TenantID: tenantID, // Pass tenant_id to Proxmox client
		}

		createdVM, err := proxmoxClient.CreateVM(ctx, vmSpec)
		if err != nil {
			// CRITICAL: Check if VM was partially created (VMID assigned but
			// creation failed). This happens when importdisk fails after the
			// VM is created.
			errorStr := err.Error()
			if strings.Contains(errorStr, "VM") && strings.Contains(errorStr, "has been cleaned up") {
				// VM was created but cleaned up due to error (e.g., importdisk
				// not supported). Categorize the error and update status to
				// prevent an infinite retry loop.
				r.recordCreationFailure(ctx, &vm, errorStr)
				// Use exponential backoff based on attempt count.
				delay := GetRequeueDelay(err, attemptCount)
				return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot create VM - VM was cleaned up")
			}

			// For other errors, check if a VM was created but not returned:
			// try to find orphaned VMs by name and delete them.
			vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node)
			if listErr == nil {
				for _, existingVM := range vms {
					if existingVM.Name == vm.Spec.ForProvider.Name {
						logger.Info("Found orphaned VM, attempting cleanup", "vmID", existingVM.ID, "name", existingVM.Name)
						if cleanupErr := proxmoxClient.DeleteVM(ctx, existingVM.ID); cleanupErr != nil {
							logger.Error(cleanupErr, "Failed to cleanup orphaned VM", "vmID", existingVM.ID)
						} else {
							logger.Info("Successfully cleaned up orphaned VM", "vmID", existingVM.ID)
						}
					}
				}
			}

			r.recordCreationFailure(ctx, &vm, errorStr)
			// Use exponential backoff based on attempt count.
			delay := GetRequeueDelay(err, attemptCount)
			return ctrl.Result{RequeueAfter: delay}, errors.Wrap(err, "cannot create VM")
		}

		vm.Status.VMID = createdVM.ID
		// Set initial status conservatively - VM is created but may not be
		// running yet; use "created" instead of actual status until verified.
		vm.Status.State = "created"
		// IP address may not be available immediately - updated next reconcile.
		vm.Status.IPAddress = ""

		// Clear any previous failure conditions (reverse iteration so index
		// deletion stays valid).
		for i := len(vm.Status.Conditions) - 1; i >= 0; i-- {
			if vm.Status.Conditions[i].Type == "Failed" {
				vm.Status.Conditions = append(vm.Status.Conditions[:i], vm.Status.Conditions[i+1:]...)
			}
		}

		// Add success condition.
		vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
			Type:               "Ready",
			Status:             "True",
			Reason:             "Created",
			Message:            fmt.Sprintf("VM %d created successfully", createdVM.ID),
			LastTransitionTime: metav1.Now(),
		})
		if err := r.Status().Update(ctx, &vm); err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
		}
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}

	// Existing VM: refresh observed state.
	vmStatus, err := proxmoxClient.GetVMStatus(ctx, vm.Status.VMID)
	if err != nil {
		// If VM status can't be retrieved directly, fall back to ListVMs.
		vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node)
		if listErr != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot get VM status")
		}
		for _, v := range vms {
			if v.ID == vm.Status.VMID {
				vm.Status.State = v.Status
				vm.Status.IPAddress = v.IP
				break
			}
		}
	} else {
		vm.Status.State = vmStatus.State
		vm.Status.IPAddress = vmStatus.IPAddress
	}
	if err := r.Status().Update(ctx, &vm); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
	}

	// Check if VM needs to be updated.
	// TODO(review): this heuristic is effectively always true for a valid spec,
	// so UpdateVM runs every cycle; compare the actual VM config with the spec.
	needsUpdate := false
	if vmStatus != nil {
		// Compare with current status - for now, always check if spec differs.
		// In a real implementation, you'd compare current VM config with spec.
		needsUpdate = vm.Spec.ForProvider.CPU > 0 || vm.Spec.ForProvider.Memory != ""
	}

	if needsUpdate {
		logger.Info("Updating VM", "name", vm.Name, "vmId", vm.Status.VMID)
		vmSpec := proxmox.VMSpec{
			Node:   vm.Spec.ForProvider.Node,
			Name:   vm.Spec.ForProvider.Name,
			CPU:    vm.Spec.ForProvider.CPU,
			Memory: vm.Spec.ForProvider.Memory,
		}
		if _, err := proxmoxClient.UpdateVM(ctx, vm.Status.VMID, vmSpec); err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM")
		}
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}

	return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

// CleanupOrphanedVMs scans for and cleans up orphaned VMs on controller startup.
// Orphaned VMs are VMs in Proxmox that don't have
corresponding Kubernetes resources func (r *ProxmoxVMReconciler) CleanupOrphanedVMs(ctx context.Context) error { logger := log.FromContext(ctx) logger.Info("Starting orphaned VM cleanup on controller startup") // List all ProxmoxVM resources var vmList proxmoxv1alpha1.ProxmoxVMList if err := r.List(ctx, &vmList, &client.ListOptions{}); err != nil { logger.Error(err, "failed to list ProxmoxVM resources") return err } // Build map of expected VMIDs expectedVMIDs := make(map[int]bool) for _, vm := range vmList.Items { if vm.Status.VMID > 0 { expectedVMIDs[vm.Status.VMID] = true } } // Get all ProviderConfigs to check VMs on all sites var configList proxmoxv1alpha1.ProviderConfigList if err := r.List(ctx, &configList, &client.ListOptions{}); err != nil { logger.Error(err, "failed to list ProviderConfig resources") return err } cleanedCount := 0 for _, config := range configList.Items { // Get credentials creds, err := r.getCredentials(ctx, &config) if err != nil { logger.Error(err, "failed to get credentials for cleanup", "config", config.Name) continue } // Check each site for _, site := range config.Spec.Sites { client, err := proxmox.NewClient( site.Endpoint, creds.Username, creds.Password, site.InsecureSkipTLSVerify, ) if err != nil { logger.Error(err, "failed to create Proxmox client for cleanup", "site", site.Name) continue } // List VMs on this node vms, err := client.ListVMs(ctx, site.Node) if err != nil { logger.Error(err, "failed to list VMs for cleanup", "site", site.Name, "node", site.Node) continue } // Check for orphaned VMs (VMs not in expected list) for _, vm := range vms { if !expectedVMIDs[vm.ID] { // Check if VM has a name that suggests it might be orphaned // (e.g., matches pattern of our VMs but no Kubernetes resource) logger.Info("Found potential orphaned VM", "vmID", vm.ID, "name", vm.Name, "node", site.Node) // Only clean up if VM is stopped (safer) if vm.Status == "stopped" { logger.Info("Cleaning up orphaned stopped VM", "vmID", vm.ID) if err 
:= client.DeleteVM(ctx, vm.ID); err != nil { logger.Error(err, "failed to cleanup orphaned VM", "vmID", vm.ID) } else { cleanedCount++ logger.Info("Successfully cleaned up orphaned VM", "vmID", vm.ID) } } else { logger.Info("Skipping orphaned VM cleanup - VM is not stopped", "vmID", vm.ID, "status", vm.Status) } } } } } if cleanedCount > 0 { logger.Info("Orphaned VM cleanup completed", "cleanedCount", cleanedCount) } else { logger.Info("Orphaned VM cleanup completed - no orphaned VMs found") } return nil } // SetupWithManager sets up the controller with the Manager func (r *ProxmoxVMReconciler) SetupWithManager(mgr ctrl.Manager) error { // Run cleanup on startup (non-blocking, in background) go func() { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() logger := log.FromContext(ctx) if err := r.CleanupOrphanedVMs(ctx); err != nil { logger.Error(err, "orphaned VM cleanup failed on startup") } }() return ctrl.NewControllerManagedBy(mgr). For(&proxmoxv1alpha1.ProxmoxVM{}). 
Complete(r) } // Helper functions type credentials struct { Username string Password string } func (r *ProxmoxVMReconciler) getCredentials(ctx context.Context, config *proxmoxv1alpha1.ProviderConfig) (*credentials, error) { if config.Spec.Credentials.SecretRef == nil { return nil, fmt.Errorf("no secret reference in provider config") } secretRef := config.Spec.Credentials.SecretRef // Get secret from Kubernetes secret := &corev1.Secret{} secretKey := client.ObjectKey{ Namespace: secretRef.Namespace, Name: secretRef.Name, } if err := r.Get(ctx, secretKey, secret); err != nil { return nil, errors.Wrap(err, "cannot get secret") } // Parse credentials from secret // Support both username/password and JSON token formats var username, password string // Try username/password format first if userData, ok := secret.Data["username"]; ok { username = string(userData) } if passData, ok := secret.Data["password"]; ok { password = string(passData) } // Try token format (for Proxmox API tokens) if tokenData, ok := secret.Data["token"]; ok { // For token-based auth, username is in format "user@realm!tokenid" // and password is the token secret if userData, ok := secret.Data["tokenid"]; ok { username = string(userData) } password = string(tokenData) } if username == "" || password == "" { return nil, fmt.Errorf("username/password or token missing in secret") } return &credentials{ Username: username, Password: password, }, nil } func (r *ProxmoxVMReconciler) findSite(config *proxmoxv1alpha1.ProviderConfig, siteName string) (*proxmoxv1alpha1.ProxmoxSite, error) { for _, site := range config.Spec.Sites { if site.Name == siteName { return &site, nil } } return nil, fmt.Errorf("site %s not found", siteName) } // Helper functions for quota enforcement (use shared utils) func intPtr(i int) *int { return &i }