Files
defiQUG 7cd7022f6e Update .gitignore, remove package-lock.json, and enhance Cloudflare and Proxmox adapters
- Added lock file exclusions for pnpm in .gitignore.
- Removed obsolete package-lock.json from the api and portal directories.
- Enhanced Cloudflare adapter with additional interfaces for zones and tunnels.
- Improved Proxmox adapter error handling and logging for API requests.
- Updated Proxmox VM parameters with validation rules in the API schema.
- Enhanced documentation for Proxmox VM specifications and examples.
2025-12-12 19:29:01 -08:00

599 lines
20 KiB
Go

package virtualmachine
import (
"context"
"fmt"
"os"
"strconv"
"strings"
"time"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
proxmoxv1alpha1 "github.com/sankofa/crossplane-provider-proxmox/apis/v1alpha1"
"github.com/sankofa/crossplane-provider-proxmox/pkg/proxmox"
"github.com/sankofa/crossplane-provider-proxmox/pkg/quota"
"github.com/sankofa/crossplane-provider-proxmox/pkg/utils"
)
// ProxmoxVMReconciler reconciles a ProxmoxVM object
type ProxmoxVMReconciler struct {
	client.Client
	// Scheme maps Go types to GroupVersionKinds for this controller.
	Scheme *runtime.Scheme
}
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=proxmox.sankofa.nexus,resources=proxmoxvms/finalizers,verbs=update

// Reconcile is part of the main kubernetes reconciliation loop. It drives a
// ProxmoxVM resource toward its desired state: it validates the spec, enforces
// tenant quotas (when configured), creates the VM in Proxmox the first time
// through (Status.VMID == 0), and on later passes refreshes observed status
// and pushes CPU/memory config back to Proxmox.
//
// Returns a requeue result with exponential backoff (via GetRequeueDelay) on
// transient failures, and a terminal error for invalid specs.
func (r *ProxmoxVMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	var vm proxmoxv1alpha1.ProxmoxVM
	if err := r.Get(ctx, req.NamespacedName, &vm); err != nil {
		// Resource deleted between queueing and reconcile: nothing to do.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	// Validate ProviderConfigReference before touching any external system.
	if vm.Spec.ProviderConfigReference == nil {
		return ctrl.Result{}, errors.New("providerConfigRef is required")
	}
	if vm.Spec.ProviderConfigReference.Name == "" {
		return ctrl.Result{}, errors.New("providerConfigRef.name is required")
	}

	// Fetch the referenced ProviderConfig (cluster-scoped lookup by name).
	var providerConfig proxmoxv1alpha1.ProviderConfig
	providerConfigName := vm.Spec.ProviderConfigReference.Name
	if err := r.Get(ctx, client.ObjectKey{Name: providerConfigName}, &providerConfig); err != nil {
		return ctrl.Result{}, errors.Wrapf(err, "cannot get provider config %s", providerConfigName)
	}

	// Approximate the retry attempt count from accumulated "Failed" conditions
	// so GetRequeueDelay can apply exponential backoff.
	// NOTE(review): conditions are appended and never deduplicated, so this
	// count only ever grows; consider meta.SetStatusCondition instead.
	attemptCount := 0
	for _, condition := range vm.Status.Conditions {
		if condition.Type == "Failed" {
			attemptCount++
		}
	}

	// Resolve credentials from the secret referenced by the ProviderConfig.
	creds, err := r.getCredentials(ctx, &providerConfig)
	if err != nil {
		logger.Error(err, "cannot get credentials")
		return ctrl.Result{RequeueAfter: GetRequeueDelay(err, attemptCount)}, errors.Wrap(err, "cannot get credentials")
	}

	// Find the site configuration named by the spec.
	site, err := r.findSite(&providerConfig, vm.Spec.ForProvider.Site)
	if err != nil {
		logger.Error(err, "cannot find site", "site", vm.Spec.ForProvider.Site)
		return ctrl.Result{RequeueAfter: GetRequeueDelay(err, attemptCount)}, errors.Wrapf(err, "cannot find site %s", vm.Spec.ForProvider.Site)
	}

	// Create a Proxmox API client for the resolved site.
	proxmoxClient, err := proxmox.NewClient(
		site.Endpoint,
		creds.Username,
		creds.Password,
		site.InsecureSkipTLSVerify,
	)
	if err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot create Proxmox client")
	}

	// Check node health before proceeding. An unhealthy node is treated as
	// transient: record the condition and requeue without an error.
	if err := proxmoxClient.CheckNodeHealth(ctx, vm.Spec.ForProvider.Node); err != nil {
		logger.Error(err, "node health check failed", "node", vm.Spec.ForProvider.Node)
		r.recordCondition(ctx, &vm, "NodeUnhealthy", "HealthCheckFailed", err.Error())
		return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil
	}

	// Validate that the requested network bridge exists on the target node.
	if vm.Spec.ForProvider.Network != "" {
		networkExists, err := proxmoxClient.NetworkExists(ctx, vm.Spec.ForProvider.Node, vm.Spec.ForProvider.Network)
		if err != nil {
			// Don't fail on check error - network might exist but API call failed.
			logger.Error(err, "failed to check network bridge", "node", vm.Spec.ForProvider.Node, "network", vm.Spec.ForProvider.Network)
		} else if !networkExists {
			err := fmt.Errorf("network bridge '%s' does not exist on node '%s'", vm.Spec.ForProvider.Network, vm.Spec.ForProvider.Node)
			logger.Error(err, "network bridge validation failed")
			r.recordCondition(ctx, &vm, "ValidationFailed", "NetworkNotFound", err.Error())
			return ctrl.Result{}, errors.Wrap(err, "network bridge validation failed")
		}
	}

	// Status.VMID == 0 means the VM has not been created in Proxmox yet.
	if vm.Status.VMID == 0 {
		// Validate the VM specification before creation. Each failed check
		// records a ValidationFailed condition and returns a terminal error.
		validations := []struct {
			check  func() error
			reason string
			what   string
		}{
			{func() error { return utils.ValidateVMName(vm.Spec.ForProvider.Name) }, "InvalidVMName", "invalid VM name"},
			{func() error { return utils.ValidateMemory(vm.Spec.ForProvider.Memory) }, "InvalidMemory", "invalid memory specification"},
			{func() error { return utils.ValidateDisk(vm.Spec.ForProvider.Disk) }, "InvalidDisk", "invalid disk specification"},
			{func() error { return utils.ValidateCPU(vm.Spec.ForProvider.CPU) }, "InvalidCPU", "invalid CPU specification"},
			{func() error { return utils.ValidateNetworkBridge(vm.Spec.ForProvider.Network) }, "InvalidNetwork", "invalid network bridge specification"},
			{func() error { return utils.ValidateImageSpec(vm.Spec.ForProvider.Image) }, "InvalidImage", "invalid image specification"},
		}
		for _, v := range validations {
			if err := v.check(); err != nil {
				logger.Error(err, v.what)
				r.recordCondition(ctx, &vm, "ValidationFailed", v.reason, err.Error())
				return ctrl.Result{}, errors.Wrap(err, v.what)
			}
		}

		logger.Info("Creating VM", "name", vm.Name, "node", vm.Spec.ForProvider.Node)

		// Extract tenant_id from Kubernetes labels or annotations (for multi-tenancy).
		tenantID := extractTenantID(&vm)

		// Enforce quota when multi-tenancy metadata is present AND the quota
		// service is configured via environment; otherwise skip with a log line.
		if tenantID != "" {
			apiURL := os.Getenv("SANKOFA_API_URL")
			apiToken := os.Getenv("SANKOFA_API_TOKEN")
			if apiURL != "" && apiToken != "" {
				quotaClient := quota.NewQuotaClient(apiURL, apiToken)
				// Parse memory/disk from spec strings (e.g., "8Gi" -> 8 GB).
				memoryGB := utils.ParseMemoryToGB(vm.Spec.ForProvider.Memory)
				diskGB := utils.ParseDiskToGB(vm.Spec.ForProvider.Disk)
				resourceRequest := quota.ResourceRequest{
					Compute: &quota.ComputeRequest{
						VCPU:      &vm.Spec.ForProvider.CPU,
						Memory:    &memoryGB,
						Instances: intPtr(1),
					},
					Storage: &quota.StorageRequest{
						Size: &diskGB,
					},
				}
				if err := quotaClient.EnforceQuota(ctx, tenantID, resourceRequest); err != nil {
					logger.Error(err, "Quota check failed", "tenantID", tenantID)
					return ctrl.Result{RequeueAfter: 60 * time.Second}, errors.Wrap(err, "quota exceeded")
				}
				logger.Info("Quota check passed", "tenantID", tenantID)
			} else {
				logger.Info("Skipping quota check - API URL or token not configured")
			}
		}

		vmSpec := proxmox.VMSpec{
			Node:     vm.Spec.ForProvider.Node,
			Name:     vm.Spec.ForProvider.Name,
			CPU:      vm.Spec.ForProvider.CPU,
			Memory:   vm.Spec.ForProvider.Memory,
			Disk:     vm.Spec.ForProvider.Disk,
			Storage:  vm.Spec.ForProvider.Storage,
			Network:  vm.Spec.ForProvider.Network,
			Image:    vm.Spec.ForProvider.Image,
			UserData: vm.Spec.ForProvider.UserData,
			SSHKeys:  vm.Spec.ForProvider.SSHKeys,
			TenantID: tenantID, // Pass tenant_id to Proxmox client
		}
		createdVM, err := proxmoxClient.CreateVM(ctx, vmSpec)
		if err != nil {
			errorStr := err.Error()

			// CRITICAL: a VMID may have been assigned even though creation
			// failed (e.g. importdisk fails after the VM shell exists). The
			// adapter reports that case via this message fragment.
			if strings.Contains(errorStr, "VM") && strings.Contains(errorStr, "has been cleaned up") {
				// VM was created but already cleaned up by the adapter.
				// Categorize and record so we back off instead of hot-looping.
				errorCategory := categorizeError(errorStr)
				r.recordCondition(ctx, &vm, errorCategory.Type, errorCategory.Reason, errorStr)
				return ctrl.Result{RequeueAfter: GetRequeueDelay(err, attemptCount)}, errors.Wrap(err, "cannot create VM - VM was cleaned up")
			}

			// For other errors a VM may exist in Proxmox without having been
			// returned to us: search by name and delete any orphan found.
			if vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node); listErr == nil {
				for _, existingVM := range vms {
					if existingVM.Name != vm.Spec.ForProvider.Name {
						continue
					}
					logger.Info("Found orphaned VM, attempting cleanup", "vmID", existingVM.ID, "name", existingVM.Name)
					if cleanupErr := proxmoxClient.DeleteVM(ctx, existingVM.ID); cleanupErr != nil {
						logger.Error(cleanupErr, "Failed to cleanup orphaned VM", "vmID", existingVM.ID)
					} else {
						logger.Info("Successfully cleaned up orphaned VM", "vmID", existingVM.ID)
					}
				}
			}

			// Record the categorized failure and back off exponentially.
			errorCategory := categorizeError(errorStr)
			r.recordCondition(ctx, &vm, errorCategory.Type, errorCategory.Reason, errorStr)
			return ctrl.Result{RequeueAfter: GetRequeueDelay(err, attemptCount)}, errors.Wrap(err, "cannot create VM")
		}

		vm.Status.VMID = createdVM.ID
		// Report "created" conservatively; the next reconcile verifies the
		// real power state. The IP may not be available yet either.
		vm.Status.State = "created"
		vm.Status.IPAddress = ""

		// Drop stale failure conditions now that creation succeeded.
		for i := len(vm.Status.Conditions) - 1; i >= 0; i-- {
			if vm.Status.Conditions[i].Type == "Failed" {
				vm.Status.Conditions = append(vm.Status.Conditions[:i], vm.Status.Conditions[i+1:]...)
			}
		}
		vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
			Type:               "Ready",
			Status:             "True",
			Reason:             "Created",
			Message:            fmt.Sprintf("VM %d created successfully", createdVM.ID),
			LastTransitionTime: metav1.Now(),
		})
		if err := r.Status().Update(ctx, &vm); err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
		}
		// Requeue soon to pick up the actual power state and IP.
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}

	// Existing VM: refresh observed state.
	vmStatus, err := proxmoxClient.GetVMStatus(ctx, vm.Status.VMID)
	if err != nil {
		// Fall back to ListVMs when the direct status call fails.
		vms, listErr := proxmoxClient.ListVMs(ctx, vm.Spec.ForProvider.Node)
		if listErr != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot get VM status")
		}
		for _, v := range vms {
			if v.ID == vm.Status.VMID {
				vm.Status.State = v.Status
				vm.Status.IPAddress = v.IP
				break
			}
		}
	} else {
		vm.Status.State = vmStatus.State
		vm.Status.IPAddress = vmStatus.IPAddress
	}
	if err := r.Status().Update(ctx, &vm); err != nil {
		return ctrl.Result{}, errors.Wrap(err, "cannot update VM status")
	}

	// Check if VM needs to be updated.
	// NOTE(review): this heuristic is true for any valid spec, so UpdateVM is
	// pushed on every reconcile with vmStatus available; a real diff against
	// the live VM config is needed here to avoid redundant API calls.
	needsUpdate := false
	if vmStatus != nil {
		needsUpdate = vm.Spec.ForProvider.CPU > 0 || vm.Spec.ForProvider.Memory != ""
	}
	if needsUpdate {
		logger.Info("Updating VM", "name", vm.Name, "vmId", vm.Status.VMID)
		updateSpec := proxmox.VMSpec{
			Node:   vm.Spec.ForProvider.Node,
			Name:   vm.Spec.ForProvider.Name,
			CPU:    vm.Spec.ForProvider.CPU,
			Memory: vm.Spec.ForProvider.Memory,
		}
		if _, err := proxmoxClient.UpdateVM(ctx, vm.Status.VMID, updateSpec); err != nil {
			return ctrl.Result{}, errors.Wrap(err, "cannot update VM")
		}
		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
	}
	return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

// recordCondition appends a status condition to the VM and persists it.
// A status-update failure is logged rather than returned so that the caller's
// original error (the reason the condition exists) is the one propagated.
// Previously these update errors were silently discarded in several places.
func (r *ProxmoxVMReconciler) recordCondition(ctx context.Context, vm *proxmoxv1alpha1.ProxmoxVM, condType, reason, message string) {
	vm.Status.Conditions = append(vm.Status.Conditions, metav1.Condition{
		Type:               condType,
		Status:             "True",
		Reason:             reason,
		Message:            message,
		LastTransitionTime: metav1.Now(),
	})
	if err := r.Status().Update(ctx, vm); err != nil {
		log.FromContext(ctx).Error(err, "failed to update status with condition", "type", condType, "reason", reason)
	}
}

// extractTenantID reads the tenant identifier from well-known labels first,
// then annotations, preferring the fully-qualified "tenant.sankofa.nexus/id"
// key over the short "tenant-id" key. Returns "" when no tenant metadata is set.
func extractTenantID(vm *proxmoxv1alpha1.ProxmoxVM) string {
	tenantID := ""
	if vm.Labels != nil {
		if tid, ok := vm.Labels["tenant.sankofa.nexus/id"]; ok {
			tenantID = tid
		} else if tid, ok := vm.Labels["tenant-id"]; ok {
			tenantID = tid
		}
	}
	if tenantID == "" && vm.Annotations != nil {
		if tid, ok := vm.Annotations["tenant.sankofa.nexus/id"]; ok {
			tenantID = tid
		} else if tid, ok := vm.Annotations["tenant-id"]; ok {
			tenantID = tid
		}
	}
	return tenantID
}
// CleanupOrphanedVMs scans for and cleans up orphaned VMs on controller startup.
// Orphaned VMs are VMs in Proxmox that don't have corresponding Kubernetes
// resources (i.e. their VMID is not recorded in any ProxmoxVM status).
//
// Safety: only VMs reported as "stopped" are deleted; running VMs are merely
// logged. Per-config and per-site failures are logged and skipped so one bad
// site does not abort the whole sweep.
func (r *ProxmoxVMReconciler) CleanupOrphanedVMs(ctx context.Context) error {
	logger := log.FromContext(ctx)
	logger.Info("Starting orphaned VM cleanup on controller startup")

	// List all ProxmoxVM resources and collect their known VMIDs.
	var vmList proxmoxv1alpha1.ProxmoxVMList
	if err := r.List(ctx, &vmList, &client.ListOptions{}); err != nil {
		logger.Error(err, "failed to list ProxmoxVM resources")
		return err
	}
	expectedVMIDs := make(map[int]bool, len(vmList.Items))
	for _, vm := range vmList.Items {
		if vm.Status.VMID > 0 {
			expectedVMIDs[vm.Status.VMID] = true
		}
	}

	// Get all ProviderConfigs so every configured site is checked.
	var configList proxmoxv1alpha1.ProviderConfigList
	if err := r.List(ctx, &configList, &client.ListOptions{}); err != nil {
		logger.Error(err, "failed to list ProviderConfig resources")
		return err
	}

	cleanedCount := 0
	for _, config := range configList.Items {
		creds, err := r.getCredentials(ctx, &config)
		if err != nil {
			logger.Error(err, "failed to get credentials for cleanup", "config", config.Name)
			continue
		}
		for _, site := range config.Spec.Sites {
			// Renamed from "client" to avoid shadowing the imported
			// controller-runtime client package.
			pmClient, err := proxmox.NewClient(
				site.Endpoint,
				creds.Username,
				creds.Password,
				site.InsecureSkipTLSVerify,
			)
			if err != nil {
				logger.Error(err, "failed to create Proxmox client for cleanup", "site", site.Name)
				continue
			}
			// List VMs on this site's node.
			vms, err := pmClient.ListVMs(ctx, site.Node)
			if err != nil {
				logger.Error(err, "failed to list VMs for cleanup", "site", site.Name, "node", site.Node)
				continue
			}
			// Any VM whose ID is not tracked by a ProxmoxVM resource is a
			// cleanup candidate.
			for _, vm := range vms {
				if expectedVMIDs[vm.ID] {
					continue
				}
				logger.Info("Found potential orphaned VM", "vmID", vm.ID, "name", vm.Name, "node", site.Node)
				// Only clean up if VM is stopped (safer than deleting a
				// possibly-in-use running VM).
				if vm.Status != "stopped" {
					logger.Info("Skipping orphaned VM cleanup - VM is not stopped", "vmID", vm.ID, "status", vm.Status)
					continue
				}
				logger.Info("Cleaning up orphaned stopped VM", "vmID", vm.ID)
				if err := pmClient.DeleteVM(ctx, vm.ID); err != nil {
					logger.Error(err, "failed to cleanup orphaned VM", "vmID", vm.ID)
				} else {
					cleanedCount++
					logger.Info("Successfully cleaned up orphaned VM", "vmID", vm.ID)
				}
			}
		}
	}

	if cleanedCount > 0 {
		logger.Info("Orphaned VM cleanup completed", "cleanedCount", cleanedCount)
	} else {
		logger.Info("Orphaned VM cleanup completed - no orphaned VMs found")
	}
	return nil
}
// SetupWithManager sets up the controller with the Manager and kicks off a
// one-shot orphaned-VM cleanup in the background.
func (r *ProxmoxVMReconciler) SetupWithManager(mgr ctrl.Manager) error {
	// Run cleanup on startup (non-blocking, in background).
	go func() {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
		defer cancel()
		logger := log.FromContext(ctx)
		// r's client reads go through the manager's cache, which is not
		// usable until the manager has started and synced. Wait for it so
		// the List calls in CleanupOrphanedVMs don't fail or block forever.
		if ok := mgr.GetCache().WaitForCacheSync(ctx); !ok {
			logger.Error(fmt.Errorf("cache did not sync within timeout"), "skipping orphaned VM cleanup")
			return
		}
		if err := r.CleanupOrphanedVMs(ctx); err != nil {
			logger.Error(err, "orphaned VM cleanup failed on startup")
		}
	}()
	return ctrl.NewControllerManagedBy(mgr).
		For(&proxmoxv1alpha1.ProxmoxVM{}).
		Complete(r)
}
// Helper functions

// credentials holds the username/password pair (or API token id/secret)
// resolved from the provider config's Kubernetes secret.
type credentials struct {
	Username string
	Password string
}
// getCredentials resolves Proxmox credentials from the secret referenced by
// the given ProviderConfig. Two secret layouts are supported:
//
//   - "username" + "password" keys for password authentication, and
//   - "token" (+ optional "tokenid") keys for Proxmox API tokens, where the
//     token id plays the role of the username ("user@realm!tokenid") and the
//     token secret the role of the password; token keys win if both are set.
//
// Returns an error when the secret reference is absent, the secret cannot be
// fetched, or neither layout yields a complete pair.
func (r *ProxmoxVMReconciler) getCredentials(ctx context.Context, config *proxmoxv1alpha1.ProviderConfig) (*credentials, error) {
	secretRef := config.Spec.Credentials.SecretRef
	if secretRef == nil {
		return nil, fmt.Errorf("no secret reference in provider config")
	}

	// Fetch the secret from the cluster.
	var secret corev1.Secret
	key := client.ObjectKey{
		Namespace: secretRef.Namespace,
		Name:      secretRef.Name,
	}
	if err := r.Get(ctx, key, &secret); err != nil {
		return nil, errors.Wrap(err, "cannot get secret")
	}

	// Start from the username/password layout; absent keys decode to "".
	creds := credentials{
		Username: string(secret.Data["username"]),
		Password: string(secret.Data["password"]),
	}

	// A "token" key switches to token-based auth: the token secret replaces
	// the password, and "tokenid" (when present) replaces the username.
	if token, ok := secret.Data["token"]; ok {
		if tokenID, ok := secret.Data["tokenid"]; ok {
			creds.Username = string(tokenID)
		}
		creds.Password = string(token)
	}

	if creds.Username == "" || creds.Password == "" {
		return nil, fmt.Errorf("username/password or token missing in secret")
	}
	return &creds, nil
}
// findSite returns the site in the provider config whose Name matches
// siteName, or an error when no such site is configured.
//
// It indexes into the slice rather than taking the address of the range
// variable, so the returned pointer refers to the config's actual element
// (and avoids the pre-Go-1.22 loop-variable aliasing pitfall).
func (r *ProxmoxVMReconciler) findSite(config *proxmoxv1alpha1.ProviderConfig, siteName string) (*proxmoxv1alpha1.ProxmoxSite, error) {
	for i := range config.Spec.Sites {
		if config.Spec.Sites[i].Name == siteName {
			return &config.Spec.Sites[i], nil
		}
	}
	return nil, fmt.Errorf("site %s not found", siteName)
}
// Helper functions for quota enforcement (use shared utils)

// intPtr returns a pointer to a fresh copy of i, as needed by the optional
// pointer fields of the quota request types.
func intPtr(i int) *int {
	v := i
	return &v
}