From 118e5ded65407b8b3f9184a816cc55c03d3ec685 Mon Sep 17 00:00:00 2001 From: Rajesh Putta Venkata Date: Thu, 13 Feb 2025 20:46:26 -0800 Subject: [PATCH] Add vsphere failure domain validation --- pkg/api/v1alpha1/cluster_types.go | 3 + .../vsphere/reconciler/reconciler.go | 18 ++ .../vsphere/reconciler/reconciler_test.go | 67 +++++ pkg/providers/vsphere/validator.go | 37 ++- pkg/providers/vsphere/validator_test.go | 279 ++++++++++++++++-- pkg/providers/vsphere/vsphere.go | 4 +- 6 files changed, 385 insertions(+), 23 deletions(-) diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go index ae850db62405f..c456afab6867d 100644 --- a/pkg/api/v1alpha1/cluster_types.go +++ b/pkg/api/v1alpha1/cluster_types.go @@ -966,6 +966,9 @@ const ( // MachineConfigInvalidReason reports that the Cluster machineconfig validation has failed. MachineConfigInvalidReason FailureReasonType = "MachineConfigInvalid" + // FailureDomainInvalidReason reports that the Cluster failure domain validation has failed. + FailureDomainInvalidReason FailureReasonType = "FailureDomainInvalid" + // UnavailableControlPlaneIPReason reports that the Cluster controlPlaneIP is already in use. 
UnavailableControlPlaneIPReason FailureReasonType = "UnavailableControlPlaneIP" diff --git a/pkg/providers/vsphere/reconciler/reconciler.go b/pkg/providers/vsphere/reconciler/reconciler.go index 2b4c167446ba0..6e350d17e0f69 100644 --- a/pkg/providers/vsphere/reconciler/reconciler.go +++ b/pkg/providers/vsphere/reconciler/reconciler.go @@ -122,6 +122,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, log logr.Logger, cluster *an r.ipValidator.ValidateControlPlaneIP, r.ValidateDatacenterConfig, r.ValidateMachineConfigs, + r.ValidateFailureDomains, clusters.CleanupStatusAfterValidate, r.ReconcileFailureDomains, r.ReconcileControlPlane, @@ -173,6 +174,23 @@ func (r *Reconciler) ValidateMachineConfigs(ctx context.Context, log logr.Logger return controller.Result{}, nil } +// ValidateFailureDomains validates the vSphere failure domains and the failure domain assigned to each worker node group when the failure domains feature is active; on a validation error it sets FailureDomainInvalidReason on the cluster and returns controller.ResultWithReturn() with a nil error. +func (r *Reconciler) ValidateFailureDomains(_ context.Context, log logr.Logger, clusterSpec *c.Spec) (controller.Result, error) { + if features.IsActive(features.VsphereFailureDomainEnabled()) { + log = log.WithValues("phase", "validateFailureDomains") + + vsphereClusterSpec := vsphere.NewSpec(clusterSpec) + + if err := r.validator.ValidateFailureDomains(vsphereClusterSpec); err != nil { + log.Error(err, "Invalid Failure domain setup") + failureMessage := err.Error() + clusterSpec.Cluster.SetFailure(anywherev1.FailureDomainInvalidReason, failureMessage) + return controller.ResultWithReturn(), nil + } + } + return controller.Result{}, nil +} + // ReconcileFailureDomains applies the Vsphere FailureDomain objects to the cluster. // It also takes care of deleting the old Vsphere FailureDomains that are not in in the cluster spec anymore. 
func (r *Reconciler) ReconcileFailureDomains(ctx context.Context, log logr.Logger, spec *c.Spec) (controller.Result, error) { diff --git a/pkg/providers/vsphere/reconciler/reconciler_test.go b/pkg/providers/vsphere/reconciler/reconciler_test.go index 373532b419d39..a22256d09b2de 100644 --- a/pkg/providers/vsphere/reconciler/reconciler_test.go +++ b/pkg/providers/vsphere/reconciler/reconciler_test.go @@ -188,6 +188,73 @@ func TestReconcilerFailureDomainsSuccess(t *testing.T) { features.ClearCache() } +func TestValidateFailureDomainsSuccess(t *testing.T) { + features.ClearCache() + t.Setenv(features.VSphereFailureDomainEnabledEnvVar, "true") + tt := newReconcilerTest(t) + tt.eksaSupportObjs = append(tt.eksaSupportObjs, test.CAPICluster(func(c *clusterv1.Cluster) { + c.Name = tt.cluster.Name + })) + tt.createAllObjs() + + spec := tt.buildSpec() + spec.VSphereDatacenter.Spec.Server = "myServer" + spec.VSphereDatacenter.Spec.Datacenter = "myDatacenter" + spec.VSphereDatacenter.Spec.Network = "/myDatacenter/network/myNetwork" + spec.VSphereDatacenter.Spec.FailureDomains = []anywherev1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + } + + result, err := tt.reconciler().ValidateFailureDomains(tt.ctx, test.NewNullLogger(), spec) + + tt.Expect(err).NotTo(HaveOccurred()) + tt.Expect(tt.cluster.Status.FailureMessage).To(BeZero()) + tt.Expect(tt.cluster.Status.FailureReason).To(BeZero()) + tt.Expect(result).To(Equal(controller.Result{})) + features.ClearCache() +} + +func TestValidateFailureDomainsFailure(t *testing.T) { + features.ClearCache() + t.Setenv(features.VSphereFailureDomainEnabledEnvVar, "true") + tt := newReconcilerTest(t) + tt.eksaSupportObjs = append(tt.eksaSupportObjs, test.CAPICluster(func(c *clusterv1.Cluster) { + c.Name = tt.cluster.Name + })) + tt.createAllObjs() + + spec := tt.buildSpec() + 
spec.Cluster.Spec.WorkerNodeGroupConfigurations[0].FailureDomains = []string{"fd-2"} + spec.VSphereDatacenter.Spec.Server = "myServer" + spec.VSphereDatacenter.Spec.Datacenter = "myDatacenter" + spec.VSphereDatacenter.Spec.Network = "/myDatacenter/network/myNetwork" + spec.VSphereDatacenter.Spec.FailureDomains = []anywherev1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + } + + result, err := tt.reconciler().ValidateFailureDomains(tt.ctx, test.NewNullLogger(), spec) + + tt.Expect(err).To(BeNil(), "error should be nil to prevent requeue") + tt.Expect(result).To(Equal(controller.Result{Result: &reconcile.Result{}}), "result should stop reconciliation") + tt.Expect(tt.cluster.Status.FailureMessage).To(HaveValue(ContainSubstring("provided invalid failure domain"))) + tt.Expect(tt.cluster.Status.FailureReason).To(HaveValue(Equal(anywherev1.FailureDomainInvalidReason))) + features.ClearCache() +} + func TestReconcilerReconcileInvalidDatacenterConfig(t *testing.T) { tt := newReconcilerTest(t) logger := test.NewNullLogger() diff --git a/pkg/providers/vsphere/validator.go b/pkg/providers/vsphere/validator.go index f4d78e752e2d7..997f7a2480e35 100644 --- a/pkg/providers/vsphere/validator.go +++ b/pkg/providers/vsphere/validator.go @@ -12,6 +12,7 @@ import ( "gopkg.in/yaml.v2" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" + "github.com/aws/eks-anywhere/pkg/collection" "github.com/aws/eks-anywhere/pkg/config" "github.com/aws/eks-anywhere/pkg/features" "github.com/aws/eks-anywhere/pkg/govmomi" @@ -90,10 +91,40 @@ func (v *Validator) ValidateVCenterConfig(ctx context.Context, datacenterConfig } // ValidateFailureDomains validates the provided list of failure domains. 
-func (v *Validator) ValidateFailureDomains(datacenterConfig *anywherev1.VSphereDatacenterConfig) error { - if !features.IsActive(features.VsphereFailureDomainEnabled()) && len(datacenterConfig.Spec.FailureDomains) > 0 { - return fmt.Errorf("Failure Domains feature is not enabled. Please set the env variable %v", features.VSphereFailureDomainEnabledEnvVar) +func (v *Validator) ValidateFailureDomains(vsphereClusterSpec *Spec) error { + if !features.IsActive(features.VsphereFailureDomainEnabled()) { + if len(vsphereClusterSpec.VSphereDatacenter.Spec.FailureDomains) > 0 { + return fmt.Errorf("Failure Domains feature is not enabled. Please set the env variable %v", features.VSphereFailureDomainEnabledEnvVar) + } + return nil + } + + if err := vsphereClusterSpec.VSphereDatacenter.Validate(); err != nil { + return err } + + providedFailureDomains := collection.MapSet(vsphereClusterSpec.VSphereDatacenter.Spec.FailureDomains, func(fd anywherev1.FailureDomain) string { + return fd.Name + }) + + for _, wng := range vsphereClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { + + if len(wng.FailureDomains) == 0 { + continue + } + + if len(wng.FailureDomains) > 1 { + return fmt.Errorf("multiple failure domains provided in the worker node group: %s. 
Please provide only one failure domain", wng.Name) + } + + assignedFailureDomain := wng.FailureDomains[0] + + if !providedFailureDomains.Contains(assignedFailureDomain) { + return fmt.Errorf("provided invalid failure domain %s in the worker node group %s", assignedFailureDomain, wng.Name) + } + + } + return nil } diff --git a/pkg/providers/vsphere/validator_test.go b/pkg/providers/vsphere/validator_test.go index 878d6121f77c6..c4090f08651ad 100644 --- a/pkg/providers/vsphere/validator_test.go +++ b/pkg/providers/vsphere/validator_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "errors" "fmt" + "strconv" "testing" "github.com/golang/mock/gomock" @@ -368,29 +369,271 @@ func TestValidatorValidateMachineConfigTemplateDoesNotExist(t *testing.T) { g.Expect(err).To(MatchError("validating template: not found")) } -func TestValidateFailureDomainsFailure(t *testing.T) { - ctrl := gomock.NewController(t) - govc := govcmocks.NewMockProviderGovcClient(ctrl) - - v := Validator{ - govc: govc, - } - - datacenterConfig := &v1alpha1.VSphereDatacenterConfig{ - Spec: v1alpha1.VSphereDatacenterConfigSpec{ - Datacenter: "SDDC-Datacenter", - FailureDomains: []v1alpha1.FailureDomain{ - { - Name: "fd-1", +func TestValidateFailureDomains(t *testing.T) { + tests := []struct { + name string + spec *Spec + expectedErr string + enableVsphereFailureDomain bool + }{ + { + name: "TestValidateFailureDomains success case", + enableVsphereFailureDomain: true, + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &v1alpha1.Cluster{ + Spec: v1alpha1.ClusterSpec{ + WorkerNodeGroupConfigurations: []v1alpha1.WorkerNodeGroupConfiguration{ + { + Name: "wd-1", + FailureDomains: []string{"fd-1"}, + }, + }, + }, + }, + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "myDatacenter", + Server: "myServer", + Network: "/myDatacenter/network/myNetwork", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + 
ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "TestValidateFailureDomains with feature flag disabled and with failure domains", + enableVsphereFailureDomain: false, + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "SDDC-Datacenter", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + }, + }, + }, + }, + }, + }, + }, + expectedErr: "Failure Domains feature is not enabled", + }, + { + name: "TestValidateFailureDomains with feature flag disabled and without failure domains", + enableVsphereFailureDomain: false, + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "SDDC-Datacenter", + }, + }, + }, + }, + }, + }, + { + name: "TestValidateFailureDomains invalid failure domain case", + enableVsphereFailureDomain: true, + expectedErr: "network is not set or is empty", + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &v1alpha1.Cluster{ + Spec: v1alpha1.ClusterSpec{ + WorkerNodeGroupConfigurations: []v1alpha1.WorkerNodeGroupConfiguration{ + { + Name: "wd-1", + FailureDomains: []string{"fd-1"}, + }, + }, + }, + }, + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "myDatacenter", + Server: "myServer", + Network: "/myDatacenter/network/myNetwork", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "TestValidateFailureDomains worker node group without assigned 
failure domain case", + enableVsphereFailureDomain: true, + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &v1alpha1.Cluster{ + Spec: v1alpha1.ClusterSpec{ + WorkerNodeGroupConfigurations: []v1alpha1.WorkerNodeGroupConfiguration{ + { + Name: "wd-1", + }, + }, + }, + }, + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "myDatacenter", + Server: "myServer", + Network: "/myDatacenter/network/myNetwork", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "TestValidateFailureDomains worker node group with more than assigned failure domain case", + enableVsphereFailureDomain: true, + expectedErr: "multiple failure domains provided in the worker node group", + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &v1alpha1.Cluster{ + Spec: v1alpha1.ClusterSpec{ + WorkerNodeGroupConfigurations: []v1alpha1.WorkerNodeGroupConfiguration{ + { + Name: "wd-1", + FailureDomains: []string{"fd-1", "fd-2"}, + }, + }, + }, + }, + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "myDatacenter", + Server: "myServer", + Network: "/myDatacenter/network/myNetwork", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + { + Name: "fd-2", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "TestValidateFailureDomains worker node group with 
invalid assigned failure domain", + enableVsphereFailureDomain: true, + expectedErr: "provided invalid failure domain", + spec: &Spec{ + Spec: &cluster.Spec{ + Config: &cluster.Config{ + Cluster: &v1alpha1.Cluster{ + Spec: v1alpha1.ClusterSpec{ + WorkerNodeGroupConfigurations: []v1alpha1.WorkerNodeGroupConfiguration{ + { + Name: "wd-1", + FailureDomains: []string{"fd-3"}, + }, + }, + }, + }, + VSphereDatacenter: &v1alpha1.VSphereDatacenterConfig{ + Spec: v1alpha1.VSphereDatacenterConfigSpec{ + Datacenter: "myDatacenter", + Server: "myServer", + Network: "/myDatacenter/network/myNetwork", + FailureDomains: []v1alpha1.FailureDomain{ + { + Name: "fd-1", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + { + Name: "fd-2", + ComputeCluster: "myComputeCluster", + ResourcePool: "myResourcePool", + Datastore: "myDatastore", + Folder: "myFolder", + Network: "/myDatacenter/network/myNetwork", + }, + }, + }, + }, + }, }, }, }, } - t.Setenv(features.VSphereFailureDomainEnabledEnvVar, "false") - err := v.ValidateFailureDomains(datacenterConfig) - assert.NotNil(t, err) - features.ClearCache() + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(features.VSphereFailureDomainEnabledEnvVar, strconv.FormatBool(tt.enableVsphereFailureDomain)) + ctrl := gomock.NewController(t) + govc := govcmocks.NewMockProviderGovcClient(ctrl) + + v := Validator{ + govc: govc, + } + err := v.ValidateFailureDomains(tt.spec) + if tt.expectedErr != "" { + assert.ErrorContains(t, err, tt.expectedErr) + } else { + assert.NoError(t, err) + } + features.ClearCache() + }) + } } func TestValidateBRHardDiskSize(t *testing.T) { diff --git a/pkg/providers/vsphere/vsphere.go b/pkg/providers/vsphere/vsphere.go index e02137282b0ab..c91df5a9109da 100644 --- a/pkg/providers/vsphere/vsphere.go +++ b/pkg/providers/vsphere/vsphere.go @@ -320,7 +320,7 @@ func (p 
*vsphereProvider) SetupAndValidateCreateCluster(ctx context.Context, clu return err } - if err := p.validator.ValidateFailureDomains(vSphereClusterSpec.VSphereDatacenter); err != nil { + if err := p.validator.ValidateFailureDomains(vSphereClusterSpec); err != nil { return err } @@ -402,7 +402,7 @@ func (p *vsphereProvider) SetupAndValidateUpgradeCluster(ctx context.Context, cl return err } - if err := p.validator.ValidateFailureDomains(vSphereClusterSpec.VSphereDatacenter); err != nil { + if err := p.validator.ValidateFailureDomains(vSphereClusterSpec); err != nil { return err }