Add vsphere failure domain validation
rajeshvenkata committed Feb 14, 2025
1 parent 7ac7afd commit 118e5de
Showing 6 changed files with 385 additions and 23 deletions.
3 changes: 3 additions & 0 deletions pkg/api/v1alpha1/cluster_types.go
@@ -966,6 +966,9 @@ const (
// MachineConfigInvalidReason reports that the Cluster machineconfig validation has failed.
MachineConfigInvalidReason FailureReasonType = "MachineConfigInvalid"

// FailureDomainInvalidReason reports that the Cluster failure domain validation has failed.
FailureDomainInvalidReason FailureReasonType = "FailureDomainInvalid"

// UnavailableControlPlaneIPReason reports that the Cluster controlPlaneIP is already in use.
UnavailableControlPlaneIPReason FailureReasonType = "UnavailableControlPlaneIP"

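For context, the new FailureDomainInvalidReason constant is consumed later in this commit by clusterSpec.Cluster.SetFailure(anywherev1.FailureDomainInvalidReason, ...) and surfaces in the tests as Status.FailureReason / Status.FailureMessage. The following is a minimal, self-contained sketch of that pattern; the Cluster and ClusterStatus shapes are simplified stand-ins, not the actual eks-anywhere types.

// Illustrative only: simplified stand-ins for how a FailureReasonType such as
// FailureDomainInvalidReason is recorded on a cluster status.
package main

import "fmt"

type FailureReasonType string

const FailureDomainInvalidReason FailureReasonType = "FailureDomainInvalid"

type ClusterStatus struct {
	FailureReason  *FailureReasonType
	FailureMessage *string
}

type Cluster struct {
	Status ClusterStatus
}

// SetFailure mirrors how the reconciler later calls
// cluster.SetFailure(FailureDomainInvalidReason, err.Error()).
func (c *Cluster) SetFailure(reason FailureReasonType, message string) {
	c.Status.FailureReason = &reason
	c.Status.FailureMessage = &message
}

func main() {
	c := &Cluster{}
	c.SetFailure(FailureDomainInvalidReason, "provided invalid failure domain fd-2 in the worker node group md-0")
	fmt.Println(*c.Status.FailureReason, *c.Status.FailureMessage)
}
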
18 changes: 18 additions & 0 deletions pkg/providers/vsphere/reconciler/reconciler.go
@@ -122,6 +122,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, log logr.Logger, cluster *an
r.ipValidator.ValidateControlPlaneIP,
r.ValidateDatacenterConfig,
r.ValidateMachineConfigs,
r.ValidateFailureDomains,
clusters.CleanupStatusAfterValidate,
r.ReconcileFailureDomains,
r.ReconcileControlPlane,
@@ -173,6 +174,23 @@ func (r *Reconciler) ValidateMachineConfigs(ctx context.Context, log logr.Logger
return controller.Result{}, nil
}

// ValidateFailureDomains validates the provided failure domains and the failure domain assigned to each worker node group.
func (r *Reconciler) ValidateFailureDomains(_ context.Context, log logr.Logger, clusterSpec *c.Spec) (controller.Result, error) {
if features.IsActive(features.VsphereFailureDomainEnabled()) {
log = log.WithValues("phase", "validateFailureDomains")

vsphereClusterSpec := vsphere.NewSpec(clusterSpec)

if err := r.validator.ValidateFailureDomains(vsphereClusterSpec); err != nil {
log.Error(err, "Invalid failure domain setup")
failureMessage := err.Error()
clusterSpec.Cluster.SetFailure(anywherev1.FailureDomainInvalidReason, failureMessage)
return controller.ResultWithReturn(), nil
}
}
return controller.Result{}, nil
}
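
On validation failure this phase records the failure on the cluster status and returns controller.ResultWithReturn() with a nil error; the tests later in this commit assert that this yields Result{Result: &reconcile.Result{}} and stops reconciliation without triggering an error-driven requeue. Below is a hypothetical phase-runner sketch illustrating that contract; the Phase and Result types are simplified stand-ins for the eks-anywhere controller package.

// Illustrative only: a simplified runner showing why a non-empty result with a
// nil error halts the remaining phases without a requeue.
package main

import "fmt"

type Result struct {
	Return bool // set when a phase wants to halt the remaining phases
}

type Phase func() (Result, error)

func runPhases(phases ...Phase) error {
	for i, p := range phases {
		res, err := p()
		if err != nil {
			return err // a real error would cause a requeue
		}
		if res.Return {
			fmt.Printf("stopping after phase %d; failure already recorded on cluster status\n", i)
			return nil
		}
	}
	return nil
}

func main() {
	validate := func() (Result, error) { return Result{Return: true}, nil } // failed validation
	reconcile := func() (Result, error) { return Result{}, nil }            // never reached
	_ = runPhases(validate, reconcile)
}
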

// ReconcileFailureDomains applies the Vsphere FailureDomain objects to the cluster.
// It also takes care of deleting the old Vsphere FailureDomains that are no longer in the cluster spec.
func (r *Reconciler) ReconcileFailureDomains(ctx context.Context, log logr.Logger, spec *c.Spec) (controller.Result, error) {
67 changes: 67 additions & 0 deletions pkg/providers/vsphere/reconciler/reconciler_test.go
@@ -188,6 +188,73 @@ func TestReconcilerFailureDomainsSuccess(t *testing.T) {
features.ClearCache()
}

func TestValidateFailureDomainsSuccess(t *testing.T) {
features.ClearCache()
t.Setenv(features.VSphereFailureDomainEnabledEnvVar, "true")
tt := newReconcilerTest(t)
tt.eksaSupportObjs = append(tt.eksaSupportObjs, test.CAPICluster(func(c *clusterv1.Cluster) {
c.Name = tt.cluster.Name
}))
tt.createAllObjs()

spec := tt.buildSpec()
spec.VSphereDatacenter.Spec.Server = "myServer"
spec.VSphereDatacenter.Spec.Datacenter = "myDatacenter"
spec.VSphereDatacenter.Spec.Network = "/myDatacenter/network/myNetwork"
spec.VSphereDatacenter.Spec.FailureDomains = []anywherev1.FailureDomain{
{
Name: "fd-1",
ComputeCluster: "myComputeCluster",
ResourcePool: "myResourcePool",
Datastore: "myDatastore",
Folder: "myFolder",
Network: "/myDatacenter/network/myNetwork",
},
}

result, err := tt.reconciler().ValidateFailureDomains(tt.ctx, test.NewNullLogger(), spec)

tt.Expect(err).NotTo(HaveOccurred())
tt.Expect(tt.cluster.Status.FailureMessage).To(BeZero())
tt.Expect(tt.cluster.Status.FailureReason).To(BeZero())
tt.Expect(result).To(Equal(controller.Result{}))
features.ClearCache()
}

func TestValidateFailureDomainsFailure(t *testing.T) {
features.ClearCache()
t.Setenv(features.VSphereFailureDomainEnabledEnvVar, "true")
tt := newReconcilerTest(t)
tt.eksaSupportObjs = append(tt.eksaSupportObjs, test.CAPICluster(func(c *clusterv1.Cluster) {
c.Name = tt.cluster.Name
}))
tt.createAllObjs()

spec := tt.buildSpec()
spec.Cluster.Spec.WorkerNodeGroupConfigurations[0].FailureDomains = []string{"fd-2"}
spec.VSphereDatacenter.Spec.Server = "myServer"
spec.VSphereDatacenter.Spec.Datacenter = "myDatacenter"
spec.VSphereDatacenter.Spec.Network = "/myDatacenter/network/myNetwork"
spec.VSphereDatacenter.Spec.FailureDomains = []anywherev1.FailureDomain{
{
Name: "fd-1",
ComputeCluster: "myComputeCluster",
ResourcePool: "myResourcePool",
Datastore: "myDatastore",
Folder: "myFolder",
Network: "/myDatacenter/network/myNetwork",
},
}

result, err := tt.reconciler().ValidateFailureDomains(tt.ctx, test.NewNullLogger(), spec)

tt.Expect(err).To(BeNil(), "error should be nil to prevent requeue")
tt.Expect(result).To(Equal(controller.Result{Result: &reconcile.Result{}}), "result should stop reconciliation")
tt.Expect(tt.cluster.Status.FailureMessage).To(HaveValue(ContainSubstring("provided invalid failure domain")))
tt.Expect(tt.cluster.Status.FailureReason).To(HaveValue(Equal(anywherev1.FailureDomainInvalidReason)))
features.ClearCache()
}

func TestReconcilerReconcileInvalidDatacenterConfig(t *testing.T) {
tt := newReconcilerTest(t)
logger := test.NewNullLogger()
37 changes: 34 additions & 3 deletions pkg/providers/vsphere/validator.go
@@ -12,6 +12,7 @@ import (
"gopkg.in/yaml.v2"

anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/collection"
"github.com/aws/eks-anywhere/pkg/config"
"github.com/aws/eks-anywhere/pkg/features"
"github.com/aws/eks-anywhere/pkg/govmomi"
@@ -90,10 +91,40 @@ func (v *Validator) ValidateVCenterConfig(ctx context.Context, datacenterConfig
}

// ValidateFailureDomains validates the provided list of failure domains.
func (v *Validator) ValidateFailureDomains(datacenterConfig *anywherev1.VSphereDatacenterConfig) error {
if !features.IsActive(features.VsphereFailureDomainEnabled()) && len(datacenterConfig.Spec.FailureDomains) > 0 {
return fmt.Errorf("Failure Domains feature is not enabled. Please set the env variable %v", features.VSphereFailureDomainEnabledEnvVar)
func (v *Validator) ValidateFailureDomains(vsphereClusterSpec *Spec) error {
if !features.IsActive(features.VsphereFailureDomainEnabled()) {
if len(vsphereClusterSpec.VSphereDatacenter.Spec.FailureDomains) > 0 {
return fmt.Errorf("Failure Domains feature is not enabled. Please set the env variable %v", features.VSphereFailureDomainEnabledEnvVar)
}
return nil
}

if err := vsphereClusterSpec.VSphereDatacenter.Validate(); err != nil {
return err
}

providedFailureDomains := collection.MapSet(vsphereClusterSpec.VSphereDatacenter.Spec.FailureDomains, func(fd anywherev1.FailureDomain) string {
return fd.Name
})

for _, wng := range vsphereClusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {

if len(wng.FailureDomains) == 0 {
continue
}

if len(wng.FailureDomains) > 1 {
return fmt.Errorf("multiple failure domains provided in the worker node group: %s. Please provide only one failure domain", wng.Name)
}

assignedFailureDomain := wng.FailureDomains[0]

if !providedFailureDomains.Contains(assignedFailureDomain) {
return fmt.Errorf("provided invalid failure domain %s in the worker node group %s", assignedFailureDomain, wng.Name)
}

}

return nil
}
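
A self-contained sketch of the membership check above, under the assumption that collection.MapSet builds a set of the mapped values with a Contains lookup; the plain map[string]struct{} and the FailureDomain / worker node group types below are simplified stand-ins for the real eks-anywhere helpers.

// Illustrative only: the same provided-vs-assigned failure domain check,
// rewritten without the eks-anywhere collection package.
package main

import "fmt"

type FailureDomain struct{ Name string }

type WorkerNodeGroup struct {
	Name           string
	FailureDomains []string
}

func validateWorkerFailureDomains(provided []FailureDomain, groups []WorkerNodeGroup) error {
	set := map[string]struct{}{}
	for _, fd := range provided {
		set[fd.Name] = struct{}{}
	}
	for _, wng := range groups {
		if len(wng.FailureDomains) == 0 {
			continue
		}
		if len(wng.FailureDomains) > 1 {
			return fmt.Errorf("multiple failure domains provided in the worker node group: %s", wng.Name)
		}
		if _, ok := set[wng.FailureDomains[0]]; !ok {
			return fmt.Errorf("provided invalid failure domain %s in the worker node group %s", wng.FailureDomains[0], wng.Name)
		}
	}
	return nil
}

func main() {
	provided := []FailureDomain{{Name: "fd-1"}}
	groups := []WorkerNodeGroup{{Name: "md-0", FailureDomains: []string{"fd-2"}}}
	fmt.Println(validateWorkerFailureDomains(provided, groups)) // provided invalid failure domain fd-2 ...
}
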

(Diff for the remaining 2 of the 6 changed files not shown.)