From 5d54746e010854a1316e72733e13577d4d5695ec Mon Sep 17 00:00:00 2001
From: Christian Schlotter
Date: Tue, 22 Oct 2024 21:10:51 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Update=20machineset=20with=20v1beta?=
 =?UTF-8?q?2=20status=20test=20(#11278)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* machineset: prepare splitting reconcile functions

* improve and first unit tests

* fixes

* review fixes

* review fixes

* fix
---
 api/v1beta1/machineset_types.go               |  75 ++
 api/v1beta1/v1beta2_condition_consts.go       |  36 +-
 .../controllers/machine/machine_controller.go |   7 +-
 .../machineset/machineset_controller.go       | 389 ++++++---
 .../machineset_controller_status.go           | 326 ++++++++
 .../machineset_controller_status_test.go      | 775 ++++++++++++++++++
 .../machineset/machineset_controller_test.go  | 131 ++-
 util/log/log.go                               |  35 +-
 8 files changed, 1597 insertions(+), 177 deletions(-)
 create mode 100644 internal/controllers/machineset/machineset_controller_status.go
 create mode 100644 internal/controllers/machineset/machineset_controller_status_test.go

diff --git a/api/v1beta1/machineset_types.go b/api/v1beta1/machineset_types.go
index e7dc0afd4767..de32436a1695 100644
--- a/api/v1beta1/machineset_types.go
+++ b/api/v1beta1/machineset_types.go
@@ -87,6 +87,81 @@ type MachineSetSpec struct {
 	Template MachineTemplateSpec `json:"template,omitempty"`
 }
 
+// MachineSet's ScalingUp condition and corresponding reasons that will be used in v1Beta2 API version.
+const (
+	// MachineSetScalingUpV1Beta2Condition is true if actual replicas < desired replicas.
+	MachineSetScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition
+
+	// MachineSetScalingUpV1Beta2Reason surfaces when actual replicas < desired replicas.
+	MachineSetScalingUpV1Beta2Reason = ScalingUpV1Beta2Reason
+
+	// MachineSetNotScalingUpV1Beta2Reason surfaces when actual replicas >= desired replicas.
+	MachineSetNotScalingUpV1Beta2Reason = NotScalingUpV1Beta2Reason
+
+	// MachineSetScalingUpInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines.
+	MachineSetScalingUpInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
+
+	// MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason surfaces when the .spec.replicas
+	// field of the MachineSet is not set.
+	MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason = WaitingForReplicasSetV1Beta2Reason
+)
+
+// MachineSet's ScalingDown condition and corresponding reasons that will be used in v1Beta2 API version.
+const (
+	// MachineSetScalingDownV1Beta2Condition is true if actual replicas > desired replicas.
+	MachineSetScalingDownV1Beta2Condition = ScalingDownV1Beta2Condition
+
+	// MachineSetScalingDownV1Beta2Reason surfaces when actual replicas > desired replicas.
+	MachineSetScalingDownV1Beta2Reason = ScalingDownV1Beta2Reason
+
+	// MachineSetNotScalingDownV1Beta2Reason surfaces when actual replicas <= desired replicas.
+	MachineSetNotScalingDownV1Beta2Reason = NotScalingDownV1Beta2Reason
+
+	// MachineSetScalingDownInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines.
+	MachineSetScalingDownInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
+
+	// MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason surfaces when the .spec.replicas
+	// field of the MachineSet is not set.
+	MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason = WaitingForReplicasSetV1Beta2Reason
+)
+
+// MachineSet's MachinesReady condition and corresponding reasons that will be used in v1Beta2 API version.
+// Note: Reason's could also be derived from the aggregation of machine's Ready conditions. +const ( + // MachineSetMachinesReadyV1Beta2Condition surfaces detail of issues on the controlled machines, if any. + MachineSetMachinesReadyV1Beta2Condition = MachinesReadyV1Beta2Condition + + // MachineSetMachinesReadyNoReplicasV1Beta2Reason surfaces when no machines exist for the MachineSet. + MachineSetMachinesReadyNoReplicasV1Beta2Reason = "NoReplicas" + + // MachineSetMachinesReadyInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines + // or aggregating machine's conditions. + MachineSetMachinesReadyInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason +) + +// MachineSet's MachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. +// Note: Reason's could also be derived from the aggregation of machine's MachinesUpToDate conditions. +const ( + // MachineSetMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. + MachineSetMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition + + // MachineSetMachinesUpToDateNoReplicasV1Beta2Reason surfaces when no machines exist for the MachineSet. + MachineSetMachinesUpToDateNoReplicasV1Beta2Reason = "NoReplicas" + + // MachineSetMachinesUpToDateInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines + // or aggregating status. + MachineSetMachinesUpToDateInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason +) + +// Conditions that will be used for the MachineSet object in v1Beta2 API version. +const ( + // MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any. + MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition + + // MachineSetDeletingV1Beta2Condition surfaces details about ongoing deletion of the controlled machines. + MachineSetDeletingV1Beta2Condition = DeletingV1Beta2Condition +) + // ANCHOR_END: MachineSetSpec // ANCHOR: MachineTemplateSpec diff --git a/api/v1beta1/v1beta2_condition_consts.go b/api/v1beta1/v1beta2_condition_consts.go index d6784d0c3392..cd71b87b3448 100644 --- a/api/v1beta1/v1beta2_condition_consts.go +++ b/api/v1beta1/v1beta2_condition_consts.go @@ -90,6 +90,21 @@ const ( // AvailableV1Beta2Reason applies to a condition surfacing object availability. AvailableV1Beta2Reason = "Available" + // ScalingUpV1Beta2Reason surfaces when an object is scaling up. + ScalingUpV1Beta2Reason = "ScalingUp" + + // NotScalingUpV1Beta2Reason surfaces when an object is not scaling up. + NotScalingUpV1Beta2Reason = "NotScalingUp" + + // ScalingDownV1Beta2Reason surfaces when an object is scaling down. + ScalingDownV1Beta2Reason = "ScalingDown" + + // NotScalingDownV1Beta2Reason surfaces when an object is not scaling down. + NotScalingDownV1Beta2Reason = "NotScalingDown" + + // WaitingForReplicasSetV1Beta2Reason surfaces when the replica field of an object is not set. + WaitingForReplicasSetV1Beta2Reason = "WaitingForReplicasSet" + // InvalidConditionReportedV1Beta2Reason applies to a condition, usually read from an external object, that is invalid // (e.g. its status is missing). InvalidConditionReportedV1Beta2Reason = "InvalidConditionReported" @@ -144,27 +159,6 @@ const ( InspectionFailedV1Beta2Reason = "InspectionFailed" ) -// Conditions that will be used for the MachineSet object in v1Beta2 API version. -const ( - // MachineSetMachinesReadyV1Beta2Condition surfaces detail of issues on the controlled machines, if any. 
- MachineSetMachinesReadyV1Beta2Condition = MachinesReadyV1Beta2Condition - - // MachineSetMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. - MachineSetMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition - - // MachineSetScalingUpV1Beta2Condition is true if available replicas < desired replicas. - MachineSetScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition - - // MachineSetScalingDownV1Beta2Condition is true if replicas > desired replicas. - MachineSetScalingDownV1Beta2Condition = ScalingDownV1Beta2Condition - - // MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any. - MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition - - // MachineSetDeletingV1Beta2Condition surfaces details about ongoing deletion of the controlled machines. - MachineSetDeletingV1Beta2Condition = DeletingV1Beta2Condition -) - // Conditions that will be used for the MachineDeployment object in v1Beta2 API version. const ( // MachineDeploymentAvailableV1Beta2Condition is true if the MachineDeployment is not deleted, and it has minimum diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 9babad1c13a3..9eb029ef43b4 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -328,7 +328,12 @@ func doReconcile(ctx context.Context, phases []machineReconcileFunc, s *scope) ( } res = util.LowestNonZeroResult(res, phaseResult) } - return res, kerrors.NewAggregate(errs) + + if len(errs) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errs) + } + + return res, nil } // scope holds the different objects that are read and used during the reconcile. diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index ec81faa40593..22a7bad55a95 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -140,7 +140,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return nil } -func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ctrl.Result, reterr error) { machineSet := &clusterv1.MachineSet{} if err := r.Client.Get(ctx, req.NamespacedName, machineSet); err != nil { if apierrors.IsNotFound(err) { @@ -176,37 +176,129 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re return ctrl.Result{}, err } + s := &scope{ + cluster: cluster, + machineSet: machineSet, + } + // Initialize the patch helper - patchHelper, err := patch.NewHelper(machineSet, r.Client) + patchHelper, err := patch.NewHelper(s.machineSet, r.Client) if err != nil { return ctrl.Result{}, err } defer func() { + if err := r.reconcileStatus(ctx, s); err != nil { + reterr = kerrors.NewAggregate([]error{reterr, errors.Wrapf(err, "failed to update status")}) + } + + r.reconcileV1Beta2Status(ctx, s) + // Always attempt to patch the object and status after each reconciliation. 
- if err := patchMachineSet(ctx, patchHelper, machineSet); err != nil { + if err := patchMachineSet(ctx, patchHelper, s.machineSet); err != nil { reterr = kerrors.NewAggregate([]error{reterr, err}) } + + if reterr != nil { + retres = ctrl.Result{} + return + } + + // Adjust requeue when scaling up + if s.machineSet.DeletionTimestamp.IsZero() && reterr == nil { + retres = util.LowestNonZeroResult(retres, shouldRequeueForReplicaCountersRefresh(s)) + } }() + if isDeploymentChild(s.machineSet) { + // If the MachineSet is in a MachineDeployment, try to get the owning MachineDeployment. + s.owningMachineDeployment, err = r.getOwnerMachineDeployment(ctx, s.machineSet) + if err != nil { + return ctrl.Result{}, err + } + } + + alwaysReconcile := []machineSetReconcileFunc{ + wrapErrMachineSetReconcileFunc(r.reconcileMachineSetOwnerAndLabels, "failed to set MachineSet owner and labels"), + wrapErrMachineSetReconcileFunc(r.reconcileInfrastructure, "failed to reconcile infrastructure"), + wrapErrMachineSetReconcileFunc(r.reconcileBootstrapConfig, "failed to reconcile bootstrapConfig"), + wrapErrMachineSetReconcileFunc(r.getAndAdoptMachinesForMachineSet, "failed to get and adopt Machines for MachineSet"), + } + // Handle deletion reconciliation loop. - if !machineSet.DeletionTimestamp.IsZero() { - return ctrl.Result{}, r.reconcileDelete(ctx, machineSet) + if !s.machineSet.DeletionTimestamp.IsZero() { + reconcileDelete := append( + alwaysReconcile, + wrapErrMachineSetReconcileFunc(r.reconcileDelete, "failed to reconcile delete"), + ) + return doReconcile(ctx, s, reconcileDelete) } - result, err := r.reconcile(ctx, cluster, machineSet) - if err != nil { + reconcileNormal := append(alwaysReconcile, + wrapErrMachineSetReconcileFunc(r.reconcileUnhealthyMachines, "failed to reconcile unhealthy machines"), + wrapErrMachineSetReconcileFunc(r.syncMachines, "failed to sync Machines"), + wrapErrMachineSetReconcileFunc(r.syncReplicas, "failed to sync replicas"), + ) + + result, kerr := doReconcile(ctx, s, reconcileNormal) + if kerr != nil { // Requeue if the reconcile failed because connection to workload cluster was down. 
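The phase lists assembled above (alwaysReconcile, reconcileDelete, reconcileNormal) all rely on one small pattern: each step is a function over a shared scope, wrapped with a stable error prefix, and executed in order by doReconcile. The following minimal, self-contained Go sketch restates that pattern; the names demoScope, demoPhase, wrapErr and runPhases are illustrative stand-ins, not part of this change.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/pkg/errors"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	ctrl "sigs.k8s.io/controller-runtime"

	"sigs.k8s.io/cluster-api/util"
)

// demoScope mirrors the controller's scope: state shared between phases.
type demoScope struct {
	machinesListed bool
}

type demoPhase func(ctx context.Context, s *demoScope) (ctrl.Result, error)

// wrapErr mirrors wrapErrMachineSetReconcileFunc: it prefixes any phase error with a stable message.
func wrapErr(f demoPhase, msg string) demoPhase {
	return func(ctx context.Context, s *demoScope) (ctrl.Result, error) {
		res, err := f(ctx, s)
		return res, errors.Wrap(err, msg)
	}
}

// runPhases mirrors doReconcile: run phases in order, collect errors, and once any
// error occurred stop folding later results into the returned ctrl.Result.
func runPhases(ctx context.Context, s *demoScope, phases []demoPhase) (ctrl.Result, error) {
	res := ctrl.Result{}
	errs := []error{}
	for _, phase := range phases {
		phaseResult, err := phase(ctx, s)
		if err != nil {
			errs = append(errs, err)
		}
		if len(errs) > 0 {
			continue
		}
		res = util.LowestNonZeroResult(res, phaseResult)
	}
	if len(errs) > 0 {
		return ctrl.Result{}, kerrors.NewAggregate(errs)
	}
	return res, nil
}

func main() {
	listMachines := func(_ context.Context, s *demoScope) (ctrl.Result, error) {
		s.machinesListed = true
		return ctrl.Result{}, nil
	}
	syncReplicas := func(_ context.Context, s *demoScope) (ctrl.Result, error) {
		if !s.machinesListed {
			return ctrl.Result{}, nil // skip work when an earlier phase did not complete
		}
		return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
	}

	res, err := runPhases(context.Background(), &demoScope{},
		[]demoPhase{
			wrapErr(listMachines, "failed to list machines"),
			wrapErr(syncReplicas, "failed to sync replicas"),
		})
	fmt.Println(res.RequeueAfter, err) // 15s <nil>
}

As in the controller, once any phase fails the aggregated error is surfaced a single time and the Result is reset, so a failed reconcile never reports a partial requeue interval.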
- if errors.Is(err, clustercache.ErrClusterNotConnected) { - log.V(5).Info("Requeuing because connection to the workload cluster is down") + if errors.Is(kerr, clustercache.ErrClusterNotConnected) { + if len(kerr.Errors()) > 1 { + log.Error(kerr, "Requeuing because connection to the workload cluster is down") + } else { + log.V(5).Info("Requeuing because connection to the workload cluster is down") + } return ctrl.Result{RequeueAfter: time.Minute}, nil } - r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err) + err = kerr + r.recorder.Eventf(s.machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", kerr) } return result, err } -func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet, options ...patch.Option) error { +type scope struct { + machineSet *clusterv1.MachineSet + cluster *clusterv1.Cluster + machines []*clusterv1.Machine + bootstrapObjectNotFound bool + infrastructureObjectNotFound bool + getAndAdoptMachinesForMachineSetSucceeded bool + owningMachineDeployment *clusterv1.MachineDeployment +} + +type machineSetReconcileFunc func(ctx context.Context, s *scope) (ctrl.Result, error) + +func wrapErrMachineSetReconcileFunc(f machineSetReconcileFunc, msg string) machineSetReconcileFunc { + return func(ctx context.Context, s *scope) (ctrl.Result, error) { + res, err := f(ctx, s) + return res, errors.Wrap(err, msg) + } +} + +func doReconcile(ctx context.Context, s *scope, phases []machineSetReconcileFunc) (ctrl.Result, kerrors.Aggregate) { + res := ctrl.Result{} + errs := []error{} + for _, phase := range phases { + // Call the inner reconciliation methods. + phaseResult, err := phase(ctx, s) + if err != nil { + errs = append(errs, err) + } + if len(errs) > 0 { + continue + } + res = util.LowestNonZeroResult(res, phaseResult) + } + + if len(errs) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errs) + } + + return res, nil +} + +func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet) error { // Always update the readyCondition by summarizing the state of other conditions. conditions.SetSummary(machineSet, conditions.WithConditions( @@ -217,18 +309,28 @@ func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet ) // Patch the object, ignoring conflicts on the conditions owned by this controller. - options = append(options, + options := []patch.Option{ patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ clusterv1.ReadyCondition, clusterv1.MachinesCreatedCondition, clusterv1.ResizedCondition, clusterv1.MachinesReadyCondition, }}, - ) + patch.WithOwnedV1Beta2Conditions{Conditions: []string{ + clusterv1.MachineSetScalingUpV1Beta2Condition, + clusterv1.MachineSetScalingDownV1Beta2Condition, + clusterv1.MachineSetMachinesReadyV1Beta2Condition, + clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + clusterv1.MachineSetRemediatingV1Beta2Condition, + clusterv1.MachineSetDeletingV1Beta2Condition, + }}, + } return patchHelper.Patch(ctx, machineSet, options...) } -func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) { +func (r *Reconciler) reconcileMachineSetOwnerAndLabels(_ context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + cluster := s.cluster // Reconcile and retrieve the Cluster object. 
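patchMachineSet above declares which conditions this controller owns, so the patch helper only overwrites those and leaves conditions written by other controllers untouched. A minimal sketch of that call shape, assuming a client.Client c and a MachineSet ms read earlier in the reconcile (patchWithOwnedConditions is a hypothetical helper):

package example

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/patch"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// patchWithOwnedConditions shows the call shape used by patchMachineSet: create the
// helper from the object as read from the server, mutate the object, then patch while
// declaring the conditions this controller owns.
func patchWithOwnedConditions(ctx context.Context, c client.Client, ms *clusterv1.MachineSet) error {
	patchHelper, err := patch.NewHelper(ms, c)
	if err != nil {
		return err
	}

	// ... mutate ms.Status and its conditions here ...

	return patchHelper.Patch(ctx, ms,
		// v1beta1 conditions owned by this controller.
		patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
			clusterv1.ReadyCondition,
		}},
		// v1beta2 conditions owned by this controller; conditions owned by others are preserved.
		patch.WithOwnedV1Beta2Conditions{Conditions: []string{
			clusterv1.MachineSetScalingUpV1Beta2Condition,
			clusterv1.MachineSetScalingDownV1Beta2Condition,
		}},
	)
}

Note that the helper snapshots the object when NewHelper is called, so it has to be created before the status mutations that should end up in the patch.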
if machineSet.Labels == nil { machineSet.Labels = make(map[string]string) @@ -246,17 +348,6 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, })) } - // Make sure to reconcile the external infrastructure reference. - if err := r.reconcileExternalTemplateReference(ctx, cluster, machineSet, &machineSet.Spec.Template.Spec.InfrastructureRef); err != nil { - return ctrl.Result{}, err - } - // Make sure to reconcile the external bootstrap reference, if any. - if machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil { - if err := r.reconcileExternalTemplateReference(ctx, cluster, machineSet, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef); err != nil { - return ctrl.Result{}, err - } - } - // Make sure selector and template to be in the same cluster. if machineSet.Spec.Selector.MatchLabels == nil { machineSet.Spec.Selector.MatchLabels = make(map[string]string) @@ -269,97 +360,70 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet.Spec.Selector.MatchLabels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName machineSet.Spec.Template.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName - machines, err := r.getAndAdoptMachinesForMachineSet(ctx, machineSet) - if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to list Machines") - } - - result := ctrl.Result{} + return ctrl.Result{}, nil +} - reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) +func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctrl.Result, error) { + cluster := s.cluster + machineSet := s.machineSet + // Make sure to reconcile the external infrastructure reference. + var err error + s.infrastructureObjectNotFound, err = r.reconcileExternalTemplateReference(ctx, cluster, machineSet, s.owningMachineDeployment, &machineSet.Spec.Template.Spec.InfrastructureRef) if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines") - } - result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult) - - if err := r.syncMachines(ctx, machineSet, machines); err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to update Machines") - } - - syncReplicasResult, syncErr := r.syncReplicas(ctx, cluster, machineSet, machines) - result = util.LowestNonZeroResult(result, syncReplicasResult) - - // Always updates status as machines come up or die. - if err := r.updateStatus(ctx, cluster, machineSet, machines); err != nil { - return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate([]error{err, syncErr}), "failed to update MachineSet's Status") - } - - if syncErr != nil { - return ctrl.Result{}, errors.Wrapf(syncErr, "failed to sync MachineSet replicas") - } - - var replicas int32 - if machineSet.Spec.Replicas != nil { - replicas = *machineSet.Spec.Replicas + return ctrl.Result{}, err } + return ctrl.Result{}, nil +} - // Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew. - // Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica. - // A replica is available if the amount of time since last transition exceeds MinReadySeconds. - // If there was a clock skew, checking whether the amount of time since last transition to ready state - // exceeds MinReadySeconds could be incorrect. 
- // To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds, - // at which point it should confirm any available replica to be available. - if machineSet.Spec.MinReadySeconds > 0 && - machineSet.Status.ReadyReplicas == replicas && - machineSet.Status.AvailableReplicas != replicas { - minReadyResult := ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second} - result = util.LowestNonZeroResult(result, minReadyResult) - return result, nil +func (r *Reconciler) reconcileBootstrapConfig(ctx context.Context, s *scope) (ctrl.Result, error) { + cluster := s.cluster + machineSet := s.machineSet + // Make sure to reconcile the external bootstrap reference, if any. + if s.machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil { + var err error + s.bootstrapObjectNotFound, err = r.reconcileExternalTemplateReference(ctx, cluster, machineSet, s.owningMachineDeployment, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef) + if err != nil { + return ctrl.Result{}, err + } } + return ctrl.Result{}, nil +} - // Quickly reconcile until the nodes become Ready. - if machineSet.Status.ReadyReplicas != replicas { - result = util.LowestNonZeroResult(result, ctrl.Result{RequeueAfter: 15 * time.Second}) - return result, nil +func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + machineList := s.machines + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil } - return result, nil -} - -func (r *Reconciler) reconcileDelete(ctx context.Context, machineSet *clusterv1.MachineSet) error { log := ctrl.LoggerFrom(ctx) - machineList, err := r.getAndAdoptMachinesForMachineSet(ctx, machineSet) - if err != nil { - return err - } // If all the descendant machines are deleted, then remove the machineset's finalizer. if len(machineList) == 0 { controllerutil.RemoveFinalizer(machineSet, clusterv1.MachineSetFinalizer) - return nil + return ctrl.Result{}, nil } - log.Info("Waiting for Machines to be deleted", "Machines", clog.ObjNamesString(machineList)) - // else delete owned machines. for _, machine := range machineList { if machine.DeletionTimestamp.IsZero() { log.Info("Deleting Machine", "Machine", klog.KObj(machine)) if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) { - return errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) } } } - return nil + log.Info("Waiting for Machines to be deleted", "Machines", clog.ObjNamesString(machineList)) + return ctrl.Result{}, nil } -func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machineSet *clusterv1.MachineSet) ([]*clusterv1.Machine, error) { +func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet log := ctrl.LoggerFrom(ctx) selectorMap, err := metav1.LabelSelectorAsMap(&machineSet.Spec.Selector) if err != nil { - return nil, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name) + return ctrl.Result{}, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name) } // Get all Machines linked to this MachineSet. 
@@ -370,7 +434,7 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi client.MatchingLabels(selectorMap), ) if err != nil { - return nil, errors.Wrap(err, "failed to list machines") + return ctrl.Result{}, errors.Wrap(err, "failed to list machines") } // Filter out irrelevant machines (i.e. IsControlledBy something else) and claim orphaned machines. @@ -398,7 +462,10 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi filteredMachines = append(filteredMachines, machine) } - return filteredMachines, nil + s.machines = filteredMachines + s.getAndAdoptMachinesForMachineSetSucceeded = true + + return ctrl.Result{}, nil } // syncMachines updates Machines, InfrastructureMachine and BootstrapConfig to propagate in-place mutable fields @@ -409,7 +476,13 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi // "metadata.annotations" from "manager" so that "capi-machineset" can own these fields and can work with SSA. // Otherwise fields would be co-owned by our "old" "manager" and "capi-machineset" and then we would not be // able to e.g. drop labels and annotations. -func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine) error { +func (r *Reconciler) syncMachines(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + machines := s.machines + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + log := ctrl.LoggerFrom(ctx) for i := range machines { m := machines[i] @@ -418,7 +491,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac if !m.DeletionTimestamp.IsZero() { patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { - return err + return ctrl.Result{}, err } // Set all other in-place mutable fields that impact the ability to tear down existing machines. @@ -428,7 +501,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac m.Spec.NodeVolumeDetachTimeout = machineSet.Spec.Template.Spec.NodeVolumeDetachTimeout if err := patchHelper.Patch(ctx, m); err != nil { - return err + return ctrl.Result{}, err } continue } @@ -438,7 +511,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac // (< v1.4.0) can also work with SSA. Otherwise, fields would be co-owned by our "old" "manager" and // "capi-machineset" and then we would not be able to e.g. drop labels and annotations. if err := ssa.CleanUpManagedFieldsForSSAAdoption(ctx, r.Client, m, machineSetManagerName); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name) } // Update Machine to propagate in-place mutable fields from the MachineSet. 
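getAndAdoptMachinesForMachineSet above resolves the MachineSet's Machines through its label selector before filtering by ownership and adopting orphans. Stripped of that filtering and adoption, the lookup reduces to roughly the following sketch (listMachinesForMachineSet is a hypothetical helper, assuming a client.Client c):

package example

import (
	"context"

	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// listMachinesForMachineSet lists the Machines selected by the MachineSet's label
// selector in the MachineSet's namespace.
func listMachinesForMachineSet(ctx context.Context, c client.Client, ms *clusterv1.MachineSet) ([]*clusterv1.Machine, error) {
	selectorMap, err := metav1.LabelSelectorAsMap(&ms.Spec.Selector)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", ms.Name)
	}

	machineList := &clusterv1.MachineList{}
	if err := c.List(ctx, machineList,
		client.InNamespace(ms.Namespace),
		client.MatchingLabels(selectorMap),
	); err != nil {
		return nil, errors.Wrap(err, "failed to list machines")
	}

	machines := make([]*clusterv1.Machine, 0, len(machineList.Items))
	for i := range machineList.Items {
		machines = append(machines, &machineList.Items[i])
	}
	return machines, nil
}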
@@ -446,13 +519,13 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: m}) if err != nil { log.Error(err, "Failed to update Machine", "Machine", klog.KObj(updatedMachine)) - return errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine)) } machines[i] = updatedMachine infraMachine, err := external.Get(ctx, r.Client, &updatedMachine.Spec.InfrastructureRef, updatedMachine.Namespace) if err != nil { - return errors.Wrapf(err, "failed to get InfrastructureMachine %s", + return ctrl.Result{}, errors.Wrapf(err, "failed to get InfrastructureMachine %s", klog.KRef(updatedMachine.Spec.InfrastructureRef.Namespace, updatedMachine.Spec.InfrastructureRef.Name)) } // Cleanup managed fields of all InfrastructureMachines to drop ownership of labels and annotations @@ -464,17 +537,17 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac {"f:metadata", "f:labels"}, } if err := ssa.DropManagedFields(ctx, r.Client, infraMachine, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine)) } // Update in-place mutating fields on InfrastructureMachine. if err := r.updateExternalObject(ctx, infraMachine, machineSet); err != nil { - return errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine)) } if updatedMachine.Spec.Bootstrap.ConfigRef != nil { bootstrapConfig, err := external.Get(ctx, r.Client, updatedMachine.Spec.Bootstrap.ConfigRef, updatedMachine.Namespace) if err != nil { - return errors.Wrapf(err, "failed to get BootstrapConfig %s", + return ctrl.Result{}, errors.Wrapf(err, "failed to get BootstrapConfig %s", klog.KRef(updatedMachine.Spec.Bootstrap.ConfigRef.Namespace, updatedMachine.Spec.Bootstrap.ConfigRef.Name)) } // Cleanup managed fields of all BootstrapConfigs to drop ownership of labels and annotations @@ -482,22 +555,29 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac // can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager" // and "capi-machineset" and then we would not be able to e.g. drop labels and annotations. if err := ssa.DropManagedFields(ctx, r.Client, bootstrapConfig, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig)) } // Update in-place mutating fields on BootstrapConfig. 
if err := r.updateExternalObject(ctx, bootstrapConfig, machineSet); err != nil { - return errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig)) } } } - return nil + return ctrl.Result{}, nil } // syncReplicas scales Machine resources up or down. -func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) (ctrl.Result, error) { +func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, error) { + ms := s.machineSet + machines := s.machines + cluster := s.cluster + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + log := ctrl.LoggerFrom(ctx) if ms.Spec.Replicas == nil { - return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name) + return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for MachineSet %v is nil, this should not be allowed", ms.Name) } diff := len(machines) - int(*(ms.Spec.Replicas)) switch { @@ -921,11 +1001,19 @@ func (r *Reconciler) getMachineSetsForMachine(ctx context.Context, m *clusterv1. } // isDeploymentChild returns true if the MachineSet originated from a MachineDeployment by checking its labels. -func (r *Reconciler) isDeploymentChild(ms *clusterv1.MachineSet) bool { +func isDeploymentChild(ms *clusterv1.MachineSet) bool { _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel] return ok } +// isCurrentMachineSet returns true if the MachineSet's and MachineDeployments revision are equal. +func isCurrentMachineSet(ms *clusterv1.MachineSet, md *clusterv1.MachineDeployment) bool { + if md == nil { + return false + } + return md.Annotations[clusterv1.RevisionAnnotation] == ms.Annotations[clusterv1.RevisionAnnotation] +} + // shouldAdopt returns true if the MachineSet should be adopted as a stand-alone MachineSet directly owned by the Cluster. func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // if the MachineSet is controlled by a MachineDeployment, or if it is a stand-alone MachinesSet directly owned by the Cluster, then no-op. @@ -936,12 +1024,24 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // If the MachineSet is originated by a MachineDeployment object, it should not be adopted directly by the Cluster as a stand-alone MachineSet. // Note: this is required because after restore from a backup both the MachineSet controller and the // MachineDeployment controller are racing to adopt MachineSets, see https://github.com/kubernetes-sigs/cluster-api/issues/7529 - return !r.isDeploymentChild(ms) + return !isDeploymentChild(ms) } -// updateStatus updates the Status field for the MachineSet +// reconcileStatus updates the Status field for the MachineSet // It checks for the current state of the replicas and updates the Status of the MachineSet. 
-func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error { +func (r *Reconciler) reconcileStatus(ctx context.Context, s *scope) error { + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return nil + } + + ms := s.machineSet + filteredMachines := s.machines + cluster := s.cluster + + if ms.Spec.Replicas == nil { + return errors.New("Cannot update status when MachineSet spec.replicas is not set") + } + log := ctrl.LoggerFrom(ctx) newStatus := ms.Status.DeepCopy() @@ -962,6 +1062,9 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste readyReplicasCount := 0 availableReplicasCount := 0 desiredReplicas := *ms.Spec.Replicas + if !ms.DeletionTimestamp.IsZero() { + desiredReplicas = 0 + } templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated() for _, machine := range filteredMachines { @@ -1045,6 +1148,31 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste return nil } +func shouldRequeueForReplicaCountersRefresh(s *scope) ctrl.Result { + replicas := ptr.Deref(s.machineSet.Spec.Replicas, 0) + + // Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew. + // Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica. + // A replica is available if the amount of time since last transition exceeds MinReadySeconds. + // If there was a clock skew, checking whether the amount of time since last transition to ready state + // exceeds MinReadySeconds could be incorrect. + // To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds, + // at which point it should confirm any available replica to be available. + if s.machineSet.Spec.MinReadySeconds > 0 && + s.machineSet.Status.ReadyReplicas == replicas && + s.machineSet.Status.AvailableReplicas != replicas { + minReadyResult := ctrl.Result{RequeueAfter: time.Duration(s.machineSet.Spec.MinReadySeconds) * time.Second} + return minReadyResult + } + + // Quickly reconcile until the nodes become Ready. + if s.machineSet.Status.ReadyReplicas != replicas { + return ctrl.Result{RequeueAfter: 15 * time.Second} + } + + return ctrl.Result{} +} + func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (*corev1.Node, error) { remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { @@ -1057,7 +1185,15 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus return node, nil } -func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) { +func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (ctrl.Result, error) { + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + + cluster := s.cluster + ms := s.machineSet + filteredMachines := s.machines + owner := s.owningMachineDeployment log := ctrl.LoggerFrom(ctx) // Calculate how many in flight machines we should remediate. @@ -1066,12 +1202,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl // If the MachineSet is part of a MachineDeployment, only allow remediations if // it's the desired revision. 
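shouldRequeueForReplicaCountersRefresh above encodes the two resync rules that previously lived inline in reconcile. The sketch below restates them as a plain function (requeueAfter is a hypothetical stand-in) so the timing is easy to follow: with spec.replicas=3 and minReadySeconds=10, readyReplicas=3 and availableReplicas=2 yield a 10s requeue, while readyReplicas=2 yields a 15s requeue.

package example

import (
	"time"

	"k8s.io/utils/ptr"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// requeueAfter returns only the requeue interval implied by the replica counters.
func requeueAfter(ms *clusterv1.MachineSet) time.Duration {
	replicas := ptr.Deref(ms.Spec.Replicas, 0)

	// Clock-skew guard: every replica is ready, but not all of them are counted as
	// available yet, so re-check once MinReadySeconds have passed.
	if ms.Spec.MinReadySeconds > 0 &&
		ms.Status.ReadyReplicas == replicas &&
		ms.Status.AvailableReplicas != replicas {
		return time.Duration(ms.Spec.MinReadySeconds) * time.Second
	}

	// Not all replicas are ready yet: poll quickly until the nodes become Ready.
	if ms.Status.ReadyReplicas != replicas {
		return 15 * time.Second
	}

	// Replica counters are settled; no extra resync needed.
	return 0
}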
- if r.isDeploymentChild(ms) { - owner, err := r.getOwnerMachineDeployment(ctx, ms) - if err != nil { - return ctrl.Result{}, err - } - + if isDeploymentChild(ms) { if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { // MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed. return ctrl.Result{}, nil @@ -1185,38 +1316,40 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl return ctrl.Result{}, nil } -func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, ref *corev1.ObjectReference) error { +func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, owner *clusterv1.MachineDeployment, ref *corev1.ObjectReference) (objectNotFound bool, err error) { if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) { - return nil + return false, nil } if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil { - return err + return false, err } obj, err := external.Get(ctx, r.Client, ref, cluster.Namespace) if err != nil { if apierrors.IsNotFound(err) { - if _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel]; !ok { + if !ms.DeletionTimestamp.IsZero() { + // Tolerate object not found when the machineSet is being deleted. + return true, nil + } + + if owner == nil { // If the MachineSet is not in a MachineDeployment, return the error immediately. - return err + return true, err } // When the MachineSet is part of a MachineDeployment but isn't the current revision, we should // ignore the not found references and allow the controller to proceed. - owner, err := r.getOwnerMachineDeployment(ctx, ms) - if err != nil { - return err - } - if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { - return nil + if !isCurrentMachineSet(ms, owner) { + return true, nil } + return true, err } - return err + return false, err } patchHelper, err := patch.NewHelper(obj, r.Client) if err != nil { - return err + return false, err } obj.SetOwnerReferences(util.EnsureOwnerRef(obj.GetOwnerReferences(), metav1.OwnerReference{ @@ -1226,5 +1359,5 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu UID: cluster.UID, })) - return patchHelper.Patch(ctx, obj) + return false, patchHelper.Patch(ctx, obj) } diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go new file mode 100644 index 000000000000..3c94d8a03767 --- /dev/null +++ b/internal/controllers/machineset/machineset_controller_status.go @@ -0,0 +1,326 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package machineset + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" + clog "sigs.k8s.io/cluster-api/util/log" +) + +// reconcileV1Beta2Status reconciles MachineSet's status during the entire lifecycle of the MachineSet. +// Additionally, this func should ensure that the conditions managed by this controller are always set in order to +// comply with the recommendation in the Kubernetes API guidelines. +// Note: v1beta1 conditions are not managed by this func. +func (r *Reconciler) reconcileV1Beta2Status(ctx context.Context, s *scope) { + // Update the following fields in status from the machines list. + // - v1beta2.readyReplicas + // - v1beta2.availableReplicas + // - v1beta2.upToDateReplicas + setReplicas(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // Conditions + + // Update the ScalingUp and ScalingDown condition. + setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded) + setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // MachinesReady condition: aggregate the Machine's Ready condition. + setMachinesReadyCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // MachinesUpToDate condition: aggregate the Machine's UpToDate condition. + setMachinesUpToDateCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // TODO Deleting +} + +func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // Return early when getAndAdoptMachinesForMachineSetSucceeded is false because it's not possible to calculate replica counters. + if !getAndAdoptMachinesForMachineSetSucceeded { + return + } + + var readyReplicas, availableReplicas, upToDateReplicas int32 + for _, machine := range machines { + if v1beta2conditions.IsTrue(machine, clusterv1.MachineReadyV1Beta2Condition) { + readyReplicas++ + } + if v1beta2conditions.IsTrue(machine, clusterv1.MachineAvailableV1Beta2Condition) { + availableReplicas++ + } + if v1beta2conditions.IsTrue(machine, clusterv1.MachineUpToDateV1Beta2Condition) { + upToDateReplicas++ + } + } + + if ms.Status.V1Beta2 == nil { + ms.Status.V1Beta2 = &clusterv1.MachineSetV1Beta2Status{} + } + + ms.Status.V1Beta2.ReadyReplicas = ptr.To(readyReplicas) + ms.Status.V1Beta2.AvailableReplicas = ptr.To(availableReplicas) + ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas) +} + +func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingUpInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + // Surface if .spec.replicas is not yet set (this should never happen). 
+ if ms.Spec.Replicas == nil { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason, + }) + return + } + + desiredReplicas := *ms.Spec.Replicas + if !ms.DeletionTimestamp.IsZero() { + desiredReplicas = 0 + } + currentReplicas := int32(len(machines)) + + missingReferencesMessage := calculateMissingReferencesMessage(ms, bootstrapObjectNotFound, infrastructureObjectNotFound) + + if currentReplicas >= desiredReplicas { + var message string + if missingReferencesMessage != "" { + message = fmt.Sprintf("Scaling up would be blocked %s", missingReferencesMessage) + } + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + Message: message, + }) + return + } + + // Scaling up. + message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas) + if missingReferencesMessage != "" { + message += fmt.Sprintf(" is blocked %s", missingReferencesMessage) + } + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingUpV1Beta2Reason, + Message: message, + }) +} + +func setScalingDownCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingDownInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + // Surface if .spec.replicas is not yet set (this should never happen). + if ms.Spec.Replicas == nil { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason, + }) + return + } + + desiredReplicas := *ms.Spec.Replicas + if !ms.DeletionTimestamp.IsZero() { + desiredReplicas = 0 + } + currentReplicas := int32(len(machines)) + + // Scaling down. + if currentReplicas > desiredReplicas { + message := fmt.Sprintf("Scaling down from %d to %d replicas", len(machines), desiredReplicas) + staleMessage := aggregateStaleMachines(machines) + if staleMessage != "" { + message += fmt.Sprintf(" and %s", staleMessage) + } + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: message, + }) + return + } + + // Not scaling down. + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingDownV1Beta2Reason, + }) +} + +func setMachinesReadyCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + log := ctrl.LoggerFrom(ctx) + // If we got unexpected errors in listing the machines (this should never happen), surface them. 
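All of the condition setters in this file go through the v1beta2conditions helpers. A minimal sketch of the Set/Get round trip they rely on, assuming (as the unit tests below exercise on empty MachineSets) that the helpers initialize status.v1beta2 when it is still nil:

package example

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
)

// exampleSetAndGet writes a v1beta2 condition onto a MachineSet and reads it back.
func exampleSetAndGet(ms *clusterv1.MachineSet) {
	v1beta2conditions.Set(ms, metav1.Condition{
		Type:   clusterv1.MachineSetScalingUpV1Beta2Condition,
		Status: metav1.ConditionFalse,
		Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason,
	})

	if c := v1beta2conditions.Get(ms, clusterv1.MachineSetScalingUpV1Beta2Condition); c != nil {
		fmt.Printf("%s=%s (%s)\n", c.Type, c.Status, c.Reason)
	}
}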
+ if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesReadyInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + if len(machines) == 0 { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesReadyNoReplicasV1Beta2Reason, + }) + return + } + + readyCondition, err := v1beta2conditions.NewAggregateCondition( + machines, clusterv1.MachineReadyV1Beta2Condition, + v1beta2conditions.TargetConditionType(clusterv1.MachineSetMachinesReadyV1Beta2Condition), + ) + if err != nil { + log.Error(err, "Failed to aggregate Machine's Ready conditions") + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesReadyInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + v1beta2conditions.Set(machineSet, *readyCondition) +} + +func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + log := ctrl.LoggerFrom(ctx) + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesUpToDateInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + if len(machines) == 0 { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesUpToDateNoReplicasV1Beta2Reason, + }) + return + } + + upToDateCondition, err := v1beta2conditions.NewAggregateCondition( + machines, clusterv1.MachineUpToDateV1Beta2Condition, + v1beta2conditions.TargetConditionType(clusterv1.MachineSetMachinesUpToDateV1Beta2Condition), + ) + if err != nil { + log.Error(err, "Failed to aggregate Machine's UpToDate conditions") + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesUpToDateInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + v1beta2conditions.Set(machineSet, *upToDateCondition) +} + +func calculateMissingReferencesMessage(ms *clusterv1.MachineSet, bootstrapTemplateNotFound, infraMachineTemplateNotFound bool) string { + missingObjects := []string{} + if bootstrapTemplateNotFound { + missingObjects = append(missingObjects, ms.Spec.Template.Spec.Bootstrap.ConfigRef.Kind) + } + if infraMachineTemplateNotFound { + missingObjects = append(missingObjects, ms.Spec.Template.Spec.InfrastructureRef.Kind) + } + + if len(missingObjects) == 0 { + return "" + } + + if len(missingObjects) == 1 { + return fmt.Sprintf("because %s does not exist", missingObjects[0]) + } + + return fmt.Sprintf("because %s do not exist", strings.Join(missingObjects, " and ")) +} + +func aggregateStaleMachines(machines 
[]*clusterv1.Machine) string { + machineNames := []string{} + for _, machine := range machines { + if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 { + machineNames = append(machineNames, machine.GetName()) + } + } + + if len(machineNames) == 0 { + return "" + } + + message := "Machine" + if len(machineNames) > 1 { + message += "s" + } + + sort.Strings(machineNames) + message += " " + clog.ListToString(machineNames, func(s string) string { return s }, 3) + + if len(machineNames) == 1 { + message += " is " + } else { + message += " are " + } + message += "in deletion since more than 30m" + + return message +} diff --git a/internal/controllers/machineset/machineset_controller_status_test.go b/internal/controllers/machineset/machineset_controller_status_test.go new file mode 100644 index 000000000000..bb5fb48751e3 --- /dev/null +++ b/internal/controllers/machineset/machineset_controller_status_test.go @@ -0,0 +1,775 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machineset + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" +) + +func Test_setReplicas(t *testing.T) { + tests := []struct { + name string + machines []*clusterv1.Machine + getAndAdoptMachinesForMachineSetSucceeded bool + expectedStatus *clusterv1.MachineSetV1Beta2Status + }{ + { + name: "getAndAdoptMachines failed", + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectedStatus: nil, + }, + { + name: "no machines", + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectedStatus: &clusterv1.MachineSetV1Beta2Status{ + ReadyReplicas: ptr.To[int32](0), + AvailableReplicas: ptr.To[int32](0), + UpToDateReplicas: ptr.To[int32](0), + }}, + { + name: "should count only ready machines", + machines: []*clusterv1.Machine{ + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionFalse, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + }}}}}, + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectedStatus: &clusterv1.MachineSetV1Beta2Status{ + ReadyReplicas: ptr.To[int32](1), + AvailableReplicas: ptr.To[int32](0), + UpToDateReplicas: ptr.To[int32](0), + }, + }, + { + name: "should count only available machines", + machines: []*clusterv1.Machine{ + {Status: clusterv1.MachineStatus{V1Beta2: 
&clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineAvailableV1Beta2Condition, + Status: metav1.ConditionUnknown, + }}}}}, + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectedStatus: &clusterv1.MachineSetV1Beta2Status{ + ReadyReplicas: ptr.To[int32](0), + AvailableReplicas: ptr.To[int32](1), + UpToDateReplicas: ptr.To[int32](0), + }, + }, + { + name: "should count only up-to-date machines", + machines: []*clusterv1.Machine{ + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + }}}}}, + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + }}}}}, + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectedStatus: &clusterv1.MachineSetV1Beta2Status{ + ReadyReplicas: ptr.To[int32](0), + AvailableReplicas: ptr.To[int32](0), + UpToDateReplicas: ptr.To[int32](1), + }, + }, + { + name: "should count all conditions from a machine", + machines: []*clusterv1.Machine{ + {Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + }, + { + Type: clusterv1.MachineAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + }, + { + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + }, + }}}}, + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectedStatus: &clusterv1.MachineSetV1Beta2Status{ + ReadyReplicas: ptr.To[int32](1), + AvailableReplicas: ptr.To[int32](1), + UpToDateReplicas: ptr.To[int32](1), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + ms := &clusterv1.MachineSet{} + setReplicas(ctx, ms, tt.machines, tt.getAndAdoptMachinesForMachineSetSucceeded) + g.Expect(ms.Status.V1Beta2).To(BeEquivalentTo(tt.expectedStatus)) + }) + } +} + +func Test_setScalingUpCondition(t *testing.T) { + defaultMachineSet := &clusterv1.MachineSet{ + Spec: clusterv1.MachineSetSpec{ + Replicas: ptr.To[int32](0), + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Bootstrap: clusterv1.Bootstrap{ + ConfigRef: &corev1.ObjectReference{ + Kind: "KubeadmBootstrapTemplate", + Namespace: "some-namespace", + Name: "some-name", + }, + }, + InfrastructureRef: corev1.ObjectReference{ + Kind: "DockerMachineTemplate", + Namespace: "some-namespace", + Name: "some-name", + }, + }, + }, + }, + } + + scalingUpMachineSetWith3Replicas := defaultMachineSet.DeepCopy() + scalingUpMachineSetWith3Replicas.Spec.Replicas = ptr.To[int32](3) + + deletingMachineSetWith3Replicas := defaultMachineSet.DeepCopy() + deletingMachineSetWith3Replicas.DeletionTimestamp = 
ptr.To(metav1.Now()) + deletingMachineSetWith3Replicas.Spec.Replicas = ptr.To[int32](3) + + tests := []struct { + name string + ms *clusterv1.MachineSet + machines []*clusterv1.Machine + bootstrapObjectNotFound bool + infrastructureObjectNotFound bool + getAndAdoptMachinesForMachineSetSucceeded bool + expectCondition metav1.Condition + }{ + { + name: "getAndAdoptMachines failed", + ms: defaultMachineSet, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingUpInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }, + }, + { + name: "not scaling up and no machines", + ms: defaultMachineSet, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + }, + }, + { + name: "not scaling up and no machines and bootstrapConfig object not found", + ms: defaultMachineSet, + bootstrapObjectNotFound: true, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + Message: "Scaling up would be blocked because KubeadmBootstrapTemplate does not exist", + }, + }, + { + name: "not scaling up and no machines and infrastructure object not found", + ms: defaultMachineSet, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: true, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + Message: "Scaling up would be blocked because DockerMachineTemplate does not exist", + }, + }, + { + name: "not scaling up and no machines and bootstrapConfig and infrastructure object not found", + ms: defaultMachineSet, + bootstrapObjectNotFound: true, + infrastructureObjectNotFound: true, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + Message: "Scaling up would be blocked because KubeadmBootstrapTemplate and DockerMachineTemplate do not exist", + }, + }, + { + name: "scaling up", + ms: scalingUpMachineSetWith3Replicas, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingUpV1Beta2Reason, + Message: "Scaling up from 0 to 3 replicas", + }, + }, + { + name: "scaling up and blocked by bootstrap object", + ms: scalingUpMachineSetWith3Replicas, + bootstrapObjectNotFound: true, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: 
clusterv1.MachineSetScalingUpV1Beta2Reason, + Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate does not exist", + }, + }, + { + name: "scaling up and blocked by infrastructure object", + ms: scalingUpMachineSetWith3Replicas, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: true, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingUpV1Beta2Reason, + Message: "Scaling up from 0 to 3 replicas is blocked because DockerMachineTemplate does not exist", + }, + }, + { + name: "deleting", + ms: deletingMachineSetWith3Replicas, + machines: []*clusterv1.Machine{{}, {}, {}}, + bootstrapObjectNotFound: false, + infrastructureObjectNotFound: false, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded) + + condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingUpV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(*condition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + +func Test_setScalingDownCondition(t *testing.T) { + machineSet := &clusterv1.MachineSet{ + Spec: clusterv1.MachineSetSpec{ + Replicas: ptr.To[int32](0), + }, + } + + machineSet1Replica := machineSet.DeepCopy() + machineSet1Replica.Spec.Replicas = ptr.To[int32](1) + + deletingMachineSet := machineSet.DeepCopy() + deletingMachineSet.Spec.Replicas = ptr.To[int32](1) + deletingMachineSet.DeletionTimestamp = ptr.To(metav1.Now()) + + tests := []struct { + name string + ms *clusterv1.MachineSet + machines []*clusterv1.Machine + getAndAdoptMachinesForMachineSetSucceeded bool + expectCondition metav1.Condition + }{ + { + name: "getAndAdoptMachines failed", + ms: machineSet, + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingDownInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }, + }, + { + name: "not scaling down and no machines", + ms: machineSet, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingDownV1Beta2Reason, + }, + }, + { + name: "not scaling down because scaling up", + ms: machineSet1Replica, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingDownV1Beta2Reason, + }, + }, + { + name: "scaling down to zero", + ms: machineSet, + machines: []*clusterv1.Machine{ + newMachine("machine-1"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: 
clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 1 to 0 replicas", + }, + }, + { + name: "scaling down with 1 stale machine", + ms: machineSet1Replica, + machines: []*clusterv1.Machine{ + newStaleDeletingMachine("stale-machine-1"), + newMachine("machine-2"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 2 to 1 replicas and Machine stale-machine-1 is in deletion since more than 30m", + }, + }, + { + name: "scaling down with 3 stale machines", + ms: machineSet1Replica, + machines: []*clusterv1.Machine{ + newStaleDeletingMachine("stale-machine-2"), + newStaleDeletingMachine("stale-machine-1"), + newStaleDeletingMachine("stale-machine-3"), + newMachine("machine-4"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 4 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 30m", + }, + }, + { + name: "scaling down with 5 stale machines", + ms: machineSet1Replica, + machines: []*clusterv1.Machine{ + newStaleDeletingMachine("stale-machine-5"), + newStaleDeletingMachine("stale-machine-4"), + newStaleDeletingMachine("stale-machine-2"), + newStaleDeletingMachine("stale-machine-3"), + newStaleDeletingMachine("stale-machine-1"), + newMachine("machine-6"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 6 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3, ... 
(2 more) are in deletion since more than 30m", + }, + }, + { + name: "deleting machineset without replicas", + ms: deletingMachineSet, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingDownV1Beta2Reason, + }, + }, + { + name: "deleting machineset having 1 replica", + ms: deletingMachineSet, + machines: []*clusterv1.Machine{ + newMachine("machine-1"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 1 to 0 replicas", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + setScalingDownCondition(ctx, tt.ms, tt.machines, tt.getAndAdoptMachinesForMachineSetSucceeded) + + condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingDownV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(*condition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + +func Test_setMachinesReadyCondition(t *testing.T) { + machineSet := &clusterv1.MachineSet{} + + readyCondition := metav1.Condition{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: v1beta2conditions.MultipleInfoReportedReason, + } + + tests := []struct { + name string + machineSet *clusterv1.MachineSet + machines []*clusterv1.Machine + getAndAdoptMachinesForMachineSetSucceeded bool + expectCondition metav1.Condition + }{ + { + name: "getAndAdoptMachines failed", + machineSet: machineSet, + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesReadyInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }, + }, + { + name: "no machines", + machineSet: machineSet, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesReadyNoReplicasV1Beta2Reason, + }, + }, + { + name: "all machines are ready", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("machine-1", readyCondition), + newMachine("machine-2", readyCondition), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: v1beta2conditions.MultipleInfoReportedReason, + }, + }, + { + name: "one ready, one has nothing reported", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("machine-1", readyCondition), + newMachine("machine-2"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: v1beta2conditions.NotYetReportedReason, + Message: "Condition Ready not yet reported from Machine machine-2", + }, + }, + { + name: "one ready, one reporting not ready, one reporting unknown, one reporting 
deleting", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("machine-1", readyCondition), + newMachine("machine-2", metav1.Condition{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: "SomeReason", + Message: "HealthCheckSucceeded: Some message", + }), + newMachine("machine-3", metav1.Condition{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: "SomeUnknownReason", + Message: "Some unknown message", + }), + newMachine("machine-4", metav1.Condition{ + Type: clusterv1.MachineReadyV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineDeletingV1Beta2Reason, + Message: "Deleting: Machine deletion in progress, stage: DrainingNode", + }), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: v1beta2conditions.MultipleIssuesReportedReason, + Message: "Deleting: Machine deletion in progress, stage: DrainingNode from Machine machine-4; HealthCheckSucceeded: Some message from Machine machine-2; Some unknown message from Machine machine-3", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + setMachinesReadyCondition(ctx, tt.machineSet, tt.machines, tt.getAndAdoptMachinesForMachineSetSucceeded) + + condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetMachinesReadyV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(*condition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + +func Test_setMachinesUpToDateCondition(t *testing.T) { + machineSet := &clusterv1.MachineSet{} + + tests := []struct { + name string + machineSet *clusterv1.MachineSet + machines []*clusterv1.Machine + getAndAdoptMachinesForMachineSetSucceeded bool + expectCondition metav1.Condition + }{ + { + name: "getAndAdoptMachines failed", + machineSet: machineSet, + machines: nil, + getAndAdoptMachinesForMachineSetSucceeded: false, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesUpToDateInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }, + }, + { + name: "no machines", + machineSet: machineSet, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesUpToDateNoReplicasV1Beta2Reason, + Message: "", + }, + }, + { + name: "One machine up-to-date", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("up-to-date-1", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: "some-reason-1", + }), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: "some-reason-1", + Message: "", + }, + }, + { + name: "One machine unknown", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("unknown-1", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: "some-unknown-reason-1", + Message: "some 
unknown message", + }), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: "some-unknown-reason-1", + Message: "some unknown message from Machine unknown-1", + }, + }, + { + name: "One machine not up-to-date", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("not-up-to-date-machine-1", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: "some-not-up-to-date-reason", + Message: "some not up-to-date message", + }), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: "some-not-up-to-date-reason", + Message: "some not up-to-date message from Machine not-up-to-date-machine-1", + }, + }, + { + name: "One machine without up-to-date condition", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("no-condition-machine-1"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: v1beta2conditions.NotYetReportedReason, + Message: "Condition UpToDate not yet reported from Machine no-condition-machine-1", + }, + }, + { + name: "Two machines not up-to-date, two up-to-date, two not reported", + machineSet: machineSet, + machines: []*clusterv1.Machine{ + newMachine("up-to-date-1", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: "TestUpToDate", + }), + newMachine("up-to-date-2", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: "TestUpToDate", + }), + newMachine("not-up-to-date-machine-1", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: "TestNotUpToDate", + Message: "This is not up-to-date message", + }), + newMachine("not-up-to-date-machine-2", metav1.Condition{ + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: "TestNotUpToDate", + Message: "This is not up-to-date message", + }), + newMachine("no-condition-machine-1"), + newMachine("no-condition-machine-2"), + }, + getAndAdoptMachinesForMachineSetSucceeded: true, + expectCondition: metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: v1beta2conditions.MultipleIssuesReportedReason, + Message: "This is not up-to-date message from Machines not-up-to-date-machine-1, not-up-to-date-machine-2; Condition UpToDate not yet reported from Machines no-condition-machine-1, no-condition-machine-2", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + setMachinesUpToDateCondition(ctx, tt.machineSet, tt.machines, tt.getAndAdoptMachinesForMachineSetSucceeded) + + condition := v1beta2conditions.Get(tt.machineSet, clusterv1.MachineSetMachinesUpToDateV1Beta2Condition) + g.Expect(condition).ToNot(BeNil()) + g.Expect(*condition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + +func newMachine(name string, conditions ...metav1.Condition) *clusterv1.Machine { + m := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: 
name, + Namespace: metav1.NamespaceDefault, + }, + } + + for _, condition := range conditions { + v1beta2conditions.Set(m, condition) + } + + return m +} + +func newStaleDeletingMachine(name string) *clusterv1.Machine { + m := newMachine(name) + m.DeletionTimestamp = ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)}) + return m +} diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index a9b1791c6792..6f0aa23a9817 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -589,6 +589,7 @@ func TestMachineSetReconcile(t *testing.T) { }, Spec: clusterv1.MachineSetSpec{ ClusterName: testClusterName, + Replicas: ptr.To[int32](0), }, Status: clusterv1.MachineSetStatus{ V1Beta2: &clusterv1.MachineSetV1Beta2Status{Conditions: []metav1.Condition{{ @@ -616,8 +617,10 @@ func TestMachineSetReconcile(t *testing.T) { g := NewWithT(t) ms := newMachineSet("machineset1", testClusterName, int32(0)) - ms.Spec.Selector.MatchLabels = map[string]string{ - "--$-invalid": "true", + ms.Spec.Template.Spec.Bootstrap.ConfigRef = &corev1.ObjectReference{ + Kind: "FooTemplate", + Namespace: ms.GetNamespace(), + Name: "doesnotexist", } request := reconcile.Request{ @@ -1058,8 +1061,14 @@ func TestMachineSetReconciler_updateStatusResizedCondition(t *testing.T) { Client: c, recorder: record.NewFakeRecorder(32), } - err := msr.updateStatus(ctx, cluster, tc.machineSet, tc.machines) - g.Expect(err).ToNot(HaveOccurred()) + s := &scope{ + cluster: cluster, + machineSet: tc.machineSet, + machines: tc.machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + setReplicas(ctx, s.machineSet, s.machines, tc.machines != nil) + g.Expect(msr.reconcileStatus(ctx, s)).To(Succeed()) gotCond := conditions.Get(tc.machineSet, clusterv1.ResizedCondition) g.Expect(gotCond).ToNot(BeNil()) g.Expect(gotCond.Status).To(Equal(corev1.ConditionFalse)) @@ -1296,7 +1305,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { Client: env, ssaCache: ssa.NewCache(), } - g.Expect(reconciler.syncMachines(ctx, ms, machines)).To(Succeed()) + s := &scope{ + machineSet: ms, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) // The inPlaceMutatingMachine should have cleaned up managed fields. updatedInPlaceMutatingMachine := inPlaceMutatingMachine.DeepCopy() @@ -1372,7 +1387,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { ms.Spec.Template.Spec.NodeDrainTimeout = duration10s ms.Spec.Template.Spec.NodeDeletionTimeout = duration10s ms.Spec.Template.Spec.NodeVolumeDetachTimeout = duration10s - g.Expect(reconciler.syncMachines(ctx, ms, []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine})).To(Succeed()) + s = &scope{ + machineSet: ms, + machines: []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine}, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) // Verify in-place mutable fields are updated on the Machine. 
updatedInPlaceMutatingMachine = inPlaceMutatingMachine.DeepCopy() @@ -1452,7 +1473,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { ms.Spec.Template.Spec.NodeDrainTimeout = duration11s ms.Spec.Template.Spec.NodeDeletionTimeout = duration11s ms.Spec.Template.Spec.NodeVolumeDetachTimeout = duration11s - g.Expect(reconciler.syncMachines(ctx, ms, []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine})).To(Succeed()) + s = &scope{ + machineSet: ms, + machines: []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine}, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) updatedDeletingMachine := deletingMachine.DeepCopy() g.Expect(env.GetAPIReader().Get(ctx, client.ObjectKeyFromObject(updatedDeletingMachine), updatedDeletingMachine)).To(Succeed()) @@ -1519,7 +1546,15 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { r := &Reconciler{ Client: fakeClient, } - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine is deleted. m := &clusterv1.Machine{} @@ -1577,7 +1612,13 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { r := &Reconciler{ Client: fakeClient, } - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine has the updated condition. @@ -1683,8 +1724,16 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Client: fakeClient, } + s := &scope{ + cluster: cluster, + machineSet: machineSetOld, + machines: machines, + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + // Test first with the old MachineSet. - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSetOld, machines) + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) condition := clusterv1.MachineOwnerRemediatedCondition @@ -1707,7 +1756,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { To(BeFalse(), "Machine should not have the %s condition set", condition) // Test with the current MachineSet. - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSetCurrent, machines) + s = &scope{ + cluster: cluster, + machineSet: machineSetCurrent, + machines: machines, + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine has been deleted. @@ -1814,7 +1870,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // // First pass. 
// - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, append(unhealthyMachines, healthyMachine)) + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: append(unhealthyMachines, healthyMachine), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) condition := clusterv1.MachineOwnerRemediatedCondition @@ -1865,7 +1928,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { return } - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) var validateSecondPass = func(cleanFinalizer bool) { @@ -1912,7 +1982,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // Perform another pass with the same exact configuration. // This is testing that, given that we have Machines that are being deleted and are in flight, // we have reached the maximum amount of tokens we have and we should wait to remediate the rest. - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Validate and remove finalizers for in flight machines. @@ -1926,7 +2003,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // Call again to verify that the remaining unhealthy machines are deleted, // at this point all unhealthy machines should be deleted given the max in flight // is greater than the number of unhealthy machines. - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Iterate over the unhealthy machines and verify that all were deleted. 
@@ -1976,7 +2060,13 @@ func TestMachineSetReconciler_syncReplicas(t *testing.T) {
 		r := &Reconciler{
 			Client: fakeClient,
 		}
-		result, err := r.syncReplicas(ctx, cluster, machineSet, nil)
+		s := &scope{
+			cluster:    cluster,
+			machineSet: machineSet,
+			machines:   []*clusterv1.Machine{},
+			getAndAdoptMachinesForMachineSetSucceeded: true,
+		}
+		result, err := r.syncReplicas(ctx, s)
 		g.Expect(err).ToNot(HaveOccurred())
 		g.Expect(result.IsZero()).To(BeFalse(), "syncReplicas should not return a 'zero' result")
 
@@ -2217,7 +2307,14 @@ func TestReconciler_reconcileDelete(t *testing.T) {
 				recorder: record.NewFakeRecorder(32),
 			}
 
-			err := r.reconcileDelete(ctx, tt.machineSet)
+			s := &scope{
+				machineSet: tt.machineSet,
+			}
+
+			// populate s.machines
+			_, err := r.getAndAdoptMachinesForMachineSet(ctx, s)
+			g.Expect(err).ToNot(HaveOccurred())
+			_, err = r.reconcileDelete(ctx, s)
 			if tt.expectError {
 				g.Expect(err).To(HaveOccurred())
 			} else {
diff --git a/util/log/log.go b/util/log/log.go
index 11a41c460361..5f7432e51f0a 100644
--- a/util/log/log.go
+++ b/util/log/log.go
@@ -107,19 +107,16 @@ func getOwners(ctx context.Context, c client.Client, obj metav1.Object) ([]owner
 	return owners, nil
 }
 
-// ObjNamesString returns a comma separated list of the object names, limited to
-// five objects. On more than five objects it outputs the first five objects and
-// adds information about how much more are in the given list.
-func ObjNamesString[T client.Object](objs []T) string {
+// ListToString returns a comma-separated list of the first n entries of the list (strings are calculated via stringFunc).
+func ListToString[T any](list []T, stringFunc func(T) string, n int) string {
 	shortenedBy := 0
-	if len(objs) > 5 {
-		shortenedBy = len(objs) - 5
-		objs = objs[:5]
+	if len(list) > n {
+		shortenedBy = len(list) - n
+		list = list[:n]
 	}
-	stringList := []string{}
-	for _, obj := range objs {
-		stringList = append(stringList, obj.GetName())
+	stringList := []string{}
+	for _, p := range list {
+		stringList = append(stringList, stringFunc(p))
 	}
 
 	if shortenedBy > 0 {
@@ -128,3 +125,21 @@ func ObjNamesString[T client.Object](objs []T) string {
 
 	return strings.Join(stringList, ", ")
 }
+
+// StringListToString returns a comma separated list of the strings, limited to
+// five objects. On more than five objects it outputs the first five objects and
+// adds information about how much more are in the given list.
+func StringListToString(objs []string) string {
+	return ListToString(objs, func(s string) string {
+		return s
+	}, 5)
+}
+
+// ObjNamesString returns a comma separated list of the object names, limited to
+// five objects. On more than five objects it outputs the first five objects and
+// adds information about how much more are in the given list.
+func ObjNamesString[T client.Object](objs []T) string {
+	return ListToString(objs, func(obj T) string {
+		return obj.GetName()
+	}, 5)
+}
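
Notes (illustrative sketches, not part of the diff):

The Test_setReplicas cases pin down the counting rules for the new v1beta2 replica counters: a Machine only counts towards readyReplicas, availableReplicas or upToDateReplicas when the corresponding Machine condition is present with status True; False and Unknown do not count. A minimal counter consistent with those expectations (a sketch only, not the actual setReplicas implementation; it assumes the usual clusterv1 and apimachinery imports):

	package main

	import (
		"fmt"

		"k8s.io/apimachinery/pkg/api/meta"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

		clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	)

	// countTrue counts machines whose given v1beta2 condition is present and True,
	// mirroring the expectations encoded in Test_setReplicas.
	func countTrue(machines []*clusterv1.Machine, conditionType string) int32 {
		var n int32
		for _, m := range machines {
			if m.Status.V1Beta2 == nil {
				continue
			}
			if meta.IsStatusConditionTrue(m.Status.V1Beta2.Conditions, conditionType) {
				n++
			}
		}
		return n
	}

	func main() {
		machines := []*clusterv1.Machine{
			{Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{
				{Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionTrue},
			}}}},
			{Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{
				{Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionUnknown},
			}}}},
		}
		// Only the first Machine counts; Unknown (and False) are not "ready".
		fmt.Println(countTrue(machines, clusterv1.MachineReadyV1Beta2Condition)) // 1
	}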
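
All of the condition tests above assert through the same helper pattern: write with v1beta2conditions.Set, read back with v1beta2conditions.Get, and compare with MatchCondition while ignoring lastTransitionTime so the assertion stays stable across runs. Spelled out on its own (assuming it lives next to the tests above, in the same package and with the same imports as machineset_controller_status_test.go):

	func Test_conditionAssertionPattern(t *testing.T) {
		g := NewWithT(t)

		ms := &clusterv1.MachineSet{}
		v1beta2conditions.Set(ms, metav1.Condition{
			Type:   clusterv1.MachineSetScalingUpV1Beta2Condition,
			Status: metav1.ConditionFalse,
			Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason,
		})

		condition := v1beta2conditions.Get(ms, clusterv1.MachineSetScalingUpV1Beta2Condition)
		g.Expect(condition).ToNot(BeNil())
		// lastTransitionTime is ignored, exactly as in the tests above.
		g.Expect(*condition).To(v1beta2conditions.MatchCondition(metav1.Condition{
			Type:   clusterv1.MachineSetScalingUpV1Beta2Condition,
			Status: metav1.ConditionFalse,
			Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason,
		}, v1beta2conditions.IgnoreLastTransitionTime(true)))
	}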
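
The controller-test churn (syncMachines, reconcileUnhealthyMachines, syncReplicas, reconcileDelete, reconcileStatus) all comes from the same refactor: the per-phase functions now take a single scope instead of individual cluster/machineSet/machines arguments, and the tests build that scope directly. For orientation, the shape implied by the hunks (field set taken from the test changes; the authoritative definition lives in machineset_controller.go and may carry more fields):

	package machineset

	import (
		clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	)

	// scope holds the state a single MachineSet reconcile pass operates on.
	type scope struct {
		cluster                 *clusterv1.Cluster
		machineSet              *clusterv1.MachineSet
		machines                []*clusterv1.Machine
		owningMachineDeployment *clusterv1.MachineDeployment

		// Whether listing/adopting the controlled Machines succeeded; when false the
		// status helpers report Unknown conditions with the *InternalError* reasons
		// instead of computing counts and messages from stale data.
		getAndAdoptMachinesForMachineSetSucceeded bool
	}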
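
Finally, the util/log change generalizes the old name truncation into a generic ListToString, with StringListToString and ObjNamesString kept as thin wrappers that preserve the previous five-entry limit; the stale-machine messages asserted above ("stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more)") follow the same truncation pattern and are presumably built with it. A small usage sketch (the import alias is arbitrary; the exact "... (N more)" suffix is inferred from the messages asserted in the tests):

	package main

	import (
		"fmt"

		logutil "sigs.k8s.io/cluster-api/util/log"
	)

	func main() {
		names := []string{"m-1", "m-2", "m-3", "m-4", "m-5", "m-6", "m-7"}

		// Keep at most three entries and summarize the rest.
		fmt.Println(logutil.ListToString(names, func(s string) string { return s }, 3))
		// m-1, m-2, m-3, ... (4 more)

		// StringListToString is the five-entry convenience wrapper added in this patch.
		fmt.Println(logutil.StringListToString(names))
		// m-1, m-2, m-3, m-4, m-5, ... (2 more)
	}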