diff --git a/api/v1beta1/machineset_types.go b/api/v1beta1/machineset_types.go index e7dc0afd4767..093c47d0c700 100644 --- a/api/v1beta1/machineset_types.go +++ b/api/v1beta1/machineset_types.go @@ -87,6 +87,81 @@ type MachineSetSpec struct { Template MachineTemplateSpec `json:"template,omitempty"` } +// MachineSet's ScalingUp condition and corresponding reasons that will be used in v1Beta2 API version. +const ( + // MachineSetScalingUpV1Beta2Condition is true if actual replicas < desired replicas. + MachineSetScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition + + // MachineSetScalingUpV1Beta2Reason surfaces when actual replicas < desired replicas. + MachineSetScalingUpV1Beta2Reason = ScalingUpV1Beta2Reason + + // MachineSetNotScalingUpV1Beta2Reason surfaces when actual replicas >= desired replicas. + MachineSetNotScalingUpV1Beta2Reason = NotScalingUpV1Beta2Reason + + // MachineSetScalingUpInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines. + MachineSetScalingUpInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason + + // MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason surfaces when the .spec.replicas + // field of the MachineSet is not set. + MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason = WaitingForReplicasSetV1Beta2Reason +) + +// MachineSet's ScalingDown condition and corresponding reasons that will be used in v1Beta2 API version. +const ( + // MachineSetScalingDownV1Beta2Condition is true if actual replicas > desired replicas. + MachineSetScalingDownV1Beta2Condition = ScalingDownV1Beta2Condition + + // MachineSetScalingDownV1Beta2Reason surfaces when actual replicas > desired replicas. + MachineSetScalingDownV1Beta2Reason = ScalingDownV1Beta2Reason + + // MachineSetNotScalingDownV1Beta2Reason surfaces when actual replicas <= desired replicas. + MachineSetNotScalingDownV1Beta2Reason = NotScalingDownV1Beta2Reason + + // MachineSetScalingDownInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines. + MachineSetScalingDownInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason + + // MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason surfaces when the .spec.replicas + // field of the MachineSet is not set. + MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason = WaitingForReplicasSetV1Beta2Reason +) + +// MachineSet's MachinesReady condition and corresponding reasons that will be used in v1Beta2 API version. +// Note: Reasons could also be derived from the aggregation of machine's Ready conditions. +const ( + // MachineSetMachinesReadyV1Beta2Condition surfaces detail of issues on the controlled machines, if any. + MachineSetMachinesReadyV1Beta2Condition = MachinesReadyV1Beta2Condition + + // MachineSetMachinesReadyNoReplicasV1Beta2Reason surfaces when no machines exist for the MachineSet. + MachineSetMachinesReadyNoReplicasV1Beta2Reason = "NoReplicas" + + // MachineSetMachinesReadyInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines + // or aggregating machine's conditions. + MachineSetMachinesReadyInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason +) + +// MachineSet's MachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. +// Note: Reasons could also be derived from the aggregation of machine's UpToDate conditions. +const ( + // MachineSetMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any.
+ MachineSetMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition + + // MachineSetMachinesUpToDateNoReplicasV1Beta2Reason surfaces when no machines exist for the MachineSet. + MachineSetMachinesUpToDateNoReplicasV1Beta2Reason = "NoReplicas" + + // MachineSetMachinesUpToDateInternalErrorV1Beta2Reason surfaces unexpected failures when listing machines + // or aggregating status. + MachineSetMachinesUpToDateInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason +) + +// Conditions that will be used for the MachineSet object in v1Beta2 API version. +const ( + // MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any. + MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition + + // MachineSetDeletingV1Beta2Condition surfaces details about ongoing deletion of the controlled machines. + MachineSetDeletingV1Beta2Condition = DeletingV1Beta2Condition +) + // ANCHOR_END: MachineSetSpec // ANCHOR: MachineTemplateSpec diff --git a/api/v1beta1/v1beta2_condition_consts.go b/api/v1beta1/v1beta2_condition_consts.go index 5a0f887e2a64..a9ba73ad1802 100644 --- a/api/v1beta1/v1beta2_condition_consts.go +++ b/api/v1beta1/v1beta2_condition_consts.go @@ -90,6 +90,21 @@ const ( // AvailableV1Beta2Reason applies to a condition surfacing object availability. AvailableV1Beta2Reason = "Available" + // ScalingUpV1Beta2Reason surfaces when an object is scaling up. + ScalingUpV1Beta2Reason = "ScalingUp" + + // NotScalingUpV1Beta2Reason surfaces when an object is not scaling up. + NotScalingUpV1Beta2Reason = "NotScalingUp" + + // ScalingDownV1Beta2Reason surfaces when an object is scaling down. + ScalingDownV1Beta2Reason = "ScalingDown" + + // NotScalingDownV1Beta2Reason surfaces when an object is not scaling down. + NotScalingDownV1Beta2Reason = "NotScalingDown" + + // WaitingForReplicasSetV1Beta2Reason surfaces when the replicas field of an object is not set. + WaitingForReplicasSetV1Beta2Reason = "WaitingForReplicasToBeSet" + // InvalidConditionReportedV1Beta2Reason applies to a condition, usually read from an external object, that is invalid // (e.g. its status is missing). InvalidConditionReportedV1Beta2Reason = "InvalidConditionReported" @@ -141,27 +156,6 @@ const ( DeletionCompletedV1Beta2Reason = "DeletionCompleted" ) -// Conditions that will be used for the MachineSet object in v1Beta2 API version. -const ( - // MachineSetMachinesReadyV1Beta2Condition surfaces detail of issues on the controlled machines, if any. - MachineSetMachinesReadyV1Beta2Condition = MachinesReadyV1Beta2Condition - - // MachineSetMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. - MachineSetMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition - - // MachineSetScalingUpV1Beta2Condition is true if available replicas < desired replicas. - MachineSetScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition - - // MachineSetScalingDownV1Beta2Condition is true if replicas > desired replicas. - MachineSetScalingDownV1Beta2Condition = ScalingDownV1Beta2Condition - - // MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any. - MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition - - // MachineSetDeletingV1Beta2Condition surfaces details about ongoing deletion of the controlled machines.
- MachineSetDeletingV1Beta2Condition = DeletingV1Beta2Condition -) - // Conditions that will be used for the MachineDeployment object in v1Beta2 API version. const ( // MachineDeploymentAvailableV1Beta2Condition is true if the MachineDeployment is not deleted, and it has minimum diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 9babad1c13a3..9eb029ef43b4 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -328,7 +328,12 @@ func doReconcile(ctx context.Context, phases []machineReconcileFunc, s *scope) ( } res = util.LowestNonZeroResult(res, phaseResult) } - return res, kerrors.NewAggregate(errs) + + if len(errs) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errs) + } + + return res, nil } // scope holds the different objects that are read and used during the reconcile. diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index ec81faa40593..578dc978570f 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -140,7 +140,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt return nil } -func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retres ctrl.Result, reterr error) { machineSet := &clusterv1.MachineSet{} if err := r.Client.Get(ctx, req.NamespacedName, machineSet); err != nil { if apierrors.IsNotFound(err) { @@ -176,37 +176,119 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re return ctrl.Result{}, err } + s := &scope{ + cluster: cluster, + machineSet: machineSet, + } + // Initialize the patch helper - patchHelper, err := patch.NewHelper(machineSet, r.Client) + patchHelper, err := patch.NewHelper(s.machineSet, r.Client) if err != nil { return ctrl.Result{}, err } defer func() { + if err := r.reconcileStatus(ctx, s); err != nil { + reterr = kerrors.NewAggregate([]error{reterr, errors.Wrapf(err, "failed to update status")}) + } + + r.reconcileV1Beta2Status(ctx, s) + + // Adjust requeue when scaling up + if s.machineSet.DeletionTimestamp.IsZero() && reterr == nil { + retres = util.LowestNonZeroResult(retres, shouldRequeueForReplicaCountersRefresh(s)) + } + // Always attempt to patch the object and status after each reconciliation. - if err := patchMachineSet(ctx, patchHelper, machineSet); err != nil { + if err := patchMachineSet(ctx, patchHelper, s.machineSet); err != nil { reterr = kerrors.NewAggregate([]error{reterr, err}) } }() + if isDeploymentChild(s.machineSet) { + // If the MachineSet is in a MachineDeployment, try to get the owning MachineDeployment. 
+ s.owningMachineDeployment, err = r.getOwnerMachineDeployment(ctx, s.machineSet) + if err != nil { + return ctrl.Result{}, err + } + } + + alwaysReconcile := []machineSetReconcileFunc{ + wrapErrMachineSetReconcileFunc(r.reconcileMachineSetOwnerAndLabels, "failed to set MachineSet owner and labels"), + wrapErrMachineSetReconcileFunc(r.reconcileInfrastructure, "failed to reconcile infrastructure"), + wrapErrMachineSetReconcileFunc(r.reconcileBootstrapConfig, "failed to reconcile bootstrapConfig"), + wrapErrMachineSetReconcileFunc(r.getAndAdoptMachinesForMachineSet, "failed to get and adopt Machines for MachineSet"), + } + // Handle deletion reconciliation loop. - if !machineSet.DeletionTimestamp.IsZero() { - return ctrl.Result{}, r.reconcileDelete(ctx, machineSet) + if !s.machineSet.DeletionTimestamp.IsZero() { + reconcileDelete := append( + alwaysReconcile, + wrapErrMachineSetReconcileFunc(r.reconcileDelete, "failed to reconcile delete"), + ) + return doReconcile(ctx, s, reconcileDelete) } - result, err := r.reconcile(ctx, cluster, machineSet) + reconcileNormal := append(alwaysReconcile, + wrapErrMachineSetReconcileFunc(r.reconcileUnhealthyMachines, "failed to reconcile unhealthy machines"), + wrapErrMachineSetReconcileFunc(r.syncMachines, "failed to sync machines"), + wrapErrMachineSetReconcileFunc(r.syncReplicas, "failed to sync replicas"), + ) + + result, err := doReconcile(ctx, s, reconcileNormal) if err != nil { // Requeue if the reconcile failed because connection to workload cluster was down. if errors.Is(err, clustercache.ErrClusterNotConnected) { log.V(5).Info("Requeuing because connection to the workload cluster is down") return ctrl.Result{RequeueAfter: time.Minute}, nil } - r.recorder.Eventf(machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err) + r.recorder.Eventf(s.machineSet, corev1.EventTypeWarning, "ReconcileError", "%v", err) } return result, err } -func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet, options ...patch.Option) error { +type scope struct { + machineSet *clusterv1.MachineSet + cluster *clusterv1.Cluster + machines []*clusterv1.Machine + bootstrapObjectNotFound bool + infrastructureObjectNotFound bool + getAndAdoptMachinesForMachineSetSucceeded bool + owningMachineDeployment *clusterv1.MachineDeployment +} + +type machineSetReconcileFunc func(ctx context.Context, s *scope) (ctrl.Result, error) + +func wrapErrMachineSetReconcileFunc(f machineSetReconcileFunc, msg string) machineSetReconcileFunc { + return func(ctx context.Context, s *scope) (ctrl.Result, error) { + res, err := f(ctx, s) + return res, errors.Wrap(err, msg) + } +} + +func doReconcile(ctx context.Context, s *scope, phases []machineSetReconcileFunc) (ctrl.Result, error) { + res := ctrl.Result{} + errs := []error{} + for _, phase := range phases { + // Call the inner reconciliation methods. + phaseResult, err := phase(ctx, s) + if err != nil { + errs = append(errs, err) + } + if len(errs) > 0 { + continue + } + res = util.LowestNonZeroResult(res, phaseResult) + } + + if len(errs) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errs) + } + + return res, nil +} + +func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet *clusterv1.MachineSet) error { // Always update the readyCondition by summarizing the state of other conditions.
conditions.SetSummary(machineSet, conditions.WithConditions( @@ -217,18 +299,28 @@ func patchMachineSet(ctx context.Context, patchHelper *patch.Helper, machineSet ) // Patch the object, ignoring conflicts on the conditions owned by this controller. - options = append(options, + options := []patch.Option{ patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{ clusterv1.ReadyCondition, clusterv1.MachinesCreatedCondition, clusterv1.ResizedCondition, clusterv1.MachinesReadyCondition, }}, - ) + patch.WithOwnedV1Beta2Conditions{Conditions: []string{ + clusterv1.MachineSetScalingUpV1Beta2Condition, + clusterv1.MachineSetScalingDownV1Beta2Condition, + clusterv1.MachineSetMachinesReadyV1Beta2Condition, + clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + clusterv1.MachineSetRemediatingV1Beta2Condition, + clusterv1.MachineSetDeletingV1Beta2Condition, + }}, + } return patchHelper.Patch(ctx, machineSet, options...) } -func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet *clusterv1.MachineSet) (ctrl.Result, error) { +func (r *Reconciler) reconcileMachineSetOwnerAndLabels(_ context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + cluster := s.cluster // Reconcile and retrieve the Cluster object. if machineSet.Labels == nil { machineSet.Labels = make(map[string]string) @@ -246,17 +338,6 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, })) } - // Make sure to reconcile the external infrastructure reference. - if err := r.reconcileExternalTemplateReference(ctx, cluster, machineSet, &machineSet.Spec.Template.Spec.InfrastructureRef); err != nil { - return ctrl.Result{}, err - } - // Make sure to reconcile the external bootstrap reference, if any. - if machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil { - if err := r.reconcileExternalTemplateReference(ctx, cluster, machineSet, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef); err != nil { - return ctrl.Result{}, err - } - } - // Make sure selector and template to be in the same cluster. if machineSet.Spec.Selector.MatchLabels == nil { machineSet.Spec.Selector.MatchLabels = make(map[string]string) @@ -269,97 +350,70 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, machineSet.Spec.Selector.MatchLabels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName machineSet.Spec.Template.Labels[clusterv1.ClusterNameLabel] = machineSet.Spec.ClusterName - machines, err := r.getAndAdoptMachinesForMachineSet(ctx, machineSet) - if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to list Machines") - } - - result := ctrl.Result{} + return ctrl.Result{}, nil +} - reconcileUnhealthyMachinesResult, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) +func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctrl.Result, error) { + cluster := s.cluster + machineSet := s.machineSet + // Make sure to reconcile the external infrastructure reference. 
+ var err error + s.infrastructureObjectNotFound, err = r.reconcileExternalTemplateReference(ctx, cluster, machineSet, s.owningMachineDeployment, &machineSet.Spec.Template.Spec.InfrastructureRef) if err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to reconcile unhealthy machines") - } - result = util.LowestNonZeroResult(result, reconcileUnhealthyMachinesResult) - - if err := r.syncMachines(ctx, machineSet, machines); err != nil { - return ctrl.Result{}, errors.Wrap(err, "failed to update Machines") - } - - syncReplicasResult, syncErr := r.syncReplicas(ctx, cluster, machineSet, machines) - result = util.LowestNonZeroResult(result, syncReplicasResult) - - // Always updates status as machines come up or die. - if err := r.updateStatus(ctx, cluster, machineSet, machines); err != nil { - return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate([]error{err, syncErr}), "failed to update MachineSet's Status") - } - - if syncErr != nil { - return ctrl.Result{}, errors.Wrapf(syncErr, "failed to sync MachineSet replicas") - } - - var replicas int32 - if machineSet.Spec.Replicas != nil { - replicas = *machineSet.Spec.Replicas + return ctrl.Result{}, err } + return ctrl.Result{}, nil +} - // Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew. - // Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica. - // A replica is available if the amount of time since last transition exceeds MinReadySeconds. - // If there was a clock skew, checking whether the amount of time since last transition to ready state - // exceeds MinReadySeconds could be incorrect. - // To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds, - // at which point it should confirm any available replica to be available. - if machineSet.Spec.MinReadySeconds > 0 && - machineSet.Status.ReadyReplicas == replicas && - machineSet.Status.AvailableReplicas != replicas { - minReadyResult := ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second} - result = util.LowestNonZeroResult(result, minReadyResult) - return result, nil +func (r *Reconciler) reconcileBootstrapConfig(ctx context.Context, s *scope) (ctrl.Result, error) { + cluster := s.cluster + machineSet := s.machineSet + // Make sure to reconcile the external bootstrap reference, if any. + if s.machineSet.Spec.Template.Spec.Bootstrap.ConfigRef != nil { + var err error + s.bootstrapObjectNotFound, err = r.reconcileExternalTemplateReference(ctx, cluster, machineSet, s.owningMachineDeployment, machineSet.Spec.Template.Spec.Bootstrap.ConfigRef) + if err != nil { + return ctrl.Result{}, err + } } + return ctrl.Result{}, nil +} - // Quickly reconcile until the nodes become Ready. - if machineSet.Status.ReadyReplicas != replicas { - result = util.LowestNonZeroResult(result, ctrl.Result{RequeueAfter: 15 * time.Second}) - return result, nil +func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + machineList := s.machines + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil } - return result, nil -} - -func (r *Reconciler) reconcileDelete(ctx context.Context, machineSet *clusterv1.MachineSet) error { log := ctrl.LoggerFrom(ctx) - machineList, err := r.getAndAdoptMachinesForMachineSet(ctx, machineSet) - if err != nil { - return err - } // If all the descendant machines are deleted, then remove the machineset's finalizer. 
if len(machineList) == 0 { controllerutil.RemoveFinalizer(machineSet, clusterv1.MachineSetFinalizer) - return nil + return ctrl.Result{}, nil } - log.Info("Waiting for Machines to be deleted", "Machines", clog.ObjNamesString(machineList)) - // else delete owned machines. for _, machine := range machineList { if machine.DeletionTimestamp.IsZero() { log.Info("Deleting Machine", "Machine", klog.KObj(machine)) if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) { - return errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine)) } } } - return nil + log.Info("Waiting for Machines to be deleted", "Machines", clog.ObjNamesString(machineList)) + return ctrl.Result{}, nil } -func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machineSet *clusterv1.MachineSet) ([]*clusterv1.Machine, error) { +func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet log := ctrl.LoggerFrom(ctx) selectorMap, err := metav1.LabelSelectorAsMap(&machineSet.Spec.Selector) if err != nil { - return nil, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name) + return ctrl.Result{}, errors.Wrapf(err, "failed to convert MachineSet %q label selector to a map", machineSet.Name) } // Get all Machines linked to this MachineSet. @@ -370,7 +424,7 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi client.MatchingLabels(selectorMap), ) if err != nil { - return nil, errors.Wrap(err, "failed to list machines") + return ctrl.Result{}, errors.Wrap(err, "failed to list machines") } // Filter out irrelevant machines (i.e. IsControlledBy something else) and claim orphaned machines. @@ -398,7 +452,10 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi filteredMachines = append(filteredMachines, machine) } - return filteredMachines, nil + s.machines = filteredMachines + s.getAndAdoptMachinesForMachineSetSucceeded = true + + return ctrl.Result{}, nil } // syncMachines updates Machines, InfrastructureMachine and BootstrapConfig to propagate in-place mutable fields @@ -409,7 +466,13 @@ func (r *Reconciler) getAndAdoptMachinesForMachineSet(ctx context.Context, machi // "metadata.annotations" from "manager" so that "capi-machineset" can own these fields and can work with SSA. // Otherwise fields would be co-owned by our "old" "manager" and "capi-machineset" and then we would not be // able to e.g. drop labels and annotations. -func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine) error { +func (r *Reconciler) syncMachines(ctx context.Context, s *scope) (ctrl.Result, error) { + machineSet := s.machineSet + machines := s.machines + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + log := ctrl.LoggerFrom(ctx) for i := range machines { m := machines[i] @@ -418,7 +481,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac if !m.DeletionTimestamp.IsZero() { patchHelper, err := patch.NewHelper(m, r.Client) if err != nil { - return err + return ctrl.Result{}, err } // Set all other in-place mutable fields that impact the ability to tear down existing machines. 
@@ -428,7 +491,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac m.Spec.NodeVolumeDetachTimeout = machineSet.Spec.Template.Spec.NodeVolumeDetachTimeout if err := patchHelper.Patch(ctx, m); err != nil { - return err + return ctrl.Result{}, err } continue } @@ -438,7 +501,7 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac // (< v1.4.0) can also work with SSA. Otherwise, fields would be co-owned by our "old" "manager" and // "capi-machineset" and then we would not be able to e.g. drop labels and annotations. if err := ssa.CleanUpManagedFieldsForSSAAdoption(ctx, r.Client, m, machineSetManagerName); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the Machine %q", m.Name) } // Update Machine to propagate in-place mutable fields from the MachineSet. @@ -446,13 +509,13 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac err := ssa.Patch(ctx, r.Client, machineSetManagerName, updatedMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: m}) if err != nil { log.Error(err, "Failed to update Machine", "Machine", klog.KObj(updatedMachine)) - return errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update Machine %q", klog.KObj(updatedMachine)) } machines[i] = updatedMachine infraMachine, err := external.Get(ctx, r.Client, &updatedMachine.Spec.InfrastructureRef, updatedMachine.Namespace) if err != nil { - return errors.Wrapf(err, "failed to get InfrastructureMachine %s", + return ctrl.Result{}, errors.Wrapf(err, "failed to get InfrastructureMachine %s", klog.KRef(updatedMachine.Spec.InfrastructureRef.Namespace, updatedMachine.Spec.InfrastructureRef.Name)) } // Cleanup managed fields of all InfrastructureMachines to drop ownership of labels and annotations @@ -464,17 +527,17 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac {"f:metadata", "f:labels"}, } if err := ssa.DropManagedFields(ctx, r.Client, infraMachine, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the InfrastructureMachine %s", klog.KObj(infraMachine)) } // Update in-place mutating fields on InfrastructureMachine. 
if err := r.updateExternalObject(ctx, infraMachine, machineSet); err != nil { - return errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update InfrastructureMachine %s", klog.KObj(infraMachine)) } if updatedMachine.Spec.Bootstrap.ConfigRef != nil { bootstrapConfig, err := external.Get(ctx, r.Client, updatedMachine.Spec.Bootstrap.ConfigRef, updatedMachine.Namespace) if err != nil { - return errors.Wrapf(err, "failed to get BootstrapConfig %s", + return ctrl.Result{}, errors.Wrapf(err, "failed to get BootstrapConfig %s", klog.KRef(updatedMachine.Spec.Bootstrap.ConfigRef.Namespace, updatedMachine.Spec.Bootstrap.ConfigRef.Name)) } // Cleanup managed fields of all BootstrapConfigs to drop ownership of labels and annotations @@ -482,22 +545,29 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac // can also work with SSA. Otherwise, labels and annotations would be co-owned by our "old" "manager" // and "capi-machineset" and then we would not be able to e.g. drop labels and annotations. if err := ssa.DropManagedFields(ctx, r.Client, bootstrapConfig, machineSetManagerName, labelsAndAnnotationsManagedFieldPaths); err != nil { - return errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update machine: failed to adjust the managedFields of the BootstrapConfig %s", klog.KObj(bootstrapConfig)) } // Update in-place mutating fields on BootstrapConfig. if err := r.updateExternalObject(ctx, bootstrapConfig, machineSet); err != nil { - return errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig)) + return ctrl.Result{}, errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(bootstrapConfig)) } } } - return nil + return ctrl.Result{}, nil } // syncReplicas scales Machine resources up or down. -func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) (ctrl.Result, error) { +func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, error) { + ms := s.machineSet + machines := s.machines + cluster := s.cluster + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + log := ctrl.LoggerFrom(ctx) if ms.Spec.Replicas == nil { - return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name) + return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for MachineSet %v is nil, this should not be allowed", ms.Name) } diff := len(machines) - int(*(ms.Spec.Replicas)) switch { @@ -921,11 +991,19 @@ func (r *Reconciler) getMachineSetsForMachine(ctx context.Context, m *clusterv1. } // isDeploymentChild returns true if the MachineSet originated from a MachineDeployment by checking its labels. -func (r *Reconciler) isDeploymentChild(ms *clusterv1.MachineSet) bool { +func isDeploymentChild(ms *clusterv1.MachineSet) bool { _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel] return ok } +// isCurrentMachineSet returns true if the MachineSet's and MachineDeployments revision are equal. 
+func isCurrentMachineSet(ms *clusterv1.MachineSet, md *clusterv1.MachineDeployment) bool { + if md == nil { + return false + } + return md.Annotations[clusterv1.RevisionAnnotation] == ms.Annotations[clusterv1.RevisionAnnotation] +} + // shouldAdopt returns true if the MachineSet should be adopted as a stand-alone MachineSet directly owned by the Cluster. func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // if the MachineSet is controlled by a MachineDeployment, or if it is a stand-alone MachinesSet directly owned by the Cluster, then no-op. @@ -936,12 +1014,24 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { // If the MachineSet is originated by a MachineDeployment object, it should not be adopted directly by the Cluster as a stand-alone MachineSet. // Note: this is required because after restore from a backup both the MachineSet controller and the // MachineDeployment controller are racing to adopt MachineSets, see https://github.com/kubernetes-sigs/cluster-api/issues/7529 - return !r.isDeploymentChild(ms) + return !isDeploymentChild(ms) } -// updateStatus updates the Status field for the MachineSet +// reconcileStatus updates the Status field for the MachineSet // It checks for the current state of the replicas and updates the Status of the MachineSet. -func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error { +func (r *Reconciler) reconcileStatus(ctx context.Context, s *scope) error { + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return nil + } + + ms := s.machineSet + filteredMachines := s.machines + cluster := s.cluster + + if ms.Spec.Replicas == nil { + return errors.New("cannot update status when MachineSet .spec.replicas is not set") + } + log := ctrl.LoggerFrom(ctx) newStatus := ms.Status.DeepCopy() @@ -1045,6 +1135,31 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste return nil } +func shouldRequeueForReplicaCountersRefresh(s *scope) ctrl.Result { + replicas := ptr.Deref(s.machineSet.Spec.Replicas, 0) + + // Resync the MachineSet after MinReadySeconds as a last line of defense to guard against clock-skew. + // Clock-skew is an issue as it may impact whether an available replica is counted as a ready replica. + // A replica is available if the amount of time since last transition exceeds MinReadySeconds. + // If there was a clock skew, checking whether the amount of time since last transition to ready state + // exceeds MinReadySeconds could be incorrect. + // To avoid an available replica stuck in the ready state, we force a reconcile after MinReadySeconds, + // at which point it should confirm any available replica to be available. + if s.machineSet.Spec.MinReadySeconds > 0 && + s.machineSet.Status.ReadyReplicas == replicas && + s.machineSet.Status.AvailableReplicas != replicas { + minReadyResult := ctrl.Result{RequeueAfter: time.Duration(s.machineSet.Spec.MinReadySeconds) * time.Second} + return minReadyResult + } + + // Quickly reconcile until the nodes become Ready. 
+ if s.machineSet.Status.ReadyReplicas != replicas { + return ctrl.Result{RequeueAfter: 15 * time.Second} + } + + return ctrl.Result{} +} + func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) (*corev1.Node, error) { remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { @@ -1057,7 +1172,15 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus return node, nil } -func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) (ctrl.Result, error) { +func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (ctrl.Result, error) { + if !s.getAndAdoptMachinesForMachineSetSucceeded { + return ctrl.Result{}, nil + } + + cluster := s.cluster + ms := s.machineSet + filteredMachines := s.machines + owner := s.owningMachineDeployment log := ctrl.LoggerFrom(ctx) // Calculate how many in flight machines we should remediate. @@ -1066,12 +1189,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl // If the MachineSet is part of a MachineDeployment, only allow remediations if // it's the desired revision. - if r.isDeploymentChild(ms) { - owner, err := r.getOwnerMachineDeployment(ctx, ms) - if err != nil { - return ctrl.Result{}, err - } - + if isDeploymentChild(ms) { if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { // MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed. return ctrl.Result{}, nil @@ -1185,38 +1303,40 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, cluster *cl return ctrl.Result{}, nil } -func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, ref *corev1.ObjectReference) error { +func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, owner *clusterv1.MachineDeployment, ref *corev1.ObjectReference) (objectNotFound bool, err error) { if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) { - return nil + return false, nil } if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil { - return err + return false, err } obj, err := external.Get(ctx, r.Client, ref, cluster.Namespace) if err != nil { if apierrors.IsNotFound(err) { - if _, ok := ms.Labels[clusterv1.MachineDeploymentNameLabel]; !ok { + if !ms.DeletionTimestamp.IsZero() { + // Tolerate object not found when the machineSet is being deleted. + return true, nil + } + + if owner == nil { // If the MachineSet is not in a MachineDeployment, return the error immediately. - return err + return true, err } // When the MachineSet is part of a MachineDeployment but isn't the current revision, we should // ignore the not found references and allow the controller to proceed. 
- owner, err := r.getOwnerMachineDeployment(ctx, ms) - if err != nil { - return err - } - if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] { - return nil + if !isCurrentMachineSet(ms, owner) { + return true, nil } + return true, err } - return err + return false, err } patchHelper, err := patch.NewHelper(obj, r.Client) if err != nil { - return err + return false, err } obj.SetOwnerReferences(util.EnsureOwnerRef(obj.GetOwnerReferences(), metav1.OwnerReference{ @@ -1226,5 +1346,5 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu UID: cluster.UID, })) - return patchHelper.Patch(ctx, obj) + return false, patchHelper.Patch(ctx, obj) } diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go new file mode 100644 index 000000000000..14ddfc2e4345 --- /dev/null +++ b/internal/controllers/machineset/machineset_controller_status.go @@ -0,0 +1,350 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machineset + +import ( + "context" + "fmt" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" + clog "sigs.k8s.io/cluster-api/util/log" +) + +// reconcileV1Beta2Status reconciles MachineSet's status during the entire lifecycle of the MachineSet. +// Additionally, this func should ensure that the conditions managed by this controller are always set in order to +// comply with the recommendation in the Kubernetes API guidelines. +// Note: v1beta1 conditions are not managed by this func. +func (r *Reconciler) reconcileV1Beta2Status(ctx context.Context, s *scope) { + // Update the following fields in status from the machines list. + // - v1beta2.readyReplicas + // - v1beta2.availableReplicas + // - v1beta2.upToDateReplicas + setReplicas(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // Conditions + + // Update the ScalingUp and ScalingDown condition. + setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded) + setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // MachinesReady condition: aggregate the Machine's Ready condition. + setMachinesReadyCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // MachinesUpToDate condition: aggregate the Machine's UpToDate condition. 
+ setMachinesUpToDateCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded) + + // TODO Deleting +} + +func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // Return early when getAndAdoptMachinesForMachineSetSucceeded is false because it's not possible to calculate replica counters. + if !getAndAdoptMachinesForMachineSetSucceeded { + return + } + + var readyReplicas, availableReplicas, upToDateReplicas int32 + for _, machine := range machines { + if v1beta2conditions.IsTrue(machine, clusterv1.MachineReadyV1Beta2Condition) { + readyReplicas++ + } + if v1beta2conditions.IsTrue(machine, clusterv1.MachineAvailableV1Beta2Condition) { + availableReplicas++ + } + if v1beta2conditions.IsTrue(machine, clusterv1.MachineUpToDateV1Beta2Condition) { + upToDateReplicas++ + } + } + + if ms.Status.V1Beta2 == nil { + ms.Status.V1Beta2 = &clusterv1.MachineSetV1Beta2Status{} + } + + ms.Status.V1Beta2.ReadyReplicas = ptr.To(readyReplicas) + ms.Status.V1Beta2.AvailableReplicas = ptr.To(availableReplicas) + ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas) +} + +func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingUpInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + // Surface if .spec.replicas is not yet set (this should never happen). + if ms.Spec.Replicas == nil { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingUpWaitingForReplicasSetV1Beta2Reason, + }) + return + } + + desiredReplicas := *ms.Spec.Replicas + if !ms.DeletionTimestamp.IsZero() { + desiredReplicas = 0 + } + currentReplicas := ms.Status.Replicas + + missingReferencesMessage := calculateMissingReferencesMessage(ms, bootstrapObjectNotFound, infrastructureObjectNotFound) + + if currentReplicas >= desiredReplicas { + var message string + if missingReferencesMessage != "" { + message = fmt.Sprintf("Scaling up can't happen %s", missingReferencesMessage) + } + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingUpV1Beta2Reason, + Message: message, + }) + return + } + + // Scaling up. 
+ v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingUpV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingUpV1Beta2Reason, + Message: bulletMessage( + fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas), + aggregateMachinesFunc( + machines, + func(machine *clusterv1.Machine) bool { + return machine.Status.NodeRef == nil && time.Since(machine.GetCreationTimestamp().Time) >= time.Minute*30 + }, + "not reporting a .status.nodeRef for more than 30 minutes", + ), + func() string { + if missingReferencesMessage != "" { + return fmt.Sprintf("Scaling up is blocked %s", missingReferencesMessage) + } + return "" + }), + }) +} + +func setScalingDownCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingDownInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + // Surface if .spec.replicas is not yet set. + // This could e.g. be the case when using autoscaling with MachineSets. + if ms.Spec.Replicas == nil { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetScalingDownWaitingForReplicasSetV1Beta2Reason, + }) + return + } + + desiredReplicas := *ms.Spec.Replicas + // Deletion is equal to 0 desired replicas. + if !ms.DeletionTimestamp.IsZero() { + desiredReplicas = 0 + } + + // Scaling down. + if int32(len(machines)) > desiredReplicas { + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: bulletMessage( + fmt.Sprintf("Scaling down from %d to %d replicas", len(machines), desiredReplicas), + aggregateMachinesFunc( + machines, + func(machine *clusterv1.Machine) bool { + return !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) >= time.Minute*30 + }, + "reported in deletion for more than 30 minutes", + ), + ), + }) + return + } + + // Not scaling down. + v1beta2conditions.Set(ms, metav1.Condition{ + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineSetNotScalingDownV1Beta2Reason, + }) +} + +func setMachinesReadyCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them.
+ if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesReadyInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + if len(machines) == 0 { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesReadyNoReplicasV1Beta2Reason, + }) + return + } + + readyCondition, err := v1beta2conditions.NewAggregateCondition( + machines, clusterv1.MachineReadyV1Beta2Condition, + v1beta2conditions.TargetConditionType(clusterv1.MachineSetMachinesReadyV1Beta2Condition), + ) + if err != nil { + log := ctrl.LoggerFrom(ctx) + log.Error(err, "Failed to aggregate Machine's Ready conditions") + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesReadyV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesReadyInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + v1beta2conditions.Set(machineSet, *readyCondition) +} + +func setMachinesUpToDateCondition(ctx context.Context, machineSet *clusterv1.MachineSet, machines []*clusterv1.Machine, getAndAdoptMachinesForMachineSetSucceeded bool) { + // If we got unexpected errors in listing the machines (this should never happen), surface them. + if !getAndAdoptMachinesForMachineSetSucceeded { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesUpToDateInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + if len(machines) == 0 { + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetMachinesUpToDateNoReplicasV1Beta2Reason, + }) + return + } + + upToDateCondition, err := v1beta2conditions.NewAggregateCondition( + machines, clusterv1.MachineUpToDateV1Beta2Condition, + v1beta2conditions.TargetConditionType(clusterv1.MachineSetMachinesUpToDateV1Beta2Condition), + ) + if err != nil { + log := ctrl.LoggerFrom(ctx) + log.Error(err, "Failed to aggregate Machine's UpToDate conditions") + v1beta2conditions.Set(machineSet, metav1.Condition{ + Type: clusterv1.MachineSetMachinesUpToDateV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: clusterv1.MachineSetMachinesUpToDateInternalErrorV1Beta2Reason, + Message: "Please check controller logs for errors", + }) + return + } + + v1beta2conditions.Set(machineSet, *upToDateCondition) +} + +func calculateMissingReferencesMessage(ms *clusterv1.MachineSet, bootstrapTemplateNotFound, infraMachineTemplateNotFound bool) string { + missingObjects := []string{} + if bootstrapTemplateNotFound { + missingObjects = append(missingObjects, ms.Spec.Template.Spec.Bootstrap.ConfigRef.Kind) + } + if infraMachineTemplateNotFound { + missingObjects = append(missingObjects, ms.Spec.Template.Spec.InfrastructureRef.Kind) + } + + if len(missingObjects) == 0 { + return "" + } + + if len(missingObjects) == 1 { + return fmt.Sprintf("because %s does not exist", missingObjects[0]) + } + + return fmt.Sprintf("because %s do not exist", strings.Join(missingObjects, " and ")) +} + +func bulletMessage(header
string, bulletGenerators ...func() string) string { + lines := []string{} + + for _, bulletGenerator := range bulletGenerators { + if s := bulletGenerator(); s != "" { + lines = append(lines, fmt.Sprintf("* %s", s)) + } + } + + if len(lines) == 0 { + return header + } + + lines = append([]string{header}, lines...) + return strings.Join(lines, "\n") +} + +func aggregateMachinesFunc(machines []*clusterv1.Machine, filter func(*clusterv1.Machine) bool, suffix string) func() string { + return func() string { + machineNames := []string{} + for _, machine := range machines { + if filter(machine) { + machineNames = append(machineNames, machine.GetName()) + } + } + + if len(machineNames) == 0 { + return "" + } + + prefix := "Machine" + if len(machineNames) > 1 { + prefix += "s" + } + + return strings.Join([]string{prefix, clog.StringListToString(machineNames), suffix}, " ") + } +} diff --git a/internal/controllers/machineset/machineset_controller_test.go b/internal/controllers/machineset/machineset_controller_test.go index a9b1791c6792..6f0aa23a9817 100644 --- a/internal/controllers/machineset/machineset_controller_test.go +++ b/internal/controllers/machineset/machineset_controller_test.go @@ -589,6 +589,7 @@ func TestMachineSetReconcile(t *testing.T) { }, Spec: clusterv1.MachineSetSpec{ ClusterName: testClusterName, + Replicas: ptr.To[int32](0), }, Status: clusterv1.MachineSetStatus{ V1Beta2: &clusterv1.MachineSetV1Beta2Status{Conditions: []metav1.Condition{{ @@ -616,8 +617,10 @@ func TestMachineSetReconcile(t *testing.T) { g := NewWithT(t) ms := newMachineSet("machineset1", testClusterName, int32(0)) - ms.Spec.Selector.MatchLabels = map[string]string{ - "--$-invalid": "true", + ms.Spec.Template.Spec.Bootstrap.ConfigRef = &corev1.ObjectReference{ + Kind: "FooTemplate", + Namespace: ms.GetNamespace(), + Name: "doesnotexist", } request := reconcile.Request{ @@ -1058,8 +1061,14 @@ func TestMachineSetReconciler_updateStatusResizedCondition(t *testing.T) { Client: c, recorder: record.NewFakeRecorder(32), } - err := msr.updateStatus(ctx, cluster, tc.machineSet, tc.machines) - g.Expect(err).ToNot(HaveOccurred()) + s := &scope{ + cluster: cluster, + machineSet: tc.machineSet, + machines: tc.machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + setReplicas(ctx, s.machineSet, s.machines, tc.machines != nil) + g.Expect(msr.reconcileStatus(ctx, s)).To(Succeed()) gotCond := conditions.Get(tc.machineSet, clusterv1.ResizedCondition) g.Expect(gotCond).ToNot(BeNil()) g.Expect(gotCond.Status).To(Equal(corev1.ConditionFalse)) @@ -1296,7 +1305,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { Client: env, ssaCache: ssa.NewCache(), } - g.Expect(reconciler.syncMachines(ctx, ms, machines)).To(Succeed()) + s := &scope{ + machineSet: ms, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) // The inPlaceMutatingMachine should have cleaned up managed fields.
updatedInPlaceMutatingMachine := inPlaceMutatingMachine.DeepCopy() @@ -1372,7 +1387,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { ms.Spec.Template.Spec.NodeDrainTimeout = duration10s ms.Spec.Template.Spec.NodeDeletionTimeout = duration10s ms.Spec.Template.Spec.NodeVolumeDetachTimeout = duration10s - g.Expect(reconciler.syncMachines(ctx, ms, []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine})).To(Succeed()) + s = &scope{ + machineSet: ms, + machines: []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine}, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) // Verify in-place mutable fields are updated on the Machine. updatedInPlaceMutatingMachine = inPlaceMutatingMachine.DeepCopy() @@ -1452,7 +1473,13 @@ func TestMachineSetReconciler_syncMachines(t *testing.T) { ms.Spec.Template.Spec.NodeDrainTimeout = duration11s ms.Spec.Template.Spec.NodeDeletionTimeout = duration11s ms.Spec.Template.Spec.NodeVolumeDetachTimeout = duration11s - g.Expect(reconciler.syncMachines(ctx, ms, []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine})).To(Succeed()) + s = &scope{ + machineSet: ms, + machines: []*clusterv1.Machine{updatedInPlaceMutatingMachine, deletingMachine}, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = reconciler.syncMachines(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) updatedDeletingMachine := deletingMachine.DeepCopy() g.Expect(env.GetAPIReader().Get(ctx, client.ObjectKeyFromObject(updatedDeletingMachine), updatedDeletingMachine)).To(Succeed()) @@ -1519,7 +1546,15 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { r := &Reconciler{ Client: fakeClient, } - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine is deleted. m := &clusterv1.Machine{} @@ -1577,7 +1612,13 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { r := &Reconciler{ Client: fakeClient, } - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, machines) + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: machines, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine has the updated condition. @@ -1683,8 +1724,16 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { Client: fakeClient, } + s := &scope{ + cluster: cluster, + machineSet: machineSetOld, + machines: machines, + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + // Test first with the old MachineSet. - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSetOld, machines) + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) condition := clusterv1.MachineOwnerRemediatedCondition @@ -1707,7 +1756,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { To(BeFalse(), "Machine should not have the %s condition set", condition) // Test with the current MachineSet. 
- _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSetCurrent, machines) + s = &scope{ + cluster: cluster, + machineSet: machineSetCurrent, + machines: machines, + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Verify the unhealthy machine has been deleted. @@ -1814,7 +1870,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // // First pass. // - _, err := r.reconcileUnhealthyMachines(ctx, cluster, machineSet, append(unhealthyMachines, healthyMachine)) + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: append(unhealthyMachines, healthyMachine), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err := r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) condition := clusterv1.MachineOwnerRemediatedCondition @@ -1865,7 +1928,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { return } - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) var validateSecondPass = func(cleanFinalizer bool) { @@ -1912,7 +1982,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // Perform another pass with the same exact configuration. // This is testing that, given that we have Machines that are being deleted and are in flight, // we have reached the maximum amount of tokens we have and we should wait to remediate the rest. - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Validate and remove finalizers for in flight machines. @@ -1926,7 +2003,14 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) { // Call again to verify that the remaining unhealthy machines are deleted, // at this point all unhealthy machines should be deleted given the max in flight // is greater than the number of unhealthy machines. - _, err = r.reconcileUnhealthyMachines(ctx, cluster, machineSet, allMachines()) + s = &scope{ + cluster: cluster, + machineSet: machineSet, + machines: allMachines(), + owningMachineDeployment: machineDeployment, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + _, err = r.reconcileUnhealthyMachines(ctx, s) g.Expect(err).ToNot(HaveOccurred()) // Iterate over the unhealthy machines and verify that all were deleted. 
@@ -1976,7 +2060,13 @@ func TestMachineSetReconciler_syncReplicas(t *testing.T) { r := &Reconciler{ Client: fakeClient, } - result, err := r.syncReplicas(ctx, cluster, machineSet, nil) + s := &scope{ + cluster: cluster, + machineSet: machineSet, + machines: []*clusterv1.Machine{}, + getAndAdoptMachinesForMachineSetSucceeded: true, + } + result, err := r.syncReplicas(ctx, s) g.Expect(err).ToNot(HaveOccurred()) g.Expect(result.IsZero()).To(BeFalse(), "syncReplicas should not return a 'zero' result") @@ -2217,7 +2307,14 @@ func TestReconciler_reconcileDelete(t *testing.T) { recorder: record.NewFakeRecorder(32), } - err := r.reconcileDelete(ctx, tt.machineSet) + s := &scope{ + machineSet: tt.machineSet, + } + + // populate s.machines + _, err := r.getAndAdoptMachinesForMachineSet(ctx, s) + g.Expect(err).ToNot(HaveOccurred()) + _, err = r.reconcileDelete(ctx, s) if tt.expectError { g.Expect(err).To(HaveOccurred()) } else { diff --git a/util/log/log.go b/util/log/log.go index 11a41c460361..5f7432e51f0a 100644 --- a/util/log/log.go +++ b/util/log/log.go @@ -107,19 +107,16 @@ func getOwners(ctx context.Context, c client.Client, obj metav1.Object) ([]owner return owners, nil } -// ObjNamesString returns a comma separated list of the object names, limited to -// five objects. On more than five objects it outputs the first five objects and -// adds information about how much more are in the given list. -func ObjNamesString[T client.Object](objs []T) string { +// ListToString returns a comma-separated list of the first n entries of the list (strings are calculated via stringFunc). +func ListToString[T any](list []T, stringFunc func(T) string, n int) string { shortenedBy := 0 - if len(objs) > 5 { - shortenedBy = len(objs) - 5 - objs = objs[:5] + if len(list) > n { + shortenedBy = len(list) - n + list = list[:n] } stringList := []string{} - for _, obj := range objs { - stringList = append(stringList, obj.GetName()) + for _, p := range list { + stringList = append(stringList, stringFunc(p)) } if shortenedBy > 0 { @@ -128,3 +125,21 @@ } return strings.Join(stringList, ", ") } + +// StringListToString returns a comma separated list of the strings, limited to +// five objects. On more than five objects it outputs the first five objects and +// adds information about how many more are in the given list. +func StringListToString(objs []string) string { + return ListToString(objs, func(s string) string { + return s + }, 5) +} + +// ObjNamesString returns a comma separated list of the object names, limited to +// five objects. On more than five objects it outputs the first five objects and +// adds information about how many more are in the given list. +func ObjNamesString[T client.Object](objs []T) string { + return ListToString(objs, func(obj T) string { + return obj.GetName() + }, 5) +}
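Example (not part of the patch, illustration only): the util/log change above generalizes ObjNamesString into the new generic ListToString helper and adds StringListToString, which the MachineSet status code uses via aggregateMachinesFunc to keep condition messages short. Below is a minimal sketch of how the exported helpers could be called, assuming the import path already used in machineset_controller_status.go; the machine names and the replica type are hypothetical.

package main

import (
	"fmt"

	clog "sigs.k8s.io/cluster-api/util/log"
)

func main() {
	// More than five entries: the helper prints the first five names plus a note
	// about how many entries were omitted (exact wording comes from the existing helper).
	machineNames := []string{"ms-1-a", "ms-1-b", "ms-1-c", "ms-1-d", "ms-1-e", "ms-1-f", "ms-1-g"}
	fmt.Println(clog.StringListToString(machineNames))

	// ListToString is generic, so any slice works as long as a string function and a limit are provided.
	type replica struct{ name string }
	replicas := []replica{{name: "r-0"}, {name: "r-1"}}
	fmt.Println(clog.ListToString(replicas, func(r replica) string { return r.name }, 5))
}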