From d5f0d74ff43f285f667cbb93002fdca4f25f186c Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Wed, 4 Sep 2024 19:05:12 +0200 Subject: [PATCH 1/5] KCP: remove etcd member in pre-terminate hook --- .../v1beta1/kubeadm_control_plane_types.go | 4 + .../kubeadm/internal/control_plane.go | 5 + .../internal/controllers/controller.go | 111 +++++++++++++++++- .../internal/controllers/controller_test.go | 8 +- .../kubeadm/internal/controllers/helpers.go | 3 + .../internal/controllers/helpers_test.go | 4 + .../internal/controllers/remediation.go | 5 - .../kubeadm/internal/controllers/scale.go | 4 - test/e2e/clusterclass_rollout.go | 2 +- util/collections/machine_collection.go | 42 +++++++ 10 files changed, 172 insertions(+), 16 deletions(-) diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go index 1a9ccf073831..51d28b348fd0 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go @@ -65,6 +65,10 @@ const ( // failures in updating remediation retry (the counter restarts from zero). RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for" + // PreTerminateDeleteHookAnnotation is the annotation KCP sets on Machines to ensure it can later remove the + // etcd member right before Machine termination (i.e. before InfraMachine deletion). + PreTerminateDeleteHookAnnotation = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/kubeadmcontrolplane" + // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a // machine unrelated from the previous remediation. DefaultMinHealthyPeriod = 1 * time.Hour diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index bfed636b8326..6fd7a68a0bab 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -164,6 +164,11 @@ func (c *ControlPlane) HasDeletingMachine() bool { return len(c.Machines.Filter(collections.HasDeletionTimestamp)) > 0 } +// DeletingMachines returns machines in the control plane that are in the process of being deleted. +func (c *ControlPlane) DeletingMachines() collections.Machines { + return c.Machines.Filter(collections.HasDeletionTimestamp) +} + // GetKubeadmConfig returns the KubeadmConfig of a given machine. func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.KubeadmConfig, bool) { kubeadmConfig, ok := c.KubeadmConfigs[machineName] diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index 383f9dfda1b5..cbbc694c6a93 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -397,6 +397,10 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl return ctrl.Result{}, err } + if result, err := r.reconcilePreTerminateHook(ctx, controlPlane); err != nil || !result.IsZero() { + return result, err + } + // Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate, // otherwise continue with the other KCP operations. 
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() { @@ -566,11 +570,26 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con machinesToDelete := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)) var errs []error for i := range machinesToDelete { - m := machinesToDelete[i] - logger := log.WithValues("Machine", klog.KObj(m)) - if err := r.Client.Delete(ctx, machinesToDelete[i]); err != nil && !apierrors.IsNotFound(err) { - logger.Error(err, "Failed to cleanup owned machine") - errs = append(errs, err) + machineToDelete := machinesToDelete[i] + log := log.WithValues("Machine", klog.KObj(machineToDelete)) + + // During KCP deletion we don't care about forwarding etcd leadership or removing etcd members. + // So we are removing the pre-terminate hook. + // This is important because after we went through the Machine deletion + // the etcd quorum will be broken and we can't forward etcd leadership or remove etcd members anymore. + // Also in this case the reconcileDelete code of the Machine controller won't execute Node drain + // and wait for volume detach. + log.Info("Removing pre-terminate hook from control plane machine") + deletingMachineOriginal := machineToDelete.DeepCopy() + delete(machineToDelete.Annotations, controlplanev1.PreTerminateDeleteHookAnnotation) + if err := r.Client.Patch(ctx, machineToDelete, client.MergeFrom(deletingMachineOriginal)); err != nil { + errs = append(errs, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(machineToDelete))) + continue + } + + log.Info("Deleting control plane Machine") + if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) { + errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete))) } } if len(errs) > 0 { @@ -791,6 +810,88 @@ func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context return nil } +func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) { + if !controlPlane.HasDeletingMachine() { + return ctrl.Result{}, nil + } + + log := ctrl.LoggerFrom(ctx) + + // Pick the Machine with the oldest deletionTimestamp to keep this function deterministic / reentrant + // so we only remove the pre-terminate hook from one Machine at a time. + deletingMachine := controlPlane.DeletingMachines().OldestDeletionTimestamp() + log = log.WithValues("Machine", klog.KObj(deletingMachine)) + ctx = ctrl.LoggerInto(ctx, log) + + // Return early if there are other pre-terminate hooks for the Machine. + // The KCP pre-terminate hook should be the one executed last, so that kubelet + // is still working while other pre-terminate hooks are run. + if machineHasOtherPreTerminateHooks(deletingMachine) { + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + } + + // Return early if we don't have to run the pre-terminate hook (usually because it was already completed). + // We are now waiting for the Machine to go away. + if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateDeleteHookAnnotation]; !exists { + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + } + + // Return early because the Machine controller is not yet waiting for the pre-terminate hook. 
+ c := conditions.Get(deletingMachine, clusterv1.PreTerminateDeleteHookSucceededCondition) + if c == nil || c.Status != corev1.ConditionFalse || c.Reason != clusterv1.WaitingExternalHookReason { + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + } + + // The following will execute and remove the pre-terminate hook from the Machine. + + // If we have more than 1 Machine and etcd is managed we forward etcd leadership and remove the member + // to keep the etcd cluster healthy. + if controlPlane.Machines.Len() > 1 && controlPlane.IsEtcdManaged() { + workloadCluster, err := controlPlane.GetWorkloadCluster(ctx) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s: failed to create client to workload cluster", klog.KObj(deletingMachine)) + } + + // Note: In regular deletion cases (remediation, scale down) the leader should have been already moved. + // We're doing this again here in case the Machine became leader again or the Machine deletion was + // triggered in another way (e.g. a user running kubectl delete machine) + etcdLeaderCandidate := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)).Newest() + if etcdLeaderCandidate != nil { + if err := workloadCluster.ForwardEtcdLeadership(ctx, deletingMachine, etcdLeaderCandidate); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to move leadership to candidate Machine %s", etcdLeaderCandidate.Name) + } + } else { + log.Info("Skip forwarding etcd leadership, because there is no other control plane Machine without a deletionTimestamp") + } + + // Note: Removing the etcd member will lead to the etcd and the kube-apiserver Pod on the Machine shutting down. + // If ControlPlaneKubeletLocalMode is used, the kubelet is communicating with the local apiserver and thus now + // won't be able to see any updates to e.g. Pods anymore. 
+ if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, deletingMachine); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s", klog.KObj(deletingMachine)) + } + } + + log.Info("Removing pre-terminate hook from control plane Machine") + deletingMachineOriginal := deletingMachine.DeepCopy() + delete(deletingMachine.Annotations, controlplanev1.PreTerminateDeleteHookAnnotation) + if err := r.Client.Patch(ctx, deletingMachine, client.MergeFrom(deletingMachineOriginal)); err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(deletingMachine)) + } + + log.Info("Waiting for Machines to be deleted", "machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", ")) + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil +} + +func machineHasOtherPreTerminateHooks(machine *clusterv1.Machine) bool { + for k := range machine.Annotations { + if strings.HasPrefix(k, clusterv1.PreTerminateDeleteHookAnnotationPrefix) && k != controlplanev1.PreTerminateDeleteHookAnnotation { + return true + } + } + return false +} + func (r *KubeadmControlPlaneReconciler) reconcileCertificateExpiries(ctx context.Context, controlPlane *internal.ControlPlane) error { log := ctrl.LoggerFrom(ctx) diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index 7376556a2bdf..300b2f1c55e0 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -1751,7 +1751,13 @@ func TestKubeadmControlPlaneReconciler_syncMachines(t *testing.T) { // Verify Labels g.Expect(updatedInplaceMutatingMachine.Labels).Should(Equal(expectedLabels)) // Verify Annotations - g.Expect(updatedInplaceMutatingMachine.Annotations).Should(Equal(kcp.Spec.MachineTemplate.ObjectMeta.Annotations)) + expectedAnnotations := map[string]string{} + for k, v := range kcp.Spec.MachineTemplate.ObjectMeta.Annotations { + expectedAnnotations[k] = v + } + // The pre-terminate annotation should always be added + expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" + g.Expect(updatedInplaceMutatingMachine.Annotations).Should(Equal(expectedAnnotations)) // Verify Node timeout values g.Expect(updatedInplaceMutatingMachine.Spec.NodeDrainTimeout).Should(And( Not(BeNil()), diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index f7e3a93f758d..5f1e3617faba 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -386,6 +386,9 @@ func (r *KubeadmControlPlaneReconciler) computeDesiredMachine(kcp *controlplanev annotations[controlplanev1.RemediationForAnnotation] = remediationData } } + // Setting pre-terminate hook so we can later remove the etcd member right before Machine termination + // (i.e. before InfraMachine deletion). + annotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" // Construct the basic Machine. 
desiredMachine := &clusterv1.Machine{ diff --git a/controlplane/kubeadm/internal/controllers/helpers_test.go b/controlplane/kubeadm/internal/controllers/helpers_test.go index eb5d8ffb6539..28300075072d 100644 --- a/controlplane/kubeadm/internal/controllers/helpers_test.go +++ b/controlplane/kubeadm/internal/controllers/helpers_test.go @@ -562,6 +562,8 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) { expectedAnnotations[k] = v } expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = clusterConfigurationString + // The pre-terminate annotation should always be added + expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" g.Expect(createdMachine.Annotations).To(Equal(expectedAnnotations)) // Verify that machineTemplate.ObjectMeta in KCP has not been modified. @@ -646,6 +648,8 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) { } expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = existingClusterConfigurationString expectedAnnotations[controlplanev1.RemediationForAnnotation] = remediationData + // The pre-terminate annotation should always be added + expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" g.Expect(updatedMachine.Annotations).To(Equal(expectedAnnotations)) // Verify that machineTemplate.ObjectMeta in KCP has not been modified. diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 52404d973651..08c246724b39 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -224,11 +224,6 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) return ctrl.Result{}, err } - if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil { - log.Error(err, "Failed to remove etcd member for machine") - conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, err - } } parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 43bcb37d87cc..5e5370f08150 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -135,10 +135,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name) return ctrl.Result{}, err } - if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToDelete); err != nil { - logger.Error(err, "Failed to remove etcd member for machine") - return ctrl.Result{}, err - } } parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) diff --git a/test/e2e/clusterclass_rollout.go b/test/e2e/clusterclass_rollout.go index 579ccfe3c799..0397559a298d 100644 --- a/test/e2e/clusterclass_rollout.go +++ b/test/e2e/clusterclass_rollout.go @@ -465,7 +465,7 @@ func assertControlPlaneMachines(g Gomega, clusterObjects clusterObjects, cluster expectMapsToBeEquivalent(g, union( machineMetadata.Annotations, - 
).without(g, controlplanev1.KubeadmClusterConfigurationAnnotation), + ).without(g, controlplanev1.KubeadmClusterConfigurationAnnotation, controlplanev1.PreTerminateDeleteHookAnnotation), controlPlaneMachineTemplateMetadata.Annotations, ) diff --git a/util/collections/machine_collection.go b/util/collections/machine_collection.go index 83a50ad9433d..10ecb48d690e 100644 --- a/util/collections/machine_collection.go +++ b/util/collections/machine_collection.go @@ -68,6 +68,30 @@ func (o machinesByCreationTimestamp) Less(i, j int) bool { return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp) } +// machinesByDeletionTimestamp sorts a list of Machine by creation timestamp, using their names as a tie breaker. +type machinesByDeletionTimestamp []*clusterv1.Machine + +func (o machinesByDeletionTimestamp) Len() int { return len(o) } +func (o machinesByDeletionTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] } +func (o machinesByDeletionTimestamp) Less(i, j int) bool { + if o[i].DeletionTimestamp == nil && o[j].DeletionTimestamp == nil { + return o[i].Name < o[j].Name + } + + if o[i].DeletionTimestamp == nil { + return false + } + + if o[j].DeletionTimestamp == nil { + return true + } + + if o[i].DeletionTimestamp.Equal(o[j].DeletionTimestamp) { + return o[i].Name < o[j].Name + } + return o[i].DeletionTimestamp.Before(o[j].DeletionTimestamp) +} + // New creates an empty Machines. func New() Machines { return make(Machines) @@ -126,6 +150,16 @@ func (s Machines) SortedByCreationTimestamp() []*clusterv1.Machine { return res } +// SortedByDeletionTimestamp returns the machines sorted by deletion timestamp. +func (s Machines) SortedByDeletionTimestamp() []*clusterv1.Machine { + res := make(machinesByDeletionTimestamp, 0, len(s)) + for _, value := range s { + res = append(res, value) + } + sort.Sort(res) + return res +} + // UnsortedList returns the slice with contents in random order. func (s Machines) UnsortedList() []*clusterv1.Machine { res := make([]*clusterv1.Machine, 0, len(s)) @@ -178,6 +212,14 @@ func (s Machines) Newest() *clusterv1.Machine { return s.SortedByCreationTimestamp()[len(s)-1] } +// OldestDeletionTimestamp returns the Machine with the oldest DeletionTimestamp. +func (s Machines) OldestDeletionTimestamp() *clusterv1.Machine { + if len(s) == 0 { + return nil + } + return s.SortedByDeletionTimestamp()[0] +} + // DeepCopy returns a deep copy. 
func (s Machines) DeepCopy() Machines { result := make(Machines, len(s)) From 2740c2604b65b137ebbbd9864ca25ec1262b15ca Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Thu, 5 Sep 2024 14:17:36 +0200 Subject: [PATCH 2/5] fix review findings & add unit tests --- .../v1beta1/kubeadm_control_plane_types.go | 6 ++-- .../internal/controllers/controller.go | 36 +++++++++++++------ .../internal/controllers/controller_test.go | 24 ++++++++++--- .../kubeadm/internal/controllers/helpers.go | 2 +- .../internal/controllers/helpers_test.go | 4 +-- .../internal/controllers/remediation.go | 2 ++ .../kubeadm/internal/controllers/scale.go | 2 ++ test/e2e/clusterclass_rollout.go | 2 +- util/collections/machine_collection.go | 3 +- util/collections/machine_collection_test.go | 32 +++++++++++++---- 10 files changed, 85 insertions(+), 28 deletions(-) diff --git a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go index 51d28b348fd0..88f5395a34c1 100644 --- a/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go +++ b/controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go @@ -65,9 +65,11 @@ const ( // failures in updating remediation retry (the counter restarts from zero). RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for" - // PreTerminateDeleteHookAnnotation is the annotation KCP sets on Machines to ensure it can later remove the + // PreTerminateHookCleanupAnnotation is the annotation KCP sets on Machines to ensure it can later remove the // etcd member right before Machine termination (i.e. before InfraMachine deletion). - PreTerminateDeleteHookAnnotation = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/kubeadmcontrolplane" + // Note: Starting with Kubernetes v1.31 this hook will wait for all other pre-terminate hooks to finish to + // ensure it runs last (thus ensuring that kubelet is still working while other pre-terminate hooks run). + PreTerminateHookCleanupAnnotation = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/kcp-cleanup" // DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a // machine unrelated from the previous remediation. diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index cbbc694c6a93..c9fe1ae87cb4 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -569,19 +569,19 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con // Delete control plane machines in parallel machinesToDelete := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)) var errs []error - for i := range machinesToDelete { - machineToDelete := machinesToDelete[i] + for _, machineToDelete := range machinesToDelete { log := log.WithValues("Machine", klog.KObj(machineToDelete)) + ctx := ctrl.LoggerInto(ctx, log) // During KCP deletion we don't care about forwarding etcd leadership or removing etcd members. // So we are removing the pre-terminate hook. - // This is important because after we went through the Machine deletion - // the etcd quorum will be broken and we can't forward etcd leadership or remove etcd members anymore. + // This is important because when deleting KCP we will delete all members of etcd and it's not possible + // to forward etcd leadership without any member left after we went through the Machine deletion. 
// Also in this case the reconcileDelete code of the Machine controller won't execute Node drain // and wait for volume detach. log.Info("Removing pre-terminate hook from control plane machine") deletingMachineOriginal := machineToDelete.DeepCopy() - delete(machineToDelete.Annotations, controlplanev1.PreTerminateDeleteHookAnnotation) + delete(machineToDelete.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation) if err := r.Client.Patch(ctx, machineToDelete, client.MergeFrom(deletingMachineOriginal)); err != nil { errs = append(errs, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(machineToDelete))) continue @@ -817,22 +817,38 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co log := ctrl.LoggerFrom(ctx) + // Return early, if there is already a deleting Machine without the pre-terminate hook. + // We are going to wait until this Machine goes away before running the pre-terminate hook on other Machines. + for _, deletingMachine := range controlPlane.DeletingMachines() { + if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation]; !exists { + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + } + } + // Pick the Machine with the oldest deletionTimestamp to keep this function deterministic / reentrant // so we only remove the pre-terminate hook from one Machine at a time. deletingMachine := controlPlane.DeletingMachines().OldestDeletionTimestamp() log = log.WithValues("Machine", klog.KObj(deletingMachine)) ctx = ctrl.LoggerInto(ctx, log) + parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to parse Kubernetes version %q", controlPlane.KCP.Spec.Version) + } + // Return early if there are other pre-terminate hooks for the Machine. // The KCP pre-terminate hook should be the one executed last, so that kubelet // is still working while other pre-terminate hooks are run. - if machineHasOtherPreTerminateHooks(deletingMachine) { - return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + // Note: This is done only for Kubernetes >= v1.31 to reduce the blast radius of this check. + if version.Compare(parsedVersion, semver.MustParse("1.31.0"), version.WithoutPreReleases()) >= 0 { + if machineHasOtherPreTerminateHooks(deletingMachine) { + return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil + } } // Return early if we don't have to run the pre-terminate hook (usually because it was already completed). // We are now waiting for the Machine to go away. 
- if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateDeleteHookAnnotation]; !exists { + if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation]; !exists { return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil } @@ -874,7 +890,7 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co log.Info("Removing pre-terminate hook from control plane Machine") deletingMachineOriginal := deletingMachine.DeepCopy() - delete(deletingMachine.Annotations, controlplanev1.PreTerminateDeleteHookAnnotation) + delete(deletingMachine.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation) if err := r.Client.Patch(ctx, deletingMachine, client.MergeFrom(deletingMachineOriginal)); err != nil { return ctrl.Result{}, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(deletingMachine)) } @@ -885,7 +901,7 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co func machineHasOtherPreTerminateHooks(machine *clusterv1.Machine) bool { for k := range machine.Annotations { - if strings.HasPrefix(k, clusterv1.PreTerminateDeleteHookAnnotationPrefix) && k != controlplanev1.PreTerminateDeleteHookAnnotation { + if strings.HasPrefix(k, clusterv1.PreTerminateDeleteHookAnnotationPrefix) && k != controlplanev1.PreTerminateHookCleanupAnnotation { return true } } diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index 300b2f1c55e0..ae157788427e 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -1756,7 +1756,7 @@ func TestKubeadmControlPlaneReconciler_syncMachines(t *testing.T) { expectedAnnotations[k] = v } // The pre-terminate annotation should always be added - expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" + expectedAnnotations[controlplanev1.PreTerminateHookCleanupAnnotation] = "" g.Expect(updatedInplaceMutatingMachine.Annotations).Should(Equal(expectedAnnotations)) // Verify Node timeout values g.Expect(updatedInplaceMutatingMachine.Spec.NodeDrainTimeout).Should(And( @@ -2108,6 +2108,9 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) { machines := collections.New() for i := range 3 { m, _ := createMachineNodePair(fmt.Sprintf("test-%d", i), cluster, kcp, true) + m.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation] = "" + // Note: Block deletion so we can later verify the pre-terminate hook was removed + m.Finalizers = []string{"cluster.x-k8s.io/block-deletion"} initObjs = append(initObjs, m) machines.Insert(m) } @@ -2138,6 +2141,18 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) { controlPlaneMachines := clusterv1.MachineList{} g.Expect(fakeClient.List(ctx, &controlPlaneMachines)).To(Succeed()) + for _, machine := range controlPlaneMachines.Items { + // Verify pre-terminate hook was removed + g.Expect(machine.Annotations).ToNot(HaveKey(controlplanev1.PreTerminateHookCleanupAnnotation)) + + // Remove finalizer + originalMachine := machine.DeepCopy() + machine.Finalizers = []string{} + g.Expect(fakeClient.Patch(ctx, &machine, client.MergeFrom(originalMachine))).To(Succeed()) + } + + // Verify all Machines are gone. 
+ g.Expect(fakeClient.List(ctx, &controlPlaneMachines)).To(Succeed()) g.Expect(controlPlaneMachines.Items).To(BeEmpty()) controlPlane = &internal.ControlPlane{ @@ -2411,9 +2426,10 @@ func createMachineNodePair(name string, cluster *clusterv1.Cluster, kcp *control APIVersion: clusterv1.GroupVersion.String(), }, ObjectMeta: metav1.ObjectMeta{ - Namespace: cluster.Namespace, - Name: name, - Labels: internal.ControlPlaneMachineLabelsForCluster(kcp, cluster.Name), + Namespace: cluster.Namespace, + Name: name, + Labels: internal.ControlPlaneMachineLabelsForCluster(kcp, cluster.Name), + Annotations: map[string]string{}, OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(kcp, controlplanev1.GroupVersion.WithKind("KubeadmControlPlane")), }, diff --git a/controlplane/kubeadm/internal/controllers/helpers.go b/controlplane/kubeadm/internal/controllers/helpers.go index 5f1e3617faba..98dcfd0d6ba8 100644 --- a/controlplane/kubeadm/internal/controllers/helpers.go +++ b/controlplane/kubeadm/internal/controllers/helpers.go @@ -388,7 +388,7 @@ func (r *KubeadmControlPlaneReconciler) computeDesiredMachine(kcp *controlplanev } // Setting pre-terminate hook so we can later remove the etcd member right before Machine termination // (i.e. before InfraMachine deletion). - annotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" + annotations[controlplanev1.PreTerminateHookCleanupAnnotation] = "" // Construct the basic Machine. desiredMachine := &clusterv1.Machine{ diff --git a/controlplane/kubeadm/internal/controllers/helpers_test.go b/controlplane/kubeadm/internal/controllers/helpers_test.go index 28300075072d..28108cf9b81b 100644 --- a/controlplane/kubeadm/internal/controllers/helpers_test.go +++ b/controlplane/kubeadm/internal/controllers/helpers_test.go @@ -563,7 +563,7 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) { } expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = clusterConfigurationString // The pre-terminate annotation should always be added - expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" + expectedAnnotations[controlplanev1.PreTerminateHookCleanupAnnotation] = "" g.Expect(createdMachine.Annotations).To(Equal(expectedAnnotations)) // Verify that machineTemplate.ObjectMeta in KCP has not been modified. @@ -649,7 +649,7 @@ func TestKubeadmControlPlaneReconciler_computeDesiredMachine(t *testing.T) { expectedAnnotations[controlplanev1.KubeadmClusterConfigurationAnnotation] = existingClusterConfigurationString expectedAnnotations[controlplanev1.RemediationForAnnotation] = remediationData // The pre-terminate annotation should always be added - expectedAnnotations[controlplanev1.PreTerminateDeleteHookAnnotation] = "" + expectedAnnotations[controlplanev1.PreTerminateHookCleanupAnnotation] = "" g.Expect(updatedMachine.Annotations).To(Equal(expectedAnnotations)) // Verify that machineTemplate.ObjectMeta in KCP has not been modified. 
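For context on the hook contract that the remediation.go and scale.go changes below rely on: the Machine controller pauses a deleting Machine after Node drain and volume detach, and before InfraMachine deletion, for as long as any annotation with the pre-terminate hook prefix is present; KCP uses that pause to remove the etcd member and then drops its own hook. Below is a minimal, self-contained Go sketch of the gating predicate. It assumes clusterv1.PreTerminateDeleteHookAnnotationPrefix resolves to "pre-terminate.delete.hook.machine.cluster.x-k8s.io"; the helper name mayRunKCPCleanup is illustrative and not part of this patch.

package main

import (
	"fmt"
	"strings"
)

const (
	// Assumed value of clusterv1.PreTerminateDeleteHookAnnotationPrefix.
	preTerminatePrefix = "pre-terminate.delete.hook.machine.cluster.x-k8s.io"
	// The hook this series introduces, i.e. controlplanev1.PreTerminateHookCleanupAnnotation.
	kcpCleanupHook = preTerminatePrefix + "/kcp-cleanup"
)

// mayRunKCPCleanup reports whether KCP should run its etcd cleanup now:
// its own hook must still be present and no other pre-terminate hook may
// remain, so that kubelet keeps working while the other hooks execute.
func mayRunKCPCleanup(annotations map[string]string) bool {
	if _, ok := annotations[kcpCleanupHook]; !ok {
		return false // hook already removed, nothing left to do
	}
	for k := range annotations {
		if strings.HasPrefix(k, preTerminatePrefix) && k != kcpCleanupHook {
			return false // another controller's hook is still pending
		}
	}
	return true
}

func main() {
	fmt.Println(mayRunKCPCleanup(map[string]string{kcpCleanupHook: ""}))                                 // true
	fmt.Println(mayRunKCPCleanup(map[string]string{kcpCleanupHook: "", preTerminatePrefix + "/lb": ""})) // false: other hook pending
	fmt.Println(mayRunKCPCleanup(nil))                                                                   // false: hook already removed
}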
diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 08c246724b39..7b648441803b 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -224,6 +224,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error()) return ctrl.Result{}, err } + + // NOTE: etcd member removal will be performed by the kcp-cleanup hook after machine completes drain & all volumes are detached. } parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 5e5370f08150..304ba9855f87 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -135,6 +135,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name) return ctrl.Result{}, err } + + // NOTE: etcd member removal will be performed by the kcp-cleanup hook after machine completes drain & all volumes are detached. } parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version) diff --git a/test/e2e/clusterclass_rollout.go b/test/e2e/clusterclass_rollout.go index 0397559a298d..84a1233292b2 100644 --- a/test/e2e/clusterclass_rollout.go +++ b/test/e2e/clusterclass_rollout.go @@ -465,7 +465,7 @@ func assertControlPlaneMachines(g Gomega, clusterObjects clusterObjects, cluster expectMapsToBeEquivalent(g, union( machineMetadata.Annotations, - ).without(g, controlplanev1.KubeadmClusterConfigurationAnnotation, controlplanev1.PreTerminateDeleteHookAnnotation), + ).without(g, controlplanev1.KubeadmClusterConfigurationAnnotation, controlplanev1.PreTerminateHookCleanupAnnotation), controlPlaneMachineTemplateMetadata.Annotations, ) diff --git a/util/collections/machine_collection.go b/util/collections/machine_collection.go index 10ecb48d690e..a8c686a8a979 100644 --- a/util/collections/machine_collection.go +++ b/util/collections/machine_collection.go @@ -68,7 +68,8 @@ func (o machinesByCreationTimestamp) Less(i, j int) bool { return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp) } -// machinesByDeletionTimestamp sorts a list of Machine by creation timestamp, using their names as a tie breaker. +// machinesByDeletionTimestamp sorts a list of Machines by deletion timestamp, using their names as a tie breaker. +// Machines without DeletionTimestamp go after machines with this field set. 
 type machinesByDeletionTimestamp []*clusterv1.Machine
 
 func (o machinesByDeletionTimestamp) Len() int { return len(o) }
diff --git a/util/collections/machine_collection_test.go b/util/collections/machine_collection_test.go
index df6b14c10b82..f1470671731c 100644
--- a/util/collections/machine_collection_test.go
+++ b/util/collections/machine_collection_test.go
@@ -29,7 +29,7 @@ import (
 )
 
 func TestMachineCollection(t *testing.T) {
-	t.Run("SortedByAge", func(t *testing.T) {
+	t.Run("SortedByCreationTimestamp", func(t *testing.T) {
 		t.Run("should return the same number of machines as are in the collection", func(t *testing.T) {
 			g := NewWithT(t)
 			collection := machines()
@@ -37,6 +37,23 @@ func TestMachineCollection(t *testing.T) {
 			g.Expect(sortedMachines).To(HaveLen(len(collection)))
 			g.Expect(sortedMachines[0].Name).To(Equal("machine-1"))
 			g.Expect(sortedMachines[len(sortedMachines)-1].Name).To(Equal("machine-5"))
+			g.Expect(collection.Oldest().Name).To(Equal("machine-1"))
+		})
+	})
+	t.Run("SortedByDeletionTimestamp", func(t *testing.T) {
+		t.Run("should return the same number of machines as are in the collection", func(t *testing.T) {
+			g := NewWithT(t)
+			collection := machines()
+			// Adding Machines without deletionTimestamp.
+			collection["machine-6"] = machine("machine-6")
+			collection["machine-7"] = machine("machine-7")
+			collection["machine-8"] = machine("machine-8")
+
+			sortedMachines := collection.SortedByDeletionTimestamp()
+			g.Expect(sortedMachines).To(HaveLen(len(collection)))
+			g.Expect(sortedMachines[0].Name).To(Equal("machine-1"))
+			g.Expect(sortedMachines[len(sortedMachines)-1].Name).To(Equal("machine-8"))
+			g.Expect(collection.OldestDeletionTimestamp().Name).To(Equal("machine-1"))
 		})
 	})
 	t.Run("Difference", func(t *testing.T) {
@@ -146,9 +163,10 @@ func TestMachinesLowestVersion(t *testing.T) {
 
 type machineOpt func(*clusterv1.Machine)
 
-func withCreationTimestamp(timestamp metav1.Time) machineOpt {
+func withTimestamps(timestamp metav1.Time) machineOpt {
 	return func(m *clusterv1.Machine) {
 		m.CreationTimestamp = timestamp
+		m.DeletionTimestamp = &timestamp
 	}
 }
 
@@ -166,10 +184,10 @@ func machine(name string, opts ...machineOpt) *clusterv1.Machine {
 
 func machines() collections.Machines {
 	return collections.Machines{
-		"machine-4": machine("machine-4", withCreationTimestamp(metav1.Time{Time: time.Date(2018, 04, 02, 03, 04, 05, 06, time.UTC)})),
-		"machine-5": machine("machine-5", withCreationTimestamp(metav1.Time{Time: time.Date(2018, 05, 02, 03, 04, 05, 06, time.UTC)})),
-		"machine-2": machine("machine-2", withCreationTimestamp(metav1.Time{Time: time.Date(2018, 02, 02, 03, 04, 05, 06, time.UTC)})),
-		"machine-1": machine("machine-1", withCreationTimestamp(metav1.Time{Time: time.Date(2018, 01, 02, 03, 04, 05, 06, time.UTC)})),
-		"machine-3": machine("machine-3", withCreationTimestamp(metav1.Time{Time: time.Date(2018, 03, 02, 03, 04, 05, 06, time.UTC)})),
+		"machine-4": machine("machine-4", withTimestamps(metav1.Time{Time: time.Date(2018, 04, 02, 03, 04, 05, 06, time.UTC)})),
+		"machine-5": machine("machine-5", withTimestamps(metav1.Time{Time: time.Date(2018, 05, 02, 03, 04, 05, 06, time.UTC)})),
+		"machine-2": machine("machine-2", withTimestamps(metav1.Time{Time: time.Date(2018, 02, 02, 03, 04, 05, 06, time.UTC)})),
+		"machine-1": machine("machine-1", withTimestamps(metav1.Time{Time: time.Date(2018, 01, 02, 03, 04, 05, 06, time.UTC)})),
+		"machine-3": machine("machine-3", withTimestamps(metav1.Time{Time: time.Date(2018, 03, 02, 03, 04, 05, 06, time.UTC)})),
 	}
 }
From 
e1ef276a6a55237466db2359d1dbeeb050e49720 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Thu, 5 Sep 2024 15:37:24 +0200 Subject: [PATCH 3/5] Add unit tests for reconcilePreTerminateHook --- .../internal/controllers/controller.go | 6 - .../internal/controllers/controller_test.go | 386 +++++++++++++++++- .../internal/controllers/fakes_test.go | 37 +- .../internal/controllers/remediation_test.go | 68 +-- .../internal/controllers/scale_test.go | 12 +- .../internal/controllers/status_test.go | 10 +- .../internal/controllers/upgrade_test.go | 6 +- 7 files changed, 443 insertions(+), 82 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index c9fe1ae87cb4..2dd618d220fb 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -846,12 +846,6 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co } } - // Return early if we don't have to run the pre-terminate hook (usually because it was already completed). - // We are now waiting for the Machine to go away. - if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation]; !exists { - return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil - } - // Return early because the Machine controller is not yet waiting for the pre-terminate hook. c := conditions.Get(deletingMachine, clusterv1.PreTerminateDeleteHookSucceededCondition) if c == nil || c.Status != corev1.ConditionFalse || c.Reason != clusterv1.WaitingExternalHookReason { diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index ae157788427e..b645c0590c41 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -450,7 +450,7 @@ func TestReconcileClusterNoEndpoints(t *testing.T) { recorder: record.NewFakeRecorder(32), managementCluster: &fakeManagementCluster{ Management: &internal.Management{Client: fakeClient}, - Workload: fakeWorkloadCluster{}, + Workload: &fakeWorkloadCluster{}, }, } @@ -493,7 +493,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { fmc := &fakeManagementCluster{ Machines: collections.Machines{}, - Workload: fakeWorkloadCluster{}, + Workload: &fakeWorkloadCluster{}, } objs := []client.Object{builder.GenericInfrastructureMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), tmpl.DeepCopy()} for i := range 3 { @@ -561,7 +561,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { fmc := &fakeManagementCluster{ Machines: collections.Machines{}, - Workload: fakeWorkloadCluster{}, + Workload: &fakeWorkloadCluster{}, } objs := []client.Object{builder.GenericInfrastructureMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), tmpl.DeepCopy()} for i := range 3 { @@ -676,7 +676,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { fmc := &fakeManagementCluster{ Machines: collections.Machines{}, - Workload: fakeWorkloadCluster{}, + Workload: &fakeWorkloadCluster{}, } objs := []client.Object{builder.GenericInfrastructureMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), tmpl.DeepCopy()} for i := range 3 { @@ -756,7 +756,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) { }, }, }, - Workload: fakeWorkloadCluster{}, + Workload: &fakeWorkloadCluster{}, } fakeClient := 
newFakeClient(builder.GenericInfrastructureMachineTemplateCRD, cluster.DeepCopy(), kcp.DeepCopy(), tmpl.DeepCopy(), fmc.Machines["test0"].DeepCopy())
@@ -1133,7 +1133,7 @@ func TestReconcileCertificateExpiries(t *testing.T) {
 	)
 
 	managementCluster := &fakeManagementCluster{
-		Workload: fakeWorkloadCluster{
+		Workload: &fakeWorkloadCluster{
 			APIServerCertificateExpiry: &detectedExpiry,
 		},
 	}
@@ -1321,7 +1321,7 @@ kubernetesVersion: metav1.16.1
 		recorder: record.NewFakeRecorder(32),
 		managementCluster: &fakeManagementCluster{
 			Management: &internal.Management{Client: env},
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Workload: &internal.Workload{
 					Client: env,
 				},
@@ -1330,7 +1330,7 @@ kubernetesVersion: metav1.16.1
 		},
 		managementClusterUncached: &fakeManagementCluster{
 			Management: &internal.Management{Client: env},
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Workload: &internal.Workload{
 					Client: env,
 				},
@@ -1826,6 +1826,368 @@ func TestKubeadmControlPlaneReconciler_syncMachines(t *testing.T) {
 	g.Expect(updatedDeletingMachine.Spec).Should(BeComparableTo(deletingMachine.Spec))
 }
 
+func TestKubeadmControlPlaneReconciler_reconcilePreTerminateHook(t *testing.T) {
+	cluster := &clusterv1.Cluster{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "cluster",
+		},
+	}
+	machine := &clusterv1.Machine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "machine",
+			Annotations: map[string]string{
+				controlplanev1.PreTerminateHookCleanupAnnotation: "",
+			},
+		},
+	}
+	deletingMachine := &clusterv1.Machine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "deleting-machine",
+			DeletionTimestamp: &metav1.Time{Time: time.Now()},
+			Finalizers:        []string{clusterv1.MachineFinalizer},
+		},
+	}
+	deletingMachineWithKCPPreTerminateHook := &clusterv1.Machine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "deleting-machine-with-kcp-pre-terminate-hook",
+			DeletionTimestamp: &metav1.Time{Time: time.Now()},
+			Finalizers:        []string{clusterv1.MachineFinalizer},
+			Annotations: map[string]string{
+				controlplanev1.PreTerminateHookCleanupAnnotation: "",
+			},
+		},
+	}
+	deletingMachineWithKCPAndOtherPreTerminateHooksOld := &clusterv1.Machine{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "deleting-machine-with-kcp-and-other-pre-terminate-hooks",
+			DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-1 * time.Duration(1) * time.Minute)},
+			Finalizers:        []string{clusterv1.MachineFinalizer},
+			Annotations: map[string]string{
+				controlplanev1.PreTerminateHookCleanupAnnotation:           "",
+				clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/test": "",
+			},
+		},
+	}
+
+	tests := []struct {
+		name                                 string
+		controlPlane                         *internal.ControlPlane
+		wantResult                           ctrl.Result
+		wantErr                              string
+		wantForwardEtcdLeadershipCalled      int
+		wantRemoveEtcdMemberForMachineCalled int
+		wantMachineAnnotations               map[string]map[string]string
+	}{
+		{
+			name: "Do nothing if there are no deleting Machines",
+			controlPlane: &internal.ControlPlane{
+				Machines: collections.Machines{
+					machine.Name: machine,
+				},
+			},
+			wantResult: ctrl.Result{},
+			// Annotations are unchanged.
+			wantMachineAnnotations: map[string]map[string]string{
+				machine.Name: machine.Annotations,
+			},
+		},
+		{
+			name: "Requeue, if there is a deleting Machine without the KCP pre-terminate hook",
+			controlPlane: &internal.ControlPlane{
+				Machines: collections.Machines{
+					deletingMachine.Name: deletingMachine, // Does not have the pre-terminate hook anymore.
+					deletingMachineWithKCPPreTerminateHook.Name:             deletingMachineWithKCPPreTerminateHook,
+					deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name: deletingMachineWithKCPAndOtherPreTerminateHooksOld,
+				},
+			},
+			wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			// Annotations are unchanged.
+			wantMachineAnnotations: map[string]map[string]string{
+				deletingMachine.Name:                                     deletingMachine.Annotations,
+				deletingMachineWithKCPPreTerminateHook.Name:              deletingMachineWithKCPPreTerminateHook.Annotations,
+				deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name:  deletingMachineWithKCPAndOtherPreTerminateHooksOld.Annotations,
+			},
+		},
+		{
+			name: "Requeue, if the oldest deleting Machine has other pre-terminate hooks with Kubernetes 1.31",
+			controlPlane: &internal.ControlPlane{
+				KCP: &controlplanev1.KubeadmControlPlane{
+					Spec: controlplanev1.KubeadmControlPlaneSpec{
+						Version: "v1.31.0",
+					},
+				},
+				Machines: collections.Machines{
+					deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name: deletingMachineWithKCPAndOtherPreTerminateHooksOld,
+				},
+			},
+			wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			// Annotations are unchanged.
+			wantMachineAnnotations: map[string]map[string]string{
+				deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name: deletingMachineWithKCPAndOtherPreTerminateHooksOld.Annotations,
+			},
+		},
+		{
+			name: "Requeue, if the deleting Machine has no PreTerminateDeleteHookSucceeded condition",
+			controlPlane: &internal.ControlPlane{
+				KCP: &controlplanev1.KubeadmControlPlane{
+					Spec: controlplanev1.KubeadmControlPlaneSpec{
+						Version: "v1.31.0",
+					},
+				},
+				Machines: collections.Machines{
+					deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook,
+				},
+			},
+			wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			// Annotations are unchanged.
+			wantMachineAnnotations: map[string]map[string]string{
+				deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook.Annotations,
+			},
+		},
+		{
+			name: "Requeue, if the deleting Machine has PreTerminateDeleteHookSucceeded condition true",
+			controlPlane: &internal.ControlPlane{
+				KCP: &controlplanev1.KubeadmControlPlane{
+					Spec: controlplanev1.KubeadmControlPlaneSpec{
+						Version: "v1.31.0",
+					},
+				},
+				Machines: collections.Machines{
+					deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine {
+						m := deletingMachineWithKCPPreTerminateHook.DeepCopy()
+						conditions.MarkTrue(m, clusterv1.PreTerminateDeleteHookSucceededCondition)
+						return m
+					}(),
+				},
+			},
+			wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			// Annotations are unchanged.
+			wantMachineAnnotations: map[string]map[string]string{
+				deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook.Annotations,
+			},
+		},
+		{
+			name: "Requeue, if the deleting Machine has PreTerminateDeleteHookSucceeded condition false but not waiting for hook",
+			controlPlane: &internal.ControlPlane{
+				KCP: &controlplanev1.KubeadmControlPlane{
+					Spec: controlplanev1.KubeadmControlPlaneSpec{
+						Version: "v1.31.0",
+					},
+				},
+				Machines: collections.Machines{
+					deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine {
+						m := deletingMachineWithKCPPreTerminateHook.DeepCopy()
+						conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, "some-other-reason", clusterv1.ConditionSeverityInfo, "some message")
+						return m
+					}(),
+				},
+			},
+			wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			// Annotations are unchanged.
+ wantMachineAnnotations: map[string]map[string]string{ + deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook.Annotations, + }, + }, + { + name: "Forward etcd leadership, remove member and remove pre-terminate hook if > 1 CP Machines && Etcd is managed", + controlPlane: &internal.ControlPlane{ + Cluster: cluster, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + }, + }, + Machines: collections.Machines{ + machine.Name: machine, // Leadership will be forwarded to this Machine. + deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine { + m := deletingMachineWithKCPPreTerminateHook.DeepCopy() + conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message") + return m + }(), + }, + }, + wantForwardEtcdLeadershipCalled: 1, + wantRemoveEtcdMemberForMachineCalled: 1, + wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter}, + wantMachineAnnotations: map[string]map[string]string{ + machine.Name: machine.Annotations, // unchanged + deletingMachineWithKCPPreTerminateHook.Name: nil, // pre-terminate hook has been removed + }, + }, + { + name: "Skip forward etcd leadership (no other non-deleting Machine), remove member and remove pre-terminate hook if > 1 CP Machines && Etcd is managed", + controlPlane: &internal.ControlPlane{ + Cluster: cluster, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + }, + }, + Machines: collections.Machines{ + deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine { + m := deletingMachineWithKCPPreTerminateHook.DeepCopy() + m.DeletionTimestamp.Time = m.DeletionTimestamp.Add(-1 * time.Duration(1) * time.Second) // Make sure this (the oldest) Machine is selected to run the pre-terminate hook. + conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message") + return m + }(), + deletingMachineWithKCPPreTerminateHook.Name + "-2": func() *clusterv1.Machine { + m := deletingMachineWithKCPPreTerminateHook.DeepCopy() + m.Name += "-2" + conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message") + return m + }(), + }, + }, + wantForwardEtcdLeadershipCalled: 0, // skipped as there is no non-deleting Machine to forward to. 
+ wantRemoveEtcdMemberForMachineCalled: 1, + wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter}, + wantMachineAnnotations: map[string]map[string]string{ + deletingMachineWithKCPPreTerminateHook.Name: nil, // pre-terminate hook has been removed + deletingMachineWithKCPPreTerminateHook.Name + "-2": deletingMachineWithKCPPreTerminateHook.Annotations, // unchanged + }, + }, + { + name: "Skip forward etcd leadership, skip remove member and remove pre-terminate hook if 1 CP Machine && Etcd is managed", + controlPlane: &internal.ControlPlane{ + Cluster: cluster, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + }, + }, + Machines: collections.Machines{ + deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine { + m := deletingMachineWithKCPPreTerminateHook.DeepCopy() + conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message") + return m + }(), + }, + }, + wantForwardEtcdLeadershipCalled: 0, // skipped + wantRemoveEtcdMemberForMachineCalled: 0, // skipped + wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter}, + wantMachineAnnotations: map[string]map[string]string{ + deletingMachineWithKCPPreTerminateHook.Name: nil, // pre-terminate hook has been removed + }, + }, + { + name: "Skip forward etcd leadership, skip remove member and remove pre-terminate hook if > 1 CP Machines && Etcd is *not* managed", + controlPlane: &internal.ControlPlane{ + Cluster: cluster, + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + Version: "v1.31.0", + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{ + External: &bootstrapv1.ExternalEtcd{ + Endpoints: []string{"1.2.3.4"}, + }, + }, + }, + }, + }, + }, + Machines: collections.Machines{ + machine.Name: machine, // Leadership will be forwarded to this Machine. 
+					deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine {
+						m := deletingMachineWithKCPPreTerminateHook.DeepCopy()
+						conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message")
+						return m
+					}(),
+				},
+			},
+			wantForwardEtcdLeadershipCalled:      0, // skipped
+			wantRemoveEtcdMemberForMachineCalled: 0, // skipped
+			wantResult:                           ctrl.Result{RequeueAfter: deleteRequeueAfter},
+			wantMachineAnnotations: map[string]map[string]string{
+				machine.Name: machine.Annotations, // unchanged
+				deletingMachineWithKCPPreTerminateHook.Name: nil, // pre-terminate hook has been removed
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+
+			objs := []client.Object{}
+			for _, m := range tt.controlPlane.Machines {
+				objs = append(objs, m)
+			}
+			fakeClient := fake.NewClientBuilder().WithObjects(objs...).Build()
+
+			r := &KubeadmControlPlaneReconciler{
+				Client: fakeClient,
+			}
+
+			workloadCluster := fakeWorkloadCluster{}
+			tt.controlPlane.InjectTestManagementCluster(&fakeManagementCluster{
+				Workload: &workloadCluster,
+			})
+
+			res, err := r.reconcilePreTerminateHook(ctx, tt.controlPlane)
+			if tt.wantErr != "" {
+				g.Expect(err).To(HaveOccurred())
+				g.Expect(err.Error()).To(Equal(tt.wantErr))
+			} else {
+				g.Expect(err).ToNot(HaveOccurred())
+			}
+			g.Expect(res).To(Equal(tt.wantResult))
+
+			g.Expect(workloadCluster.forwardEtcdLeadershipCalled).To(Equal(tt.wantForwardEtcdLeadershipCalled))
+			g.Expect(workloadCluster.removeEtcdMemberForMachineCalled).To(Equal(tt.wantRemoveEtcdMemberForMachineCalled))
+
+			machineList := &clusterv1.MachineList{}
+			g.Expect(fakeClient.List(ctx, machineList)).To(Succeed())
+			g.Expect(machineList.Items).To(HaveLen(len(tt.wantMachineAnnotations)))
+			for _, machine := range machineList.Items {
+				g.Expect(machine.Annotations).To(BeComparableTo(tt.wantMachineAnnotations[machine.Name]), "Unexpected annotations for Machine %s", machine.Name)
+			}
+		})
+	}
+}
+
+func TestKubeadmControlPlaneReconciler_machineHasOtherPreTerminateHooks(t *testing.T) {
+	tests := []struct {
+		name        string
+		annotations map[string]string
+		want        bool
+	}{
+		{
+			name: "only KCP pre-terminate hook",
+			annotations: map[string]string{
+				controlplanev1.PreTerminateHookCleanupAnnotation: "",
+				"some-other-annotation":                          "",
+			},
+			want: false,
+		},
+		{
+			name: "KCP & additional pre-terminate hooks",
+			annotations: map[string]string{
+				controlplanev1.PreTerminateHookCleanupAnnotation: "",
+				"some-other-annotation":                          "",
+				clusterv1.PreTerminateDeleteHookAnnotationPrefix + "test":  "",
+				clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/test": "",
+			},
+			want: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+
+			m := &clusterv1.Machine{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: tt.annotations,
+				},
+			}
+			g.Expect(machineHasOtherPreTerminateHooks(m)).To(Equal(tt.want))
+		})
+	}
+}
+
 func TestKubeadmControlPlaneReconciler_updateCoreDNS(t *testing.T) {
 	// TODO: (wfernandes) This test could use some refactor love.
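The "with Kubernetes 1.31" cases above exercise the version gate in reconcilePreTerminateHook: the KCP version is compared while ignoring pre-releases, so v1.31 pre-releases already take the new code path. A minimal sketch of that comparison using plain github.com/blang/semver/v4; compareWithoutPreReleases is an illustrative stand-in for Cluster API's version.Compare(a, b, version.WithoutPreReleases()).

package main

import (
	"fmt"

	"github.com/blang/semver/v4"
)

// compareWithoutPreReleases mimics the intent of the gate: pre-release and
// build metadata are dropped before comparing, so that e.g. v1.31.0-beta.0
// is treated as v1.31.0 and already passes a ">= 1.31.0" check.
func compareWithoutPreReleases(a, b semver.Version) int {
	a.Pre, a.Build = nil, nil
	b.Pre, b.Build = nil, nil
	return a.Compare(b)
}

func main() {
	v, _ := semver.ParseTolerant("v1.31.0-beta.0") // ParseTolerant also accepts a leading "v", like KCP's spec.version
	gate := semver.MustParse("1.31.0")

	fmt.Println(v.Compare(gate) >= 0)                    // false: plain semver orders pre-releases below the release
	fmt.Println(compareWithoutPreReleases(v, gate) >= 0) // true: the behavior the gate relies on
}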
@@ -2122,7 +2484,7 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) {
 			SecretCachingClient: fakeClient,
 			managementCluster: &fakeManagementCluster{
 				Management: &internal.Management{Client: fakeClient},
-				Workload:   fakeWorkloadCluster{},
+				Workload:   &fakeWorkloadCluster{},
 			},
 
 			recorder: record.NewFakeRecorder(32),
@@ -2198,7 +2560,7 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) {
 			SecretCachingClient: fakeClient,
 			managementCluster: &fakeManagementCluster{
 				Management: &internal.Management{Client: fakeClient},
-				Workload:   fakeWorkloadCluster{},
+				Workload:   &fakeWorkloadCluster{},
 			},
 			recorder: record.NewFakeRecorder(32),
 		}
@@ -2256,7 +2618,7 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) {
 			SecretCachingClient: fakeClient,
 			managementCluster: &fakeManagementCluster{
 				Management: &internal.Management{Client: fakeClient},
-				Workload:   fakeWorkloadCluster{},
+				Workload:   &fakeWorkloadCluster{},
 			},
 			recorder: record.NewFakeRecorder(32),
 		}
@@ -2294,7 +2656,7 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) {
 			SecretCachingClient: fakeClient,
 			managementCluster: &fakeManagementCluster{
 				Management: &internal.Management{Client: fakeClient},
-				Workload:   fakeWorkloadCluster{},
+				Workload:   &fakeWorkloadCluster{},
 			},
 			recorder: record.NewFakeRecorder(32),
 		}
diff --git a/controlplane/kubeadm/internal/controllers/fakes_test.go b/controlplane/kubeadm/internal/controllers/fakes_test.go
index cf9fcbafe66e..6bf8ff1cb826 100644
--- a/controlplane/kubeadm/internal/controllers/fakes_test.go
+++ b/controlplane/kubeadm/internal/controllers/fakes_test.go
@@ -36,7 +36,7 @@ type fakeManagementCluster struct {
 	Management   *internal.Management
 	Machines     collections.Machines
 	MachinePools *expv1.MachinePoolList
-	Workload     fakeWorkloadCluster
+	Workload     *fakeWorkloadCluster
 	Reader       client.Reader
 }
 
@@ -71,68 +71,73 @@ type fakeWorkloadCluster struct {
 	Status                     internal.ClusterStatus
 	EtcdMembersResult          []string
 	APIServerCertificateExpiry *time.Time
+
+	forwardEtcdLeadershipCalled      int
+	removeEtcdMemberForMachineCalled int
 }
 
-func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error {
+func (f *fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error {
+	f.forwardEtcdLeadershipCalled++
 	if leaderCandidate == nil {
 		return errors.New("leaderCandidate is nil")
 	}
 	return nil
 }
 
-func (f fakeWorkloadCluster) ReconcileEtcdMembers(_ context.Context, _ []string, _ semver.Version) ([]string, error) {
+func (f *fakeWorkloadCluster) ReconcileEtcdMembers(_ context.Context, _ []string, _ semver.Version) ([]string, error) {
 	return nil, nil
 }
 
-func (f fakeWorkloadCluster) ClusterStatus(_ context.Context) (internal.ClusterStatus, error) {
+func (f *fakeWorkloadCluster) ClusterStatus(_ context.Context) (internal.ClusterStatus, error) {
 	return f.Status, nil
 }
 
-func (f fakeWorkloadCluster) GetAPIServerCertificateExpiry(_ context.Context, _ *bootstrapv1.KubeadmConfig, _ string) (*time.Time, error) {
+func (f *fakeWorkloadCluster) GetAPIServerCertificateExpiry(_ context.Context, _ *bootstrapv1.KubeadmConfig, _ string) (*time.Time, error) {
 	return f.APIServerCertificateExpiry, nil
 }
 
-func (f fakeWorkloadCluster) AllowBootstrapTokensToGetNodes(_ context.Context) error {
+func (f *fakeWorkloadCluster) AllowBootstrapTokensToGetNodes(_ context.Context) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) AllowClusterAdminPermissions(_ context.Context, _ semver.Version) error {
+func (f *fakeWorkloadCluster) AllowClusterAdminPermissions(_ context.Context, _ semver.Version) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) ReconcileKubeletRBACRole(_ context.Context, _ semver.Version) error {
+func (f *fakeWorkloadCluster) ReconcileKubeletRBACRole(_ context.Context, _ semver.Version) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) ReconcileKubeletRBACBinding(_ context.Context, _ semver.Version) error {
+func (f *fakeWorkloadCluster) ReconcileKubeletRBACBinding(_ context.Context, _ semver.Version) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) UpdateKubernetesVersionInKubeadmConfigMap(semver.Version) func(*bootstrapv1.ClusterConfiguration) {
+func (f *fakeWorkloadCluster) UpdateKubernetesVersionInKubeadmConfigMap(semver.Version) func(*bootstrapv1.ClusterConfiguration) {
 	return nil
 }
 
-func (f fakeWorkloadCluster) UpdateEtcdLocalInKubeadmConfigMap(*bootstrapv1.LocalEtcd) func(*bootstrapv1.ClusterConfiguration) {
+func (f *fakeWorkloadCluster) UpdateEtcdLocalInKubeadmConfigMap(*bootstrapv1.LocalEtcd) func(*bootstrapv1.ClusterConfiguration) {
 	return nil
 }
 
-func (f fakeWorkloadCluster) UpdateKubeletConfigMap(_ context.Context, _ semver.Version) error {
+func (f *fakeWorkloadCluster) UpdateKubeletConfigMap(_ context.Context, _ semver.Version) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) RemoveEtcdMemberForMachine(_ context.Context, _ *clusterv1.Machine) error {
+func (f *fakeWorkloadCluster) RemoveEtcdMemberForMachine(_ context.Context, _ *clusterv1.Machine) error {
+	f.removeEtcdMemberForMachineCalled++
 	return nil
 }
 
-func (f fakeWorkloadCluster) RemoveMachineFromKubeadmConfigMap(_ context.Context, _ *clusterv1.Machine, _ semver.Version) error {
+func (f *fakeWorkloadCluster) RemoveMachineFromKubeadmConfigMap(_ context.Context, _ *clusterv1.Machine, _ semver.Version) error {
 	return nil
 }
 
-func (f fakeWorkloadCluster) EtcdMembers(_ context.Context) ([]string, error) {
+func (f *fakeWorkloadCluster) EtcdMembers(_ context.Context) ([]string, error) {
 	return f.EtcdMembersResult, nil
 }
 
-func (f fakeWorkloadCluster) UpdateClusterConfiguration(context.Context, semver.Version, ...func(*bootstrapv1.ClusterConfiguration)) error {
+func (f *fakeWorkloadCluster) UpdateClusterConfiguration(context.Context, semver.Version, ...func(*bootstrapv1.ClusterConfiguration)) error {
 	return nil
 }
diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go
index ab4abc7bd220..e156b42619e0 100644
--- a/controlplane/kubeadm/internal/controllers/remediation_test.go
+++ b/controlplane/kubeadm/internal/controllers/remediation_test.go
@@ -254,7 +254,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -305,7 +305,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -363,7 +363,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -419,7 +419,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -593,7 +593,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -637,7 +637,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -681,7 +681,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -731,7 +731,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -768,7 +768,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		// Reconcile unhealthy replacements for m1.
 		r.managementCluster = &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				EtcdMembersResult: nodes(collections.FromMachines(mi)),
 			},
 		}
@@ -820,7 +820,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -872,7 +872,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -924,7 +924,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -977,7 +977,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1030,7 +1030,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1083,7 +1083,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1127,7 +1127,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1163,7 +1163,7 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
 
 		// Reconcile unhealthy replacements for m1.
 		r.managementCluster = &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				EtcdMembersResult: nodes(collections.FromMachines(mi, m2, m3)),
 			},
 		}
@@ -1232,7 +1232,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1268,7 +1268,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 		controlPlane.Machines = collections.FromMachines(m2)
 
 		r.managementCluster = &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				EtcdMembersResult: nodes(controlPlane.Machines),
 			},
 		}
@@ -1341,7 +1341,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1378,7 +1378,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 		controlPlane.Machines = collections.FromMachines(m1, m3)
 
 		r.managementCluster = &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				EtcdMembersResult: nodes(controlPlane.Machines),
 			},
 		}
@@ -1453,7 +1453,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1485,7 +1485,7 @@ func TestReconcileUnhealthyMachinesSequences(t *testing.T) {
 		controlPlane.Machines = collections.FromMachines(m1, m3)
 
 		r.managementCluster = &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				EtcdMembersResult: nodes(controlPlane.Machines),
 			},
 		}
@@ -1525,7 +1525,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1557,7 +1557,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1595,7 +1595,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: members,
 				},
 			},
@@ -1626,7 +1626,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1658,7 +1658,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1697,7 +1697,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: members,
 				},
 			},
@@ -1729,7 +1729,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1763,7 +1763,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1797,7 +1797,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1833,7 +1833,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
@@ -1869,7 +1869,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
 			Client:   env.GetClient(),
 			recorder: record.NewFakeRecorder(32),
 			managementCluster: &fakeManagementCluster{
-				Workload: fakeWorkloadCluster{
+				Workload: &fakeWorkloadCluster{
 					EtcdMembersResult: nodes(controlPlane.Machines),
 				},
 			},
diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go
index 00f0cbb686fd..8fc19b072c77 100644
--- a/controlplane/kubeadm/internal/controllers/scale_test.go
+++ b/controlplane/kubeadm/internal/controllers/scale_test.go
@@ -70,7 +70,7 @@ func TestKubeadmControlPlaneReconciler_initializeControlPlane(t *testing.T) {
 		recorder: record.NewFakeRecorder(32),
 		managementClusterUncached: &fakeManagementCluster{
 			Management: &internal.Management{Client: env},
-			Workload:   fakeWorkloadCluster{},
+			Workload:   &fakeWorkloadCluster{},
 		},
 	}
 	controlPlane := &internal.ControlPlane{
@@ -139,7 +139,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 
 		fmc := &fakeManagementCluster{
 			Machines: collections.New(),
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		}
 
 		for i := range 2 {
@@ -214,7 +214,7 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 
 		fmc := &fakeManagementCluster{
 			Machines: beforeMachines.DeepCopy(),
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		}
 
 		r := &KubeadmControlPlaneReconciler{
@@ -265,7 +265,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 		Client:              fakeClient,
 		SecretCachingClient: fakeClient,
 		managementCluster: &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		},
 	}
 
@@ -308,7 +308,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 		Client:              fakeClient,
 		SecretCachingClient: fakeClient,
 		managementCluster: &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		},
 	}
 
@@ -350,7 +350,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 		Client:              fakeClient,
 		SecretCachingClient: fakeClient,
 		managementCluster: &fakeManagementCluster{
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		},
 	}
 
diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go
index ab5d5a2e2a49..7d08a6e6181d 100644
--- a/controlplane/kubeadm/internal/controllers/status_test.go
+++ b/controlplane/kubeadm/internal/controllers/status_test.go
@@ -75,7 +75,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) {
 		Client: fakeClient,
 		managementCluster: &fakeManagementCluster{
 			Machines: map[string]*clusterv1.Machine{},
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		},
 		recorder: record.NewFakeRecorder(32),
 	}
@@ -147,7 +147,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesNotReady(t *testin
 		Client: fakeClient,
 		managementCluster: &fakeManagementCluster{
 			Machines: machines,
-			Workload: fakeWorkloadCluster{},
+			Workload: &fakeWorkloadCluster{},
 		},
 		recorder: record.NewFakeRecorder(32),
 	}
@@ -220,7 +220,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusAllMachinesReady(t *testing.T
 		Client: fakeClient,
 		managementCluster: &fakeManagementCluster{
 			Machines: machines,
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Status: internal.ClusterStatus{
 					Nodes:      3,
 					ReadyNodes: 3,
@@ -302,7 +302,7 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
 		Client: fakeClient,
 		managementCluster: &fakeManagementCluster{
 			Machines: machines,
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Status: internal.ClusterStatus{
 					Nodes:      5,
 					ReadyNodes: 1,
@@ -383,7 +383,7 @@ func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAr
 		Client: fakeClient,
 		managementCluster: &fakeManagementCluster{
 			Machines: machines,
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Status: internal.ClusterStatus{
 					Nodes:      0,
 					ReadyNodes: 0,
diff --git a/controlplane/kubeadm/internal/controllers/upgrade_test.go b/controlplane/kubeadm/internal/controllers/upgrade_test.go
index b2f60094e844..6c4f39bee6ae 100644
--- a/controlplane/kubeadm/internal/controllers/upgrade_test.go
+++ b/controlplane/kubeadm/internal/controllers/upgrade_test.go
@@ -84,13 +84,13 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
 		recorder: record.NewFakeRecorder(32),
 		managementCluster: &fakeManagementCluster{
 			Management: &internal.Management{Client: env},
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Status: internal.ClusterStatus{Nodes: 1},
 			},
 		},
 		managementClusterUncached: &fakeManagementCluster{
 			Management: &internal.Management{Client: env},
-			Workload: fakeWorkloadCluster{
+			Workload: &fakeWorkloadCluster{
 				Status: internal.ClusterStatus{Nodes: 1},
 			},
 		},
@@ -192,7 +192,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleDown(t *testing.T) {
 
 	fmc := &fakeManagementCluster{
 		Machines: collections.Machines{},
-		Workload: fakeWorkloadCluster{
+		Workload: &fakeWorkloadCluster{
 			Status: internal.ClusterStatus{Nodes: 3},
 		},
 	}

From 4a3af971f5b699f5be1ede07664a88fefea8b619 Mon Sep 17 00:00:00 2001
From: Stefan Bueringer
Date: Thu, 5 Sep 2024 15:52:52 +0200
Subject: [PATCH 4/5] Add unit tests for reconcilePreTerminateHook - fixups

---
 .../kubeadm/internal/controllers/controller_test.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go
index b645c0590c41..bbd11dcfb5de 100644
--- a/controlplane/kubeadm/internal/controllers/controller_test.go
+++ b/controlplane/kubeadm/internal/controllers/controller_test.go
@@ -1917,13 +1917,15 @@ func TestKubeadmControlPlaneReconciler_reconcilePreTerminateHook(t *testing.T) {
 				},
 			},
 			Machines: collections.Machines{
-				deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook,
+				deletingMachineWithKCPPreTerminateHook.Name:              deletingMachineWithKCPPreTerminateHook,
+				deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name: deletingMachineWithKCPAndOtherPreTerminateHooksOld,
 			},
 		},
 		wantResult: ctrl.Result{RequeueAfter: deleteRequeueAfter},
 		// Annotations are unchanged.
 		wantMachineAnnotations: map[string]map[string]string{
-			deletingMachineWithKCPPreTerminateHook.Name: deletingMachineWithKCPPreTerminateHook.Annotations,
+			deletingMachineWithKCPPreTerminateHook.Name:              deletingMachineWithKCPPreTerminateHook.Annotations,
+			deletingMachineWithKCPAndOtherPreTerminateHooksOld.Name: deletingMachineWithKCPAndOtherPreTerminateHooksOld.Annotations,
 		},
 	},
 	{
@@ -2081,7 +2083,7 @@ func TestKubeadmControlPlaneReconciler_reconcilePreTerminateHook(t *testing.T) {
 				ClusterConfiguration: &bootstrapv1.ClusterConfiguration{
 					Etcd: bootstrapv1.Etcd{
 						External: &bootstrapv1.ExternalEtcd{
-							Endpoints: []string{"1.2.3.4"},
+							Endpoints: []string{"1.2.3.4"}, // Etcd is not managed by KCP
 						},
 					},
 				},
@@ -2089,7 +2091,7 @@ func TestKubeadmControlPlaneReconciler_reconcilePreTerminateHook(t *testing.T) {
 			},
 		},
 		Machines: collections.Machines{
-			machine.Name: machine, // Leadership will be forwarded to this Machine.
+			machine.Name: machine,
 			deletingMachineWithKCPPreTerminateHook.Name: func() *clusterv1.Machine {
 				m := deletingMachineWithKCPPreTerminateHook.DeepCopy()
 				conditions.MarkFalse(m, clusterv1.PreTerminateDeleteHookSucceededCondition, clusterv1.WaitingExternalHookReason, clusterv1.ConditionSeverityInfo, "some message")

From 8ed4065e8537e906b01efa21da0665139ba4b0bc Mon Sep 17 00:00:00 2001
From: Stefan Bueringer
Date: Thu, 5 Sep 2024 16:07:24 +0200
Subject: [PATCH 5/5] fixup

---
 .../internal/controllers/controller.go | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go
index 2dd618d220fb..bdcc4e3e36af 100644
--- a/controlplane/kubeadm/internal/controllers/controller.go
+++ b/controlplane/kubeadm/internal/controllers/controller.go
@@ -579,11 +579,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con
 			// to forward etcd leadership without any member left after we went through the Machine deletion.
 			// Also in this case the reconcileDelete code of the Machine controller won't execute Node drain
 			// and wait for volume detach.
- log.Info("Removing pre-terminate hook from control plane machine") - deletingMachineOriginal := machineToDelete.DeepCopy() - delete(machineToDelete.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation) - if err := r.Client.Patch(ctx, machineToDelete, client.MergeFrom(deletingMachineOriginal)); err != nil { - errs = append(errs, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(machineToDelete))) + if err := r.removePreTerminateHookAnnotationFromMachine(ctx, machineToDelete); err != nil { + errs = append(errs, err) continue } @@ -602,6 +599,18 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil } +func (r *KubeadmControlPlaneReconciler) removePreTerminateHookAnnotationFromMachine(ctx context.Context, machine *clusterv1.Machine) error { + log := ctrl.LoggerFrom(ctx) + log.Info("Removing pre-terminate hook from control plane Machine") + + machineOriginal := machine.DeepCopy() + delete(machine.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation) + if err := r.Client.Patch(ctx, machine, client.MergeFrom(machineOriginal)); err != nil { + return errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(machine)) + } + return nil +} + // ClusterToKubeadmControlPlane is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation // for KubeadmControlPlane based on updates to a Cluster. func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(_ context.Context, o client.Object) []ctrl.Request { @@ -882,11 +891,8 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co } } - log.Info("Removing pre-terminate hook from control plane Machine") - deletingMachineOriginal := deletingMachine.DeepCopy() - delete(deletingMachine.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation) - if err := r.Client.Patch(ctx, deletingMachine, client.MergeFrom(deletingMachineOriginal)); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(deletingMachine)) + if err := r.removePreTerminateHookAnnotationFromMachine(ctx, deletingMachine); err != nil { + return ctrl.Result{}, err } log.Info("Waiting for Machines to be deleted", "machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", "))