diff --git a/Makefile b/Makefile
index 945462337d44..b4698bf7ae91 100644
--- a/Makefile
+++ b/Makefile
@@ -600,7 +600,6 @@ generate-e2e-templates-main: $(KUSTOMIZE)
 	echo "---" >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption/step2 --load-restrictor LoadRestrictionsNone >> $(DOCKER_TEMPLATES)/main/cluster-template-kcp-adoption.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
-	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-node-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-node-drain.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
 	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
index abe9bf6e3b12..5544291360dd 100644
--- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
+++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
@@ -317,8 +317,8 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr
       t.Machine,
       corev1.EventTypeNormal,
       EventDetectedUnhealthy,
-      "Machine %v has unhealthy node %v",
-      t.string(),
+      "Machine %s has unhealthy Node %s",
+      klog.KObj(t.Machine),
       t.nodeName(),
     )
     nextCheckTimes = append(nextCheckTimes, nextCheck)
diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml
index 242055c0d64a..fce2cb55dd18 100644
--- a/test/e2e/config/docker.yaml
+++ b/test/e2e/config/docker.yaml
@@ -347,7 +347,6 @@ providers:
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
-    - sourcePath: "../data/infrastructure-docker/main/cluster-template-node-drain.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
     - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
@@ -408,7 +407,6 @@ variables:
   CNI: "./data/cni/kindnet/kindnet.yaml"
   KUBETEST_CONFIGURATION: "./data/kubetest/conformance.yaml"
   AUTOSCALER_WORKLOAD: "./data/autoscaler/autoscaler-to-workload-workload.yaml"
-  NODE_DRAIN_TIMEOUT: "60s"
   # Enabling the feature flags by setting the env variables.
   # Note: EXP_CLUSTER_RESOURCE_SET & EXP_MACHINE_POOL are enabled per default with CAPI v1.7.0.
   # We still have to enable them here for clusterctl upgrade tests that use older versions.
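Note on the MachineHealthCheck event change above: `klog.KObj` renders an object as `namespace/name`, so the event message now matches the structured logging used elsewhere in the controllers. A minimal, hedged sketch of the resulting format (object names are made up for illustration):

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	// Hypothetical Machine object; only Namespace/Name matter for klog.KObj.
	machine := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "my-cluster-md-0-abcde"},
	}
	// klog.KObj implements fmt.Stringer and prints "namespace/name", so %s yields:
	// "Machine default/my-cluster-md-0-abcde has unhealthy Node my-cluster-md-0-xyz"
	fmt.Printf("Machine %s has unhealthy Node %s\n", klog.KObj(machine), "my-cluster-md-0-xyz")
}
```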
diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml
deleted file mode 100644
index 91a5a7f4728f..000000000000
--- a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/cluster-with-kcp.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-# KubeadmControlPlane referenced by the Cluster object with
-# - the label kcp-adoption.step2, because it should be created in the second step of the kcp-adoption test.
-kind: KubeadmControlPlane
-apiVersion: controlplane.cluster.x-k8s.io/v1beta1
-metadata:
-  name: "${CLUSTER_NAME}-control-plane"
-spec:
-  machineTemplate:
-    nodeDrainTimeout: ${NODE_DRAIN_TIMEOUT}
diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml
deleted file mode 100644
index a2f9bea098b4..000000000000
--- a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/kustomization.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-resources:
-- ../bases/crs.yaml
-- ../bases/md.yaml
-- ../bases/cluster-with-kcp.yaml
-
-patches:
-- path: md.yaml
-- path: cluster-with-kcp.yaml
diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml
deleted file mode 100644
index bc4577762f31..000000000000
--- a/test/e2e/data/infrastructure-docker/main/cluster-template-node-drain/md.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-apiVersion: cluster.x-k8s.io/v1beta1
-kind: MachineDeployment
-metadata:
-  name: "${CLUSTER_NAME}-md-0"
-spec:
-  template:
-    spec:
-      nodeDrainTimeout: "${NODE_DRAIN_TIMEOUT}"
diff --git a/test/e2e/node_drain_timeout.go b/test/e2e/node_drain_timeout.go
index 51337e860431..fa4a8f353011 100644
--- a/test/e2e/node_drain_timeout.go
+++ b/test/e2e/node_drain_timeout.go
@@ -25,15 +25,18 @@ import (
   . "github.com/onsi/ginkgo/v2"
   . "github.com/onsi/gomega"
+  appsv1 "k8s.io/api/apps/v1"
   corev1 "k8s.io/api/core/v1"
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+  "k8s.io/apimachinery/pkg/fields"
   "k8s.io/utils/ptr"
+  "sigs.k8s.io/controller-runtime/pkg/client"
   clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
-  controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
   "sigs.k8s.io/cluster-api/test/framework"
   "sigs.k8s.io/cluster-api/test/framework/clusterctl"
   "sigs.k8s.io/cluster-api/util"
+  "sigs.k8s.io/cluster-api/util/conditions"
 )
 // NodeDrainTimeoutSpecInput is the input for NodeDrainTimeoutSpec.
@@ -52,10 +55,8 @@ type NodeDrainTimeoutSpecInput struct {
   // able to identify the default.
   InfrastructureProvider *string
-  // Flavor, if specified, must refer to a template that contains
-  // a KubeadmControlPlane resource with spec.machineTemplate.nodeDrainTimeout
-  // configured and a MachineDeployment resource that has
-  // spec.template.spec.nodeDrainTimeout configured.
+  // Flavor, if specified, must refer to a template that uses a Cluster with ClusterClass.
+  // The cluster must use a KubeadmControlPlane and a MachineDeployment.
   // If not specified, "node-drain" is used.
   Flavor *string
@@ -64,15 +65,33 @@ type NodeDrainTimeoutSpecInput struct {
   PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)
 }
+// NodeDrainTimeoutSpec goes through the following steps:
+// * Create cluster with 3 CP & 1 worker Machine
+// * Ensure Node label is set & NodeDrainTimeout is set to 0 (wait forever)
+// * Deploy Deployment with unevictable Pods on CP & MD Nodes
+// * Deploy Deployment with evictable Pods with finalizer on CP & MD Nodes
+// * Trigger Scale down to 1 CP and 0 MD Machines
+// * Verify Node drains for control plane and MachineDeployment Machines are blocked (PDBs & Pods with finalizer)
+//   - DrainingSucceeded conditions should:
+//     - show 1 evicted Pod with deletionTimestamp (still exists because of finalizer)
+//     - show 1 Pod which could not be evicted because of PDB
+//   - Verify the evicted Pod has terminated (i.e. succeeded) and it was evicted
+//
+// * Unblock deletion of evicted Pods by removing the finalizer
+// * Verify Node drains for control plane and MachineDeployment Machines are blocked (only PDBs)
+//   - DrainingSucceeded conditions should:
+//     - not contain any Pods with deletionTimestamp
+//     - show 1 Pod which could not be evicted because of PDB
+//
+// * Set NodeDrainTimeout to 1s to unblock drain
+// * Verify scale down succeeded because Node drains were unblocked.
 func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
   var (
-    specName           = "node-drain"
-    input              NodeDrainTimeoutSpecInput
-    namespace          *corev1.Namespace
-    cancelWatches      context.CancelFunc
-    clusterResources   *clusterctl.ApplyClusterTemplateAndWaitResult
-    machineDeployments []*clusterv1.MachineDeployment
-    controlplane       *controlplanev1.KubeadmControlPlane
+    specName         = "node-drain"
+    input            NodeDrainTimeoutSpecInput
+    namespace        *corev1.Namespace
+    cancelWatches    context.CancelFunc
+    clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
   )
   BeforeEach(func() {
@@ -97,6 +116,7 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
     if input.InfrastructureProvider != nil {
       infrastructureProvider = *input.InfrastructureProvider
     }
+    controlPlaneReplicas := 3
     clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
       ClusterProxy: input.BootstrapClusterProxy,
@@ -118,52 +138,275 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
       WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
     }, clusterResources)
     cluster := clusterResources.Cluster
-    controlplane = clusterResources.ControlPlane
-    machineDeployments = clusterResources.MachineDeployments
+    controlplane := clusterResources.ControlPlane
+    machineDeployments := clusterResources.MachineDeployments
     Expect(machineDeployments[0].Spec.Replicas).To(Equal(ptr.To[int32](1)))
-    By("Add a deployment with unevictable pods and podDisruptionBudget to the workload cluster. The deployed pods cannot be evicted in the node draining process.")
+    // This label will be added to all Machines so we can later create Pods on the right Nodes.
+    nodeOwnerLabelKey := "owner.node.cluster.x-k8s.io"
+
+    By("Ensure Node label is set & NodeDrainTimeout is set to 0 (wait forever) on ControlPlane and MachineDeployment topologies")
+    modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+        topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+        if topology.Metadata.Labels == nil {
+          topology.Metadata.Labels = map[string]string{}
+        }
+        topology.Metadata.Labels[nodeOwnerLabelKey] = "KubeadmControlPlane-" + controlplane.Name
+      },
+      WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+    })
+    modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+        topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+        if topology.Metadata.Labels == nil {
+          topology.Metadata.Labels = map[string]string{}
+        }
+        for _, md := range machineDeployments {
+          if md.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] == topology.Name {
+            topology.Metadata.Labels[nodeOwnerLabelKey] = "MachineDeployment-" + md.Name
+          }
+        }
+      },
+      WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+    })
+    workloadClusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name)
+
+    By("Deploy Deployment with unevictable Pods on control plane Nodes.")
+    cpDeploymentAndPDBName := fmt.Sprintf("%s-%s", "unevictable-pod-cp", util.RandomString(3))
     framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
       WorkloadClusterProxy: workloadClusterProxy,
-      DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
-      Namespace: namespace.Name + "-unevictable-workload",
+      ControlPlane: controlplane,
+      DeploymentName: cpDeploymentAndPDBName,
+      Namespace: "unevictable-workload",
+      NodeSelector: map[string]string{nodeOwnerLabelKey: "KubeadmControlPlane-" + controlplane.Name},
       WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
     })
-
-    By("Scale the machinedeployment down to zero. If we didn't have the NodeDrainTimeout duration, the node drain process would block this operator.")
-    // Because all the machines of a machinedeployment can be deleted at the same time, so we only prepare the interval for 1 replica.
-    nodeDrainTimeoutMachineDeploymentInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), machineDeployments[0].Spec.Template.Spec.NodeDrainTimeout, 1)
+    By("Deploy Deployment with unevictable Pods on MachineDeployment Nodes.")
+    mdDeploymentAndPDBNames := map[string]string{}
     for _, md := range machineDeployments {
-      framework.ScaleAndWaitMachineDeployment(ctx, framework.ScaleAndWaitMachineDeploymentInput{
-        ClusterProxy: input.BootstrapClusterProxy,
-        Cluster: cluster,
-        MachineDeployment: md,
-        WaitForMachineDeployments: nodeDrainTimeoutMachineDeploymentInterval,
-        Replicas: 0,
+      mdDeploymentAndPDBNames[md.Name] = fmt.Sprintf("%s-%s", "unevictable-pod-md", util.RandomString(3))
+      framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
+        WorkloadClusterProxy: workloadClusterProxy,
+        MachineDeployment: md,
+        DeploymentName: mdDeploymentAndPDBNames[md.Name],
+        Namespace: "unevictable-workload",
+        NodeSelector: map[string]string{nodeOwnerLabelKey: "MachineDeployment-" + md.Name},
+        WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
       })
     }
-    By("Deploy deployment with unevictable pods on control plane nodes.")
-    framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
-      WorkloadClusterProxy: workloadClusterProxy,
-      ControlPlane: controlplane,
-      DeploymentName: fmt.Sprintf("%s-%s", "unevictable-pod", util.RandomString(3)),
-      Namespace: namespace.Name + "-unevictable-workload",
+    By("Deploy Deployment with evictable Pods with finalizer on control plane Nodes.")
+    cpDeploymentWithFinalizerName := fmt.Sprintf("%s-%s", "evictable-pod-cp", util.RandomString(3))
+    framework.DeployEvictablePod(ctx, framework.DeployEvictablePodInput{
+      WorkloadClusterProxy: workloadClusterProxy,
+      ControlPlane: controlplane,
+      DeploymentName: cpDeploymentWithFinalizerName,
+      Namespace: "evictable-workload",
+      NodeSelector: map[string]string{nodeOwnerLabelKey: "KubeadmControlPlane-" + controlplane.Name},
+      ModifyDeployment: func(deployment *appsv1.Deployment) {
+        deployment.Spec.Template.ObjectMeta.Finalizers = []string{"test.cluster.x-k8s.io/block"}
+      },
       WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
     })
+    By("Deploy Deployment with evictable Pods with finalizer on MachineDeployment Nodes.")
+    mdDeploymentWithFinalizerName := map[string]string{}
+    for _, md := range machineDeployments {
+      mdDeploymentWithFinalizerName[md.Name] = fmt.Sprintf("%s-%s", "evictable-pod-md", util.RandomString(3))
+      framework.DeployEvictablePod(ctx, framework.DeployEvictablePodInput{
+        WorkloadClusterProxy: workloadClusterProxy,
+        MachineDeployment: md,
+        DeploymentName: mdDeploymentWithFinalizerName[md.Name],
+        Namespace: "evictable-workload",
+        NodeSelector: map[string]string{nodeOwnerLabelKey: "MachineDeployment-" + md.Name},
+        ModifyDeployment: func(deployment *appsv1.Deployment) {
+          deployment.Spec.Template.ObjectMeta.Finalizers = []string{"test.cluster.x-k8s.io/block"}
+        },
+        WaitForDeploymentAvailableInterval: input.E2EConfig.GetIntervals(specName, "wait-deployment-available"),
+      })
+    }
+
+    By("Trigger scale down the control plane to 1 and MachineDeployments to 0.")
+    modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+        topology.Replicas = ptr.To[int32](1)
+      },
+      WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+    })
+    modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+        topology.Replicas = ptr.To[int32](0)
+      },
+      WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
+    })
-    By("Scale down the controlplane of the workload cluster and make sure that nodes running workload can be deleted even the draining process is blocked.")
-    // When we scale down the KCP, controlplane machines are by default deleted one by one, so it requires more time.
-    nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), controlplane.Spec.MachineTemplate.NodeDrainTimeout, controlPlaneReplicas)
-    framework.ScaleAndWaitControlPlane(ctx, framework.ScaleAndWaitControlPlaneInput{
-      ClusterProxy: input.BootstrapClusterProxy,
-      Cluster: cluster,
-      ControlPlane: controlplane,
-      Replicas: 1,
-      WaitForControlPlane: nodeDrainTimeoutKCPInterval,
+    By("Verify Node drains for control plane and MachineDeployment Machines are blocked (PDBs & Pods with finalizer)")
+    var drainedCPMachine *clusterv1.Machine
+    var evictedCPPod *corev1.Pod
+    Eventually(func(g Gomega) {
+      controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+        Lister: input.BootstrapClusterProxy.GetClient(),
+        ClusterName: cluster.Name,
+        Namespace: cluster.Namespace,
+      })
+      var condition *clusterv1.Condition
+      for _, machine := range controlPlaneMachines {
+        condition = conditions.Get(&machine, clusterv1.DrainingSucceededCondition)
+        if condition != nil {
+          // We only expect to find the condition on one Machine (as KCP will only try to drain one Machine at a time)
+          drainedCPMachine = &machine
+          break
+        }
+      }
+      g.Expect(condition).ToNot(BeNil())
+      g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+      // The evictable Pod should be evicted. It still blocks the drain because of the finalizer, otherwise the Pod would be gone already.
+      g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Pods with deletionTimestamp that still exist: evictable-workload/%s", cpDeploymentWithFinalizerName)))
+      // The unevictable Pod should not be evicted because of the PDB.
+      g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", cpDeploymentAndPDBName)))
+
+      // Verify evictable Pod was evicted and terminated (i.e. phase is succeeded)
+      evictedPods := &corev1.PodList{}
+      g.Expect(workloadClusterProxy.GetClient().List(ctx, evictedPods,
+        client.InNamespace("evictable-workload"),
+        client.MatchingLabels{"deployment": cpDeploymentWithFinalizerName},
+        client.MatchingFieldsSelector{Selector: fields.OneTermEqualSelector("spec.nodeName", drainedCPMachine.Status.NodeRef.Name)},
+      )).To(Succeed())
+      g.Expect(evictedPods.Items).To(HaveLen(1))
+      evictedCPPod = &evictedPods.Items[0]
+      verifyPodEvictedAndSucceeded(g, evictedCPPod)
+    }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    drainedMDMachines := map[string]*clusterv1.Machine{}
+    evictedMDPods := map[string]*corev1.Pod{}
+    for _, md := range machineDeployments {
+      Eventually(func(g Gomega) {
+        machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+          Lister: input.BootstrapClusterProxy.GetClient(),
+          ClusterName: cluster.Name,
+          Namespace: cluster.Namespace,
+          MachineDeployment: *md,
+        })
+        g.Expect(machines).To(HaveLen(1))
+        drainedMDMachines[md.Name] = &machines[0]
+
+        condition := conditions.Get(&machines[0], clusterv1.DrainingSucceededCondition)
+        g.Expect(condition).ToNot(BeNil())
+        g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+        // The evictable Pod should be evicted. It still blocks the drain because of the finalizer, otherwise the Pod would be gone already.
+        g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Pods with deletionTimestamp that still exist: evictable-workload/%s", mdDeploymentWithFinalizerName[md.Name])))
+        // The unevictable Pod should not be evicted because of the PDB.
+        g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", mdDeploymentAndPDBNames[md.Name])))
+
+        // Verify evictable Pod was evicted and terminated (i.e. phase is succeeded)
+        evictedPods := &corev1.PodList{}
+        g.Expect(workloadClusterProxy.GetClient().List(ctx, evictedPods,
+          client.InNamespace("evictable-workload"),
+          client.MatchingLabels{"deployment": mdDeploymentWithFinalizerName[md.Name]},
+          client.MatchingFieldsSelector{Selector: fields.OneTermEqualSelector("spec.nodeName", machines[0].Status.NodeRef.Name)},
+        )).To(Succeed())
+        g.Expect(evictedPods.Items).To(HaveLen(1))
+        evictedMDPods[md.Name] = &evictedPods.Items[0]
+        verifyPodEvictedAndSucceeded(g, evictedMDPods[md.Name])
+      }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    }
+
+    By("Unblock deletion of evicted Pods by removing the finalizer")
+    Eventually(func(g Gomega) {
+      g.Expect(workloadClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(evictedCPPod), evictedCPPod)).To(Succeed())
+      originalPod := evictedCPPod.DeepCopy()
+      evictedCPPod.Finalizers = []string{}
+      g.Expect(workloadClusterProxy.GetClient().Patch(ctx, evictedCPPod, client.MergeFrom(originalPod))).To(Succeed())
+    }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    for _, md := range machineDeployments {
+      Eventually(func(g Gomega) {
+        evictedMDPod := evictedMDPods[md.Name]
+        g.Expect(workloadClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(evictedMDPod), evictedMDPod)).To(Succeed())
+        originalPod := evictedMDPod.DeepCopy()
+        evictedMDPod.Finalizers = []string{}
+        g.Expect(workloadClusterProxy.GetClient().Patch(ctx, evictedMDPod, client.MergeFrom(originalPod))).To(Succeed())
+      }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    }
+
+    By("Verify Node drains for control plane and MachineDeployment Machines are blocked (only PDBs)")
+    Eventually(func(g Gomega) {
+      g.Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(drainedCPMachine), drainedCPMachine)).To(Succeed())
+
+      condition := conditions.Get(drainedCPMachine, clusterv1.DrainingSucceededCondition)
+      g.Expect(condition).ToNot(BeNil())
+      g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+      // The evictable Pod should be gone now.
+      g.Expect(condition.Message).ToNot(ContainSubstring("Pods with deletionTimestamp that still exist"))
+      // The unevictable Pod should still not be evicted because of the PDB.
+      g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", cpDeploymentAndPDBName)))
+    }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    for _, md := range machineDeployments {
+      Eventually(func(g Gomega) {
+        g.Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(drainedMDMachines[md.Name]), drainedMDMachines[md.Name])).To(Succeed())
+
+        condition := conditions.Get(drainedMDMachines[md.Name], clusterv1.DrainingSucceededCondition)
+        g.Expect(condition).ToNot(BeNil())
+        g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+        // The evictable Pod should be gone now.
+        g.Expect(condition.Message).ToNot(ContainSubstring("Pods with deletionTimestamp that still exist"))
+        // The unevictable Pod should still not be evicted because of the PDB.
+        g.Expect(condition.Message).To(ContainSubstring(fmt.Sprintf("Cannot evict pod as it would violate the pod's disruption budget. The disruption budget %s needs", mdDeploymentAndPDBNames[md.Name])))
+      }, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+    }
+
+    By("Set NodeDrainTimeout to 1s to unblock Node drain")
+    // Note: This also verifies that KCP & MachineDeployments are still propagating changes to NodeDrainTimeout down to
+    // Machines that already have a deletionTimestamp.
+    drainTimeout := &metav1.Duration{Duration: time.Duration(1) * time.Second}
+    modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+        topology.NodeDrainTimeout = drainTimeout
+      },
+      WaitForControlPlane: input.E2EConfig.GetIntervals(specName, "wait-control-plane"),
+    })
+    modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+      ClusterProxy: input.BootstrapClusterProxy,
+      Cluster: cluster,
+      ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+        topology.NodeDrainTimeout = drainTimeout
+      },
+      WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
     })
+    By("Verify scale down succeeded because Node drains were unblocked")
+    // When we scale down the KCP, control plane Machines are deleted one by one, so it requires more time.
+    // MD Machine deletion is done in parallel and will be faster.
+    nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), drainTimeout, controlPlaneReplicas)
+    Eventually(func(g Gomega) {
+      // When all drains complete we only have 1 control plane & 0 MD replicas left.
+      controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+        Lister: input.BootstrapClusterProxy.GetClient(),
+        ClusterName: cluster.Name,
+        Namespace: cluster.Namespace,
+      })
+      g.Expect(controlPlaneMachines).To(HaveLen(1))
+
+      for _, md := range machineDeployments {
+        machines := framework.GetMachinesByMachineDeployments(ctx, framework.GetMachinesByMachineDeploymentsInput{
+          Lister: input.BootstrapClusterProxy.GetClient(),
+          ClusterName: cluster.Name,
+          Namespace: cluster.Namespace,
+          MachineDeployment: *md,
+        })
+        g.Expect(machines).To(BeEmpty())
+      }
+    }, nodeDrainTimeoutKCPInterval...).Should(Succeed())
+
     By("PASSED!")
   })
@@ -173,6 +416,18 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeo
   })
 }
+func verifyPodEvictedAndSucceeded(g Gomega, pod *corev1.Pod) {
+  g.Expect(pod.Status.Phase).To(Equal(corev1.PodSucceeded))
+  podEvicted := false
+  for _, c := range pod.Status.Conditions {
+    if c.Type == corev1.DisruptionTarget && c.Reason == "EvictionByEvictionAPI" && c.Status == corev1.ConditionTrue {
+      podEvicted = true
+      break
+    }
+  }
+  g.Expect(podEvicted).To(BeTrue(), "Expected Pod to be evicted")
+}
+
 func getDrainAndDeleteInterval(deleteInterval []interface{}, drainTimeout *metav1.Duration, replicas int) []interface{} {
   deleteTimeout, err := time.ParseDuration(deleteInterval[0].(string))
   Expect(err).ToNot(HaveOccurred())
diff --git a/test/e2e/node_drain_timeout_test.go b/test/e2e/node_drain_timeout_test.go
index 408b9d44e0c5..882c9621ff5c 100644
--- a/test/e2e/node_drain_timeout_test.go
+++ b/test/e2e/node_drain_timeout_test.go
@@ -32,6 +32,7 @@ var _ = Describe("When testing node drain timeout", func() {
       BootstrapClusterProxy: bootstrapClusterProxy,
       ArtifactFolder: artifactFolder,
       SkipCleanup: skipCleanup,
+      Flavor: ptr.To("topology"),
       InfrastructureProvider: ptr.To("docker"),
     }
   })
diff --git a/test/framework/deployment_helpers.go b/test/framework/deployment_helpers.go
index a2b0c8c141a8..93d4d7186efa 100644
--- a/test/framework/deployment_helpers.go
+++ b/test/framework/deployment_helpers.go
@@ -42,16 +42,14 @@ import (
   "k8s.io/apimachinery/pkg/labels"
   kerrors "k8s.io/apimachinery/pkg/util/errors"
   "k8s.io/apimachinery/pkg/util/intstr"
-  utilversion "k8s.io/apimachinery/pkg/util/version"
   "k8s.io/apimachinery/pkg/util/wait"
-  "k8s.io/apimachinery/pkg/version"
   "k8s.io/client-go/kubernetes"
   "k8s.io/client-go/tools/cache"
   "k8s.io/klog/v2"
-  "k8s.io/utils/ptr"
   toolscache "sigs.k8s.io/controller-runtime/pkg/cache"
   "sigs.k8s.io/controller-runtime/pkg/client"
+  clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
   controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
   . "sigs.k8s.io/cluster-api/test/framework/ginkgoextensions"
   "sigs.k8s.io/cluster-api/test/framework/internal/log"
@@ -493,115 +491,200 @@ func WaitForDNSUpgrade(ctx context.Context, input WaitForDNSUpgradeInput, interv
 type DeployUnevictablePodInput struct {
   WorkloadClusterProxy ClusterProxy
   ControlPlane *controlplanev1.KubeadmControlPlane
+  MachineDeployment *clusterv1.MachineDeployment
   DeploymentName string
   Namespace string
+  NodeSelector map[string]string
   WaitForDeploymentAvailableInterval []interface{}
 }
+// DeployUnevictablePod will deploy a Deployment on a ControlPlane or MachineDeployment.
+// It will deploy one Pod replica to each Machine and then deploy a PDB to ensure none of the Pods can be evicted.
 func DeployUnevictablePod(ctx context.Context, input DeployUnevictablePodInput) {
   Expect(input.DeploymentName).ToNot(BeNil(), "Need a deployment name in DeployUnevictablePod")
   Expect(input.Namespace).ToNot(BeNil(), "Need a namespace in DeployUnevictablePod")
   Expect(input.WorkloadClusterProxy).ToNot(BeNil(), "Need a workloadClusterProxy in DeployUnevictablePod")
+  Expect((input.MachineDeployment == nil && input.ControlPlane != nil) ||
+    (input.MachineDeployment != nil && input.ControlPlane == nil)).To(BeTrue(), "Either MachineDeployment or ControlPlane must be set in DeployUnevictablePod")
   EnsureNamespace(ctx, input.WorkloadClusterProxy.GetClient(), input.Namespace)
-  workloadDeployment := &appsv1.Deployment{
+  workloadDeployment := generateDeployment(generateDeploymentInput{
+    ControlPlane: input.ControlPlane,
+    MachineDeployment: input.MachineDeployment,
+    Name: input.DeploymentName,
+    Namespace: input.Namespace,
+    NodeSelector: input.NodeSelector,
+  })
+
+  workloadClient := input.WorkloadClusterProxy.GetClientSet()
+
+  AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
+    Namespace: input.Namespace,
+    ClientSet: workloadClient,
+    Deployment: workloadDeployment,
+  })
+
+  budget := &policyv1.PodDisruptionBudget{
     ObjectMeta: metav1.ObjectMeta{
       Name: input.DeploymentName,
       Namespace: input.Namespace,
     },
+    Spec: policyv1.PodDisruptionBudgetSpec{
+      Selector: &metav1.LabelSelector{
+        MatchLabels: map[string]string{
+          "app": "nonstop",
+          "deployment": input.DeploymentName,
+        },
+      },
+      // Setting MaxUnavailable to 0 means no Pods can be evicted / unavailable.
+      MaxUnavailable: &intstr.IntOrString{
+        Type: intstr.Int,
+        IntVal: 0,
+      },
+    },
+  }
+
+  AddPodDisruptionBudget(ctx, AddPodDisruptionBudgetInput{
+    Namespace: input.Namespace,
+    ClientSet: workloadClient,
+    Budget: budget,
+  })
+
+  WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
+    Getter: input.WorkloadClusterProxy.GetClient(),
+    Deployment: workloadDeployment,
+  }, input.WaitForDeploymentAvailableInterval...)
+}
+
+type DeployEvictablePodInput struct {
+  WorkloadClusterProxy ClusterProxy
+  ControlPlane *controlplanev1.KubeadmControlPlane
+  MachineDeployment *clusterv1.MachineDeployment
+  DeploymentName string
+  Namespace string
+  NodeSelector map[string]string
+
+  ModifyDeployment func(deployment *appsv1.Deployment)
+
+  WaitForDeploymentAvailableInterval []interface{}
+}
+
+// DeployEvictablePod will deploy a Deployment on a ControlPlane or MachineDeployment.
+// It will deploy one Pod replica to each Machine.
+func DeployEvictablePod(ctx context.Context, input DeployEvictablePodInput) {
+  Expect(input.DeploymentName).ToNot(BeNil(), "Need a deployment name in DeployUnevictablePod")
+  Expect(input.Namespace).ToNot(BeNil(), "Need a namespace in DeployUnevictablePod")
+  Expect(input.WorkloadClusterProxy).ToNot(BeNil(), "Need a workloadClusterProxy in DeployUnevictablePod")
+  Expect((input.MachineDeployment == nil && input.ControlPlane != nil) ||
+    (input.MachineDeployment != nil && input.ControlPlane == nil)).To(BeTrue(), "Either MachineDeployment or ControlPlane must be set in DeployUnevictablePod")
+
+  EnsureNamespace(ctx, input.WorkloadClusterProxy.GetClient(), input.Namespace)
+
+  workloadDeployment := generateDeployment(generateDeploymentInput{
+    ControlPlane: input.ControlPlane,
+    MachineDeployment: input.MachineDeployment,
+    Name: input.DeploymentName,
+    Namespace: input.Namespace,
+    NodeSelector: input.NodeSelector,
+  })
+
+  input.ModifyDeployment(workloadDeployment)
+
+  workloadClient := input.WorkloadClusterProxy.GetClientSet()
+
+  AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
+    Namespace: input.Namespace,
+    ClientSet: workloadClient,
+    Deployment: workloadDeployment,
+  })
+
+  WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
+    Getter: input.WorkloadClusterProxy.GetClient(),
+    Deployment: workloadDeployment,
+  }, input.WaitForDeploymentAvailableInterval...)
+}
+
+type generateDeploymentInput struct {
+  ControlPlane *controlplanev1.KubeadmControlPlane
+  MachineDeployment *clusterv1.MachineDeployment
+  Name string
+  Namespace string
+  NodeSelector map[string]string
+}
+
+func generateDeployment(input generateDeploymentInput) *appsv1.Deployment {
+  workloadDeployment := &appsv1.Deployment{
+    ObjectMeta: metav1.ObjectMeta{
+      Name: input.Name,
+      Namespace: input.Namespace,
+    },
     Spec: appsv1.DeploymentSpec{
-      Replicas: ptr.To[int32](4),
       Selector: &metav1.LabelSelector{
         MatchLabels: map[string]string{
-          "app": "nonstop",
+          "app":        "nonstop",
+          "deployment": input.Name,
        },
      },
      Template: corev1.PodTemplateSpec{
        ObjectMeta: metav1.ObjectMeta{
          Labels: map[string]string{
-            "app": "nonstop",
+            "app":        "nonstop",
+            "deployment": input.Name,
          },
        },
        Spec: corev1.PodSpec{
          Containers: []corev1.Container{
            {
-              Name:  "web",
+              Name:  "main",
              Image: "registry.k8s.io/pause:3.10",
            },
          },
+          Affinity: &corev1.Affinity{
+            // Make sure only 1 Pod of this Deployment can run on the same Node.
+            PodAntiAffinity: &corev1.PodAntiAffinity{
+              RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
+                {
+                  LabelSelector: &metav1.LabelSelector{
+                    MatchExpressions: []metav1.LabelSelectorRequirement{
+                      {
+                        Key: "deployment",
+                        Operator: "In",
+                        Values: []string{input.Name},
+                      },
+                    },
+                  },
+                  TopologyKey: "kubernetes.io/hostname",
+                },
+              },
+            },
+          },
        },
      },
    },
  }
-  workloadClient := input.WorkloadClusterProxy.GetClientSet()
  if input.ControlPlane != nil {
-    var serverVersion *version.Info
-    Eventually(func() error {
-      var err error
-      serverVersion, err = workloadClient.ServerVersion()
-      return err
-    }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "failed to get server version")
-
-    // Use the control-plane label for Kubernetes version >= v1.20.0.
-    if utilversion.MustParseGeneric(serverVersion.String()).AtLeast(utilversion.MustParseGeneric("v1.20.0")) {
-      workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleControlPlane: ""}
-    } else {
-      workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleOldControlPlane: ""}
-    }
-
+    workloadDeployment.Spec.Template.Spec.NodeSelector = map[string]string{nodeRoleControlPlane: ""}
    workloadDeployment.Spec.Template.Spec.Tolerations = []corev1.Toleration{
-      {
-        Key: nodeRoleOldControlPlane,
-        Effect: "NoSchedule",
-      },
      {
        Key: nodeRoleControlPlane,
        Effect: "NoSchedule",
      },
    }
+    workloadDeployment.Spec.Replicas = input.ControlPlane.Spec.Replicas
  }
-  AddDeploymentToWorkloadCluster(ctx, AddDeploymentToWorkloadClusterInput{
-    Namespace: input.Namespace,
-    ClientSet: workloadClient,
-    Deployment: workloadDeployment,
-  })
-
-  budget := &policyv1.PodDisruptionBudget{
-    TypeMeta: metav1.TypeMeta{
-      Kind: "PodDisruptionBudget",
-      APIVersion: "policy/v1",
-    },
-    ObjectMeta: metav1.ObjectMeta{
-      Name: input.DeploymentName,
-      Namespace: input.Namespace,
-    },
-    Spec: policyv1.PodDisruptionBudgetSpec{
-      Selector: &metav1.LabelSelector{
-        MatchLabels: map[string]string{
-          "app": "nonstop",
-        },
-      },
-      MaxUnavailable: &intstr.IntOrString{
-        Type: intstr.Int,
-        IntVal: 1,
-        StrVal: "1",
-      },
+  if input.MachineDeployment != nil {
+    workloadDeployment.Spec.Replicas = input.MachineDeployment.Spec.Replicas
  }
-  AddPodDisruptionBudget(ctx, AddPodDisruptionBudgetInput{
-    Namespace: input.Namespace,
-    ClientSet: workloadClient,
-    Budget: budget,
-  })
+  // Note: If set, the NodeSelector field overwrites the NodeSelector we set above for control plane nodes.
+  if input.NodeSelector != nil {
+    workloadDeployment.Spec.Template.Spec.NodeSelector = input.NodeSelector
+  }
-  WaitForDeploymentsAvailable(ctx, WaitForDeploymentsAvailableInput{
-    Getter: input.WorkloadClusterProxy.GetClient(),
-    Deployment: workloadDeployment,
-  }, input.WaitForDeploymentAvailableInterval...)
+  return workloadDeployment
 }
 type AddDeploymentToWorkloadClusterInput struct {
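For reviewers, a hedged usage sketch of the reworked helpers (not part of the patch): this is roughly how NodeDrainTimeoutSpec combines them for one MachineDeployment — an unevictable Deployment backed by a PDB with MaxUnavailable=0, plus an evictable Deployment whose Pods carry a finalizer so the drain stays blocked even after a successful eviction. `md`, `workloadClusterProxy` and `e2eConfig` are assumed to come from the surrounding test; the Deployment names are made up.

```go
// Hedged sketch, not part of the patch: assumes it runs inside a test where
// ctx, md, workloadClusterProxy and e2eConfig already exist.
framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
	WorkloadClusterProxy: workloadClusterProxy,
	MachineDeployment:    md,                        // exactly one of MachineDeployment or ControlPlane must be set
	DeploymentName:       "unevictable-pod-md-abc",  // hypothetical name
	Namespace:            "unevictable-workload",
	// Schedule the Pods onto the Nodes owned by this MachineDeployment
	// (the label is applied to the Machines earlier in the spec).
	NodeSelector:                       map[string]string{"owner.node.cluster.x-k8s.io": "MachineDeployment-" + md.Name},
	WaitForDeploymentAvailableInterval: e2eConfig.GetIntervals("node-drain", "wait-deployment-available"),
})

framework.DeployEvictablePod(ctx, framework.DeployEvictablePodInput{
	WorkloadClusterProxy: workloadClusterProxy,
	MachineDeployment:    md,
	DeploymentName:       "evictable-pod-md-abc", // hypothetical name
	Namespace:            "evictable-workload",
	NodeSelector:         map[string]string{"owner.node.cluster.x-k8s.io": "MachineDeployment-" + md.Name},
	ModifyDeployment: func(deployment *appsv1.Deployment) {
		// The finalizer keeps evicted Pods around, so the drain stays blocked
		// until the test removes it again.
		deployment.Spec.Template.ObjectMeta.Finalizers = []string{"test.cluster.x-k8s.io/block"}
	},
	WaitForDeploymentAvailableInterval: e2eConfig.GetIntervals("node-drain", "wait-deployment-available"),
})
```

Because generateDeployment pins Spec.Replicas to the ControlPlane/MachineDeployment replica count and adds a required pod anti-affinity on kubernetes.io/hostname, each Node ends up with exactly one Pod of each Deployment, which is what lets the MaxUnavailable=0 PDB block the drain on every Node.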