Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-1.9] 🌱 Handle "waiting for completion" in KCP, MD, MS and Machine conditions #11825

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,9 @@ func aggregateStaleMachines(machines collections.Machines) string {
if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
delayReasons.Insert("Pod eviction errors")
}
if strings.Contains(deletingCondition.Message, "waiting for completion") {
delayReasons.Insert("Pods not completed yet")
}
}
}
}
Expand All @@ -889,7 +892,7 @@ func aggregateStaleMachines(machines collections.Machines) string {
message += "in deletion since more than 15m"
if len(delayReasons) > 0 {
reasonList := []string{}
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors", "Pods not completed yet"} {
if delayReasons.Has(r) {
reasonList = append(reasonList, r)
}
Expand Down
25 changes: 23 additions & 2 deletions controlplane/kubeadm/internal/controllers/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,28 @@ func Test_setScalingDownCondition(t *testing.T) {
Status: controlplanev1.KubeadmControlPlaneStatus{Replicas: 3},
},
Machines: collections.FromMachines(
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1", DeletionTimestamp: ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)})}},
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1", DeletionTimestamp: ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)})},
Status: clusterv1.MachineStatus{
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason,
Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z):
* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
* Pod pod-9-wait-completed: waiting for completion
After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
},
},
},
Deletion: &clusterv1.MachineDeletionStatus{
NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
},
},
},
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}},
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}},
),
Expand All @@ -477,7 +498,7 @@ func Test_setScalingDownCondition(t *testing.T) {
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason,
Message: "Scaling down from 3 to 1 replicas is blocked because:\n" +
"* Machine m1 is in deletion since more than 15m",
"* Machine m1 is in deletion since more than 15m, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors, Pods not completed yet",
},
},
{
Expand Down
3 changes: 3 additions & 0 deletions internal/controllers/machine/machine_controller_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,9 @@ func calculateDeletingConditionForSummary(machine *clusterv1.Machine) v1beta2con
if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
delayReasons = append(delayReasons, "Pod eviction errors")
}
if strings.Contains(deletingCondition.Message, "waiting for completion") {
delayReasons = append(delayReasons, "Pods not completed yet")
}
if len(delayReasons) > 0 {
msg += fmt.Sprintf(", delay likely due to %s", strings.Join(delayReasons, ", "))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1715,6 +1715,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
* Pod pod-9-wait-completed: waiting for completion
After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
},
},
Expand All @@ -1733,7 +1734,7 @@ After above Pods have been removed from the Node, the following Pods will be evi
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingV1Beta2Reason,
Message: "Machine deletion in progress since more than 15m, stage: DrainingNode, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors",
Message: "Machine deletion in progress since more than 15m, stage: DrainingNode, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors, Pods not completed yet",
},
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,9 @@ func aggregateStaleMachines(machines collections.Machines) string {
if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
delayReasons.Insert("Pod eviction errors")
}
if strings.Contains(deletingCondition.Message, "waiting for completion") {
delayReasons.Insert("Pods not completed yet")
}
}
}
}
Expand All @@ -605,7 +608,7 @@ func aggregateStaleMachines(machines collections.Machines) string {
message += "in deletion since more than 15m"
if len(delayReasons) > 0 {
reasonList := []string{}
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors", "Pods not completed yet"} {
if delayReasons.Has(r) {
reasonList = append(reasonList, r)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -648,15 +648,36 @@ func Test_setScalingDownCondition(t *testing.T) {
},
machines: []*clusterv1.Machine{
fakeMachine("m1"),
fakeMachine("stale-machine-1", withStaleDeletion()),
fakeMachine("stale-machine-1", withStaleDeletion(), func(m *clusterv1.Machine) {
m.Status = clusterv1.MachineStatus{
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason,
Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z):
* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
* Pod pod-9-wait-completed: waiting for completion
After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
},
},
},
Deletion: &clusterv1.MachineDeletionStatus{
NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
},
}
}),
},
getAndAdoptMachineSetsForDeploymentSucceeded: true,
expectCondition: metav1.Condition{
Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
Message: "Scaling down from 2 to 1 replicas\n" +
"* Machine stale-machine-1 is in deletion since more than 15m",
"* Machine stale-machine-1 is in deletion since more than 15m, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors, Pods not completed yet",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@ func aggregateStaleMachines(machines []*clusterv1.Machine) string {
if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
delayReasons.Insert("Pod eviction errors")
}
if strings.Contains(deletingCondition.Message, "waiting for completion") {
delayReasons.Insert("Pods not completed yet")
}
}
}
}
Expand All @@ -481,7 +484,7 @@ func aggregateStaleMachines(machines []*clusterv1.Machine) string {
message += "in deletion since more than 15m"
if len(delayReasons) > 0 {
reasonList := []string{}
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors", "Pods not completed yet"} {
if delayReasons.Has(r) {
reasonList = append(reasonList, r)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,28 @@ func Test_setScalingDownCondition(t *testing.T) {
name: "scaling down with 1 stale machine",
ms: machineSet1Replica,
machines: []*clusterv1.Machine{
fakeMachine("stale-machine-1", withStaleDeletionTimestamp()),
fakeMachine("stale-machine-1", withStaleDeletionTimestamp(), func(m *clusterv1.Machine) {
m.Status = clusterv1.MachineStatus{
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason,
Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z):
* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
* Pod pod-9-wait-completed: waiting for completion
After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
},
},
},
Deletion: &clusterv1.MachineDeletionStatus{
NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
},
}
}),
fakeMachine("machine-2"),
},
getAndAdoptMachinesForMachineSetSucceeded: true,
Expand All @@ -429,7 +450,7 @@ func Test_setScalingDownCondition(t *testing.T) {
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingDownV1Beta2Reason,
Message: "Scaling down from 2 to 1 replicas\n" +
"* Machine stale-machine-1 is in deletion since more than 15m",
"* Machine stale-machine-1 is in deletion since more than 15m, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors, Pods not completed yet",
},
},
{
Expand Down