From 78037648c860ac7e7454ba7b5ade4579e8b53c01 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Wed, 6 Nov 2024 13:50:28 +0100 Subject: [PATCH] Address comments --- .../api/v1beta1/v1beta2_condition_consts.go | 18 ++-- .../kubeadm/internal/control_plane.go | 4 +- .../kubeadm/internal/controllers/status.go | 86 ++++++++++--------- .../internal/controllers/status_test.go | 63 +++++++------- .../internal/workload_cluster_conditions.go | 42 +++++---- .../workload_cluster_conditions_test.go | 4 +- 6 files changed, 116 insertions(+), 101 deletions(-) diff --git a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go index ff97cdacc9af..5691cebba8b2 100644 --- a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go +++ b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go @@ -20,25 +20,33 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" // KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version. const ( - // KubeadmControlPlaneAvailableV1Beta2Condition is True if KubeadmControlPlane not delete, `CertificatesAvailable` is true, - // at least one Kubernetes API server, scheduler and controller manager control plane are healthy, + // KubeadmControlPlaneAvailableV1Beta2Condition is true if KubeadmControlPlane not deleted, `CertificatesAvailable` is true, + // at least one Machine with Kubernetes API server, scheduler and controller manager healthy, // and etcd has enough operational members to meet quorum requirements. + // More specifically, considering how kubeadm layouts components: + // - Kubernetes API server, scheduler and controller manager health is inferred by the status of + // the corresponding Pods hosted on each machine. 
+ // - In case of managed etcd, also a healthy etcd Pod and a healthy etcd member must exist on the same + machine with the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control + plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server, + scheduler and controller manager on the same machine will be impacted). + // - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped. KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition // KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the // etcd cluster hosted on KubeadmControlPlane controlled machines. KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason - // KubeadmControlPlaneAvailableV1Beta2Reason surfaces when a Deployment is available. + // KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available. KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason - // KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when a Deployment is not available. + // KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available. KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason ) // KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version. const ( - // KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept + // KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept // requests. This information is usually used as a signal for starting all the provisioning operations that // depend on a functional API server, but do not require a full HA control plane to exist. 
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized" diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 8ce2e710fae6..fc9a0cf6ee10 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -61,7 +61,9 @@ type ControlPlane struct { // EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below // comes from the same func. - // NOTE: Those info are computed on what we know, so we can reason about availability eve if with a certain degree of problems in the cluster + // NOTE: Those info are computed based on the info KCP was able to collect during inspection (e.g. if on a 3 CP + // control plane one etcd member is down, those info are based on the answer collected from two members only). + // NOTE: Those info are specifically designed for computing KCP's Available condition. EtcdMembers []*etcd.Member EtcdMembersAgreeOnMemberList bool EtcdMembersAgreeOnClusterID bool diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index cdc9a172d4d6..91209035eb0d 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -424,7 +424,7 @@ func setRemediatingCondition(ctx context.Context, kcp *controlplanev1.KubeadmCon }) } -func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList bool, etcdMembersAgreeOnClusterID bool, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) { +func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) { if 
!kcp.Status.Initialized { v1beta2conditions.Set(kcp, metav1.Condition{ Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, @@ -435,50 +435,52 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl return } - if etcdIsManaged && etcdMembers == nil { - v1beta2conditions.Set(kcp, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, - Status: metav1.ConditionUnknown, - Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason, - Message: "Failed to get etcd members", - }) - return - } + if etcdIsManaged { + if etcdMembers == nil { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason, + Message: "Failed to get etcd members", + }) + return + } - if etcdIsManaged && !etcdMembersAgreeOnMemberList { - v1beta2conditions.Set(kcp, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, - Status: metav1.ConditionFalse, - Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "At least one etcd member reports a list of etcd members different than the list reported by other members", - }) - return - } + if !etcdMembersAgreeOnMemberList { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a list of etcd members different than the list reported by other members", + }) + return + } - if etcdIsManaged && !etcdMembersAgreeOnClusterID { - v1beta2conditions.Set(kcp, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, - Status: metav1.ConditionFalse, - Reason: 
controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members", - }) - return - } + if !etcdMembersAgreeOnClusterID { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members", + }) + return + } - if etcdIsManaged && !etcdMembersAndMachinesAreMatching { - v1beta2conditions.Set(kcp, metav1.Condition{ - Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, - Status: metav1.ConditionFalse, - Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "The list of etcd members does not match the list of Machines and Nodes", - }) - return + if !etcdMembersAndMachinesAreMatching { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "The list of etcd members does not match the list of Machines and Nodes", + }) + return + } } // Determine control plane availability looking at machines conditions, which at this stage are // already surfacing status from etcd member and all control plane pods hosted on every machine. - // Note: we intentionally use the number of etcd members for determine the etcd quorum because - // etcd members could not match with machines, e.g. while provisioning a new machine. + // Note: we intentionally use the number of etcd members to determine the etcd quorum because + // etcd members might not match with machines, e.g. while provisioning a new machine. 
etcdQuorum := (len(etcdMembers) / 2.0) + 1 k8sControlPlaneHealthy := 0 etcdMembersHealthy := 0 @@ -537,16 +539,16 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl if etcdIsManaged && etcdMembersHealthy < etcdQuorum { switch etcdMembersHealthy { case 0: - messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required", etcdQuorum)) + messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum)) case 1: - messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required", etcdQuorum)) + messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum)) default: - messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required", etcdMembersHealthy, etcdQuorum)) + messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum)) } } if k8sControlPlaneHealthy < 1 { - messages = append(messages, "There are no healthy control plane instances, at least 1 required") + messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required") } v1beta2conditions.Set(kcp, metav1.Condition{ diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go index 02c60e251610..c35c2de697e5 100644 --- a/controlplane/kubeadm/internal/controllers/status_test.go +++ b/controlplane/kubeadm/internal/controllers/status_test.go @@ -608,8 +608,8 @@ func Test_setAvailableCondition(t *testing.T) { apiServerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} apiServerPodNotHealthy := metav1.Condition{Type: 
controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, Status: metav1.ConditionFalse} controllerManagerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} - schedulerPodManagerHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} - etcdPodManagerHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + schedulerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + etcdPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} etcdMemberHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionTrue} etcdMemberNotHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse} @@ -654,14 +654,13 @@ func Test_setAvailableCondition(t *testing.T) { }, Status: controlplanev1.KubeadmControlPlaneStatus{Initialized: true}, }, - EtcdMembers: []*etcd.Member{}, - EtcdMembersAgreeOnMemberList: false, + EtcdMembers: nil, }, expectCondition: metav1.Condition{ Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, - Status: metav1.ConditionFalse, - Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "At least one etcd member reports a list of etcd members different than the list reported by other members", + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason, + Message: "Failed to get etcd members", }, }, { @@ -712,7 +711,7 @@ func Test_setAvailableCondition(t 
*testing.T) { }, }, { - name: "Etcd members do not agree on cluster ID", + name: "Etcd members and machines list do not match", controlPlane: &internal.ControlPlane{ KCP: &controlplanev1.KubeadmControlPlane{ Spec: controlplanev1.KubeadmControlPlaneSpec{ @@ -748,7 +747,7 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, ), EtcdMembers: []*etcd.Member{}, EtcdMembersAgreeOnMemberList: true, @@ -773,9 +772,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberNotHealthy}}}}, + 
&clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberNotHealthy}}}}, ), EtcdMembers: []*etcd.Member{{}, {}, {}}, EtcdMembersAgreeOnMemberList: true, @@ -800,9 +799,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: 
clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, ), EtcdMembers: []*etcd.Member{{}, {}, {}}, EtcdMembersAgreeOnMemberList: true, @@ -830,7 +829,7 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, ), EtcdMembers: []*etcd.Member{}, EtcdMembersAgreeOnMemberList: true, @@ -856,7 +855,7 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, 
+ &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, ), EtcdMembers: []*etcd.Member{}, EtcdMembersAgreeOnMemberList: true, @@ -882,9 +881,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberNotHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberNotHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberNotHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: 
clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberNotHealthy}}}}, ), EtcdMembers: []*etcd.Member{{}, {}, {}}, EtcdMembersAgreeOnMemberList: true, @@ -895,7 +894,7 @@ func Test_setAvailableCondition(t *testing.T) { Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "There is 1 healthy etcd member, at least 2 required", + Message: "There is 1 healthy etcd member, at least 2 required for etcd quorum", }, }, { @@ -910,9 +909,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy, etcdPodManagerHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: 
metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, ), EtcdMembers: []*etcd.Member{{}, {}, {}}, EtcdMembersAgreeOnMemberList: true, @@ -923,7 +922,7 @@ func Test_setAvailableCondition(t *testing.T) { Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "There are no healthy control plane instances, at least 1 required", + Message: "There are no Machines with healthy control plane components, at least 1 required", }, }, { @@ -945,9 +944,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: 
&clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, ), EtcdMembers: nil, EtcdMembersAgreeOnMemberList: false, @@ -979,9 +978,9 @@ func Test_setAvailableCondition(t *testing.T) { }, }, Machines: collections.FromMachines( - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, - &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodManagerHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, 
controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, ), EtcdMembers: nil, EtcdMembersAgreeOnMemberList: false, @@ -992,7 +991,7 @@ func Test_setAvailableCondition(t *testing.T) { Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, - Message: "There are no healthy control plane instances, at least 1 required", + Message: "There are no Machines with healthy control plane components, at least 1 required", }, }, } diff --git a/controlplane/kubeadm/internal/workload_cluster_conditions.go b/controlplane/kubeadm/internal/workload_cluster_conditions.go index c255f3e6d6a7..0de593987725 100644 --- a/controlplane/kubeadm/internal/workload_cluster_conditions.go +++ b/controlplane/kubeadm/internal/workload_cluster_conditions.go @@ -139,8 +139,8 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name) if err != nil { - // Note. even if we fail reading the member list from one node/etcd members we don not turn EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false - // (those info are computed on what we know, so we can reason about availability eve if with a certain degree of problems in the cluster). + // Note. even if we fail reading the member list from one node/etcd members we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false + // (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster). 
continue } @@ -231,12 +231,17 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane } // Make sure that the list of etcd members and machines is consistent. - // NOTE: we always compare members and machines with what we know, but we surface membersAndMachinesAreMatching - // for the Available condition only if all the etcd members agree on member list and cluster id. - membersAndMachinesAreMatching := compareMachinesAndMembers(controlPlane, controlPlaneNodes, controlPlane.EtcdMembers, &kcpErrors) + // NOTE: Members/Machines consistency is computed based on the info KCP was able to collect during inspection (e.g. if on a 3 CP + // control plane one etcd member is down, the comparison is based on the answer collected from two members only). + // NOTE: We surface the result of compareMachinesAndMembers for the Available condition only if all the etcd members agree + // on member list and cluster id (if not, we consider the list of members not reliable). + membersAndMachinesAreMatching, membersAndMachinesCompareErrors := compareMachinesAndMembers(controlPlane, controlPlaneNodes, controlPlane.EtcdMembers) if controlPlane.EtcdMembersAgreeOnMemberList && controlPlane.EtcdMembersAgreeOnClusterID { controlPlane.EtcdMembersAndMachinesAreMatching = membersAndMachinesAreMatching + } else { + controlPlane.EtcdMembersAndMachinesAreMatching = false } + kcpErrors = append(kcpErrors, membersAndMachinesCompareErrors...) 
 // Aggregate components error from machines at KCP level aggregateConditionsFromMachinesToKCP(aggregateConditionsFromMachinesToKCPInput{ @@ -309,8 +314,9 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1 return currentMembers, nil } -func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeList, members []*etcd.Member, kcpErrors *[]string) bool { +func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeList, members []*etcd.Member) (bool, []string) { membersAndMachinesAreMatching := true + var kcpErrors []string // If it failed to get members, consider the check failed in case there is at least a machine already provisioned // (tolerate if we fail getting members when the cluster is provisioning the first machine). @@ -318,7 +324,7 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeLis if len(controlPlane.Machines.Filter(collections.HasNode())) > 0 { membersAndMachinesAreMatching = false } - return membersAndMachinesAreMatching + return membersAndMachinesAreMatching, nil } // Check Machine -> Etcd member. @@ -345,20 +351,20 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeLis Message: fmt.Sprintf("Etcd doesn't have an etcd member for Node %s", machine.Status.NodeRef.Name), }) - // Instead, surface there is a machine without etcd member on kcp for the Available condition + // Instead, surface there is a machine without etcd member on kcp's Available condition // only if the machine is not deleting and the node exists by more than two minutes - // (this prevents the condition flick during scale up operations). + // (this prevents the condition to flick during scale up operations). // Note: Two minutes is an arbitrary interval where we expect the system to detect the new etcd member on the machine. 
if machine.DeletionTimestamp.IsZero() { - newNode := false + oldNode := true if nodes != nil { for _, node := range nodes.Items { - if machine.Status.NodeRef.Name == node.Name && time.Since(node.CreationTimestamp.Time) < 2*time.Minute { - newNode = true + if machine.Status.NodeRef.Name == node.Name && time.Since(node.CreationTimestamp.Time) <= 2*time.Minute { + oldNode = false } } } - if !newNode { + if oldNode { membersAndMachinesAreMatching = false } } @@ -385,18 +391,16 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeLis if name == "" { name = fmt.Sprintf("%d (Name not yet assigned)", member.ID) } - if kcpErrors != nil { - *kcpErrors = append(*kcpErrors, fmt.Sprintf("Etcd member %s does not have a corresponding Machine", name)) - } + kcpErrors = append(kcpErrors, fmt.Sprintf("Etcd member %s does not have a corresponding Machine", name)) - // Instead, surface there is an etcd member without a machine on kcp for the Available condition - // only if there are no provisioning machines (this prevents the condition flick during scale up operations). + // Instead, surface there is an etcd member without a machine on kcp's Available condition + // only if there are no provisioning machines (this prevents the condition to flick during scale up operations). 
if !hasProvisioningMachine { membersAndMachinesAreMatching = false } } } - return membersAndMachinesAreMatching + return membersAndMachinesAreMatching, kcpErrors } // UpdateStaticPodConditions is responsible for updating machine conditions reflecting the status of all the control plane diff --git a/controlplane/kubeadm/internal/workload_cluster_conditions_test.go b/controlplane/kubeadm/internal/workload_cluster_conditions_test.go index 3a35042c0db5..d9ff6b6a676e 100644 --- a/controlplane/kubeadm/internal/workload_cluster_conditions_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_conditions_test.go @@ -1819,10 +1819,10 @@ func TestCompareMachinesAndMembers(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewWithT(t) - got := compareMachinesAndMembers(tt.controlPlane, tt.nodes, tt.members, &tt.kcpErrors) + got, gotErrors := compareMachinesAndMembers(tt.controlPlane, tt.nodes, tt.members) g.Expect(got).To(Equal(tt.expectMembersAndMachinesAreMatching)) - g.Expect(tt.kcpErrors).To(Equal(tt.expectKCPErrors)) + g.Expect(gotErrors).To(Equal(tt.expectKCPErrors)) }) } }