diff --git a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go index d3049763ae92..789cd25bb849 100644 --- a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go +++ b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go @@ -20,14 +20,32 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" // KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version. const ( - // KubeadmControlPlaneAvailableV1Beta2Condition is True if the control plane can be reached, EtcdClusterHealthy is true, - // and CertificatesAvailable is true. + // KubeadmControlPlaneAvailableV1Beta2Condition is true if KubeadmControlPlane is not deleted, `CertificatesAvailable` is true, + // at least one Machine with healthy control plane components, and etcd has enough operational members to meet quorum requirements. + // More specifically, considering how kubeadm layouts components: + // - Kubernetes API server, scheduler and controller manager health is inferred by the status of + // the corresponding Pods hosted on each machine. + // - In case of managed etcd, also a healthy etcd Pod and a healthy etcd member must exist on the same + // machine with the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control + // plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server, + // scheduler and controller manager on the same machine will be impacted). + // - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped. KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition + + // KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the + // etcd cluster hosted on KubeadmControlPlane controlled machines. 
+ KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason + + // KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available. + KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason + + // KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available. + KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason ) // KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version. const ( - // KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept + // KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept // requests. This information is usually used as a signal for starting all the provisioning operations that // depend on a functional API server, but do not require a full HA control plane to exist. 
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized" diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 7d624645305a..fa18158d94e9 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -30,6 +30,7 @@ import ( bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controllers/external" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/failuredomains" "sigs.k8s.io/cluster-api/util/patch" @@ -58,6 +59,16 @@ type ControlPlane struct { KubeadmConfigs map[string]*bootstrapv1.KubeadmConfig InfraResources map[string]*unstructured.Unstructured + // EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below + // comes from the same func. + // NOTE: Those info are computed based on the info KCP was able to collect during inspection (e.g. if on a 3 CP + // control plane one etcd member is down, those info are based on the answer collected from two members only). + // NOTE: Those info are specifically designed for computing KCP's Available condition. 
+ EtcdMembers []*etcd.Member + EtcdMembersAgreeOnMemberList bool + EtcdMembersAgreeOnClusterID bool + EtcdMembersAndMachinesAreMatching bool + managementCluster ManagementCluster workloadCluster WorkloadCluster diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index 2327d19f2cc5..484d6bb90b5c 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -31,6 +31,7 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" @@ -168,7 +169,7 @@ func (r *KubeadmControlPlaneReconciler) updateV1Beta2Status(ctx context.Context, setMachinesUpToDateCondition(ctx, controlPlane.KCP, controlPlane.Machines) setRemediatingCondition(ctx, controlPlane.KCP, controlPlane.MachinesToBeRemediatedByKCP(), controlPlane.UnhealthyMachines()) setDeletingCondition(ctx, controlPlane.KCP, controlPlane.DeletingReason, controlPlane.DeletingMessage) - // TODO: Available + setAvailableCondition(ctx, controlPlane.KCP, controlPlane.IsEtcdManaged(), controlPlane.EtcdMembers, controlPlane.EtcdMembersAgreeOnMemberList, controlPlane.EtcdMembersAgreeOnClusterID, controlPlane.EtcdMembersAndMachinesAreMatching, controlPlane.Machines) } func setReplicas(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines) { @@ -441,6 +442,141 @@ func setDeletingCondition(_ context.Context, kcp *controlplanev1.KubeadmControlP }) } +func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, 
etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) { + if !kcp.Status.Initialized { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "Control plane not yet initialized", + }) + return + } + + if etcdIsManaged { + if etcdMembers == nil { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason, + Message: "Failed to get etcd members", + }) + return + } + + if !etcdMembersAgreeOnMemberList { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a list of etcd members different than the list reported by other members", + }) + return + } + + if !etcdMembersAgreeOnClusterID { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members", + }) + return + } + + if !etcdMembersAndMachinesAreMatching { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "The list of etcd members does not match the list of Machines and Nodes", + }) + return + } + } + + // Determine control plane availability looking at machines conditions, 
which at this stage are + // already surfacing status from etcd member and all control plane pods hosted on every machine. + // Note: we intentionally use the number of etcd members to determine the etcd quorum because + // etcd members might not match with machines, e.g. while provisioning a new machine. + etcdQuorum := (len(etcdMembers) / 2.0) + 1 + k8sControlPlaneHealthy := 0 + etcdMembersHealthy := 0 + for _, machine := range machines { + // if external etcd, only look at the status of the K8s control plane components on this machine. + if !etcdIsManaged { + if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) { + k8sControlPlaneHealthy++ + } + continue + } + + // Otherwise, etcd is managed. + // In this case, when looking at the k8s control plane we should consider how kubeadm layouts control plane components, + // and more specifically: + // - API server on one machine only connect to the local etcd member + // - ControllerManager and scheduler on a machine connect to the local API server (not to the control plane endpoint) + // As a consequence, we consider the K8s control plane on this machine healthy only if everything is healthy. 
+ + if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) { + etcdMembersHealthy++ + } + + if v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition) && + v1beta2conditions.IsTrue(machine, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) { + k8sControlPlaneHealthy++ + } + } + + if kcp.DeletionTimestamp.IsZero() && + (!etcdIsManaged || etcdMembersHealthy >= etcdQuorum) && + k8sControlPlaneHealthy >= 1 && + v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) { + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason, + }) + return + } + + messages := []string{} + if !kcp.DeletionTimestamp.IsZero() { + messages = append(messages, "Control plane metadata.deletionTimestamp is set") + } + + if !v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition) { + messages = append(messages, "Control plane certificates are not available") + } + + if etcdIsManaged && etcdMembersHealthy < etcdQuorum { + switch etcdMembersHealthy { + case 0: + messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum)) + case 1: + messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum)) + default: + messages = append(messages, 
fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum)) + } + } + + if k8sControlPlaneHealthy < 1 { + messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required") + } + + v1beta2conditions.Set(kcp, metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: strings.Join(messages, ";"), + }) +} + func aggregateStaleMachines(machines collections.Machines) string { if len(machines) == 0 { return "" diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go index aee7b88a53d4..066baef46335 100644 --- a/controlplane/kubeadm/internal/controllers/status_test.go +++ b/controlplane/kubeadm/internal/controllers/status_test.go @@ -29,8 +29,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal" + "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd" controlplanev1webhooks "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/webhooks" "sigs.k8s.io/cluster-api/util/collections" "sigs.k8s.io/cluster-api/util/conditions" @@ -436,7 +438,7 @@ func Test_setScalingDownCondition(t *testing.T) { } } -func Test_setMachinesReadyAndMachinesUpToDate(t *testing.T) { +func Test_setMachinesReadyAndMachinesUpToDateConditions(t *testing.T) { readyTrue := metav1.Condition{Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionTrue} readyFalse := metav1.Condition{Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: "SomeReason", Message: "NotReady"} @@ -656,6 +658,413 @@ 
func TestDeletingCondition(t *testing.T) { } } +func Test_setAvailableCondition(t *testing.T) { + certificatesReady := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition, Status: metav1.ConditionTrue} + certificatesNotReady := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition, Status: metav1.ConditionFalse} + + apiServerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + apiServerPodNotHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition, Status: metav1.ConditionFalse} + controllerManagerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + schedulerPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + etcdPodHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + + etcdMemberHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionTrue} + etcdMemberNotHealthy := metav1.Condition{Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse} + + tests := []struct { + name string + controlPlane *internal.ControlPlane + expectCondition metav1.Condition + }{ + { + name: "Kcp not yet initialized", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{Local: &bootstrapv1.LocalEtcd{}}, + }, + }, + }, + }, + EtcdMembers: 
[]*etcd.Member{}, + EtcdMembersAgreeOnMemberList: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "Control plane not yet initialized", + }, + }, + { + name: "Failed to get etcd members", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{Local: &bootstrapv1.LocalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{Initialized: true}, + }, + EtcdMembers: nil, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionUnknown, + Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason, + Message: "Failed to get etcd members", + }, + }, + { + name: "Etcd members do not agree on member list", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{Local: &bootstrapv1.LocalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{Initialized: true}, + }, + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a list of etcd members different than the list reported by other members", + }, + }, + { + name: "Etcd members do not agree on cluster ID", + 
controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{Local: &bootstrapv1.LocalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{Initialized: true}, + }, + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members", + }, + }, + { + name: "Etcd members and machines list do not match", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{Local: &bootstrapv1.LocalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{Initialized: true}, + }, + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "The list of etcd members does not match the list of Machines and Nodes", + }, + }, + { + name: "KCP is available", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: 
&controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason, + }, + }, + { + name: "One not healthy etcd members, but within quorum", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, 
etcdPodHealthy, etcdMemberNotHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{{}, {}, {}}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason, + }, + }, + { + name: "Two not healthy k8s control plane, but one working", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{{}, {}, {}}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: 
controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason, + }, + }, + { + name: "KCP is deleting", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + DeletionTimestamp: ptr.To(metav1.Now()), + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "Control plane metadata.deletionTimestamp is set", + }, + }, + { + name: "Certificates are not available", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesNotReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + 
EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "Control plane certificates are not available", + }, + }, + { + name: "Not enough healthy etcd members", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberNotHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberNotHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{{}, {}, {}}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "There is 1 healthy etcd member, at least 2 required for etcd quorum", 
+ }, + }, + { + name: "Not enough healthy K8s control planes", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy, etcdPodHealthy, etcdMemberHealthy}}}}, + ), + EtcdMembers: []*etcd.Member{{}, {}, {}}, + EtcdMembersAgreeOnMemberList: true, + EtcdMembersAgreeOnClusterID: true, + EtcdMembersAndMachinesAreMatching: true, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "There are no Machines with healthy control plane components, at least 1 required", + }, + }, + { + name: "External etcd, at least one K8s control plane", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: 
&bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{External: &bootstrapv1.ExternalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: []metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + ), + EtcdMembers: nil, + EtcdMembersAgreeOnMemberList: false, + EtcdMembersAgreeOnClusterID: false, + EtcdMembersAndMachinesAreMatching: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason, + }, + }, + { + name: "External etcd, not enough healthy K8s control planes", + controlPlane: &internal.ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{ + Spec: controlplanev1.KubeadmControlPlaneSpec{ + KubeadmConfigSpec: bootstrapv1.KubeadmConfigSpec{ + ClusterConfiguration: &bootstrapv1.ClusterConfiguration{ + Etcd: bootstrapv1.Etcd{External: &bootstrapv1.ExternalEtcd{}}, + }, + }, + }, + Status: controlplanev1.KubeadmControlPlaneStatus{ + Initialized: true, + V1Beta2: &controlplanev1.KubeadmControlPlaneV1Beta2Status{ + Conditions: 
[]metav1.Condition{certificatesReady}, + }, + }, + }, + Machines: collections.FromMachines( + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m1"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m2"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "m3"}, Status: clusterv1.MachineStatus{V1Beta2: &clusterv1.MachineV1Beta2Status{Conditions: []metav1.Condition{apiServerPodNotHealthy, controllerManagerPodHealthy, schedulerPodHealthy}}}}, + ), + EtcdMembers: nil, + EtcdMembersAgreeOnMemberList: false, + EtcdMembersAgreeOnClusterID: false, + EtcdMembersAndMachinesAreMatching: false, + }, + expectCondition: metav1.Condition{ + Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason, + Message: "There are no Machines with healthy control plane components, at least 1 required", + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + setAvailableCondition(ctx, tt.controlPlane.KCP, tt.controlPlane.IsEtcdManaged(), tt.controlPlane.EtcdMembers, tt.controlPlane.EtcdMembersAgreeOnMemberList, tt.controlPlane.EtcdMembersAgreeOnClusterID, tt.controlPlane.EtcdMembersAndMachinesAreMatching, tt.controlPlane.Machines) + + availableCondition := v1beta2conditions.Get(tt.controlPlane.KCP, controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition) + g.Expect(availableCondition).ToNot(BeNil()) + g.Expect(*availableCondition).To(v1beta2conditions.MatchCondition(tt.expectCondition, v1beta2conditions.IgnoreLastTransitionTime(true))) + }) + } +} + 
func TestKubeadmControlPlaneReconciler_updateStatusNoMachines(t *testing.T) { g := NewWithT(t) diff --git a/controlplane/kubeadm/internal/workload_cluster_conditions.go b/controlplane/kubeadm/internal/workload_cluster_conditions.go index fabc32f0a35d..8f3910a82aae 100644 --- a/controlplane/kubeadm/internal/workload_cluster_conditions.go +++ b/controlplane/kubeadm/internal/workload_cluster_conditions.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strings" + "time" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -63,7 +64,6 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) { // NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd // as ultimate source of truth for the list of members and for their health. - // TODO: Integrate this with clustercache / handle the grace period controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { for _, m := range controlPlane.Machines { @@ -94,8 +94,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane kcpErrors []string // clusterID is used to store and compare the etcd's cluster id. clusterID *uint64 - // members is used to store the list of etcd members and compare with all the other nodes in the cluster. - members []*etcd.Member ) provisioningMachines := controlPlane.Machines.Filter(collections.Not(collections.HasNode())) @@ -141,22 +139,29 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name) if err != nil { + // Note. 
even if we fail reading the member list from one node/etcd members we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false + // (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster). continue } // Check if the list of members IDs reported is the same as all other members. // NOTE: the first member reporting this information is the baseline for this information. - if members == nil { - members = currentMembers + // Also, if this is the first node we are reading from let's + // assume all the members agree on member list and cluster id. + if controlPlane.EtcdMembers == nil { + controlPlane.EtcdMembers = currentMembers + controlPlane.EtcdMembersAgreeOnMemberList = true + controlPlane.EtcdMembersAgreeOnClusterID = true } - if !etcdutil.MemberEqual(members, currentMembers) { - conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports the cluster is composed by members %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(members)) + if !etcdutil.MemberEqual(controlPlane.EtcdMembers, currentMembers) { + controlPlane.EtcdMembersAgreeOnMemberList = false + conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports the cluster is composed by members %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)) v1beta2conditions.Set(machine, metav1.Condition{ Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, - 
Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(members)), + Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)), }) continue } @@ -204,6 +209,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane clusterID = &member.ClusterID } if *clusterID != member.ClusterID { + controlPlane.EtcdMembersAgreeOnClusterID = false conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member has cluster ID %d, but all previously seen etcd members have cluster ID %d", member.ClusterID, *clusterID) v1beta2conditions.Set(machine, metav1.Condition{ @@ -225,7 +231,17 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane } // Make sure that the list of etcd members and machines is consistent. - kcpErrors = compareMachinesAndMembers(controlPlane, members, kcpErrors) + // NOTE: Members/Machines consistency is computed based on the info KCP was able to collect during inspection (e.g. if on a 3 CP + // control plane one etcd member is down, the comparison is based on the answer collected from two members only). + // NOTE: We surface the result of compareMachinesAndMembers for the Available condition only if all the etcd members agree + // on member list and cluster id (if not, we consider the list of members not reliable). 
+ membersAndMachinesAreMatching, membersAndMachinesCompareErrors := compareMachinesAndMembers(controlPlane, controlPlaneNodes, controlPlane.EtcdMembers) + if controlPlane.EtcdMembersAgreeOnMemberList && controlPlane.EtcdMembersAgreeOnClusterID { + controlPlane.EtcdMembersAndMachinesAreMatching = membersAndMachinesAreMatching + } else { + controlPlane.EtcdMembersAndMachinesAreMatching = false + } + kcpErrors = append(kcpErrors, membersAndMachinesCompareErrors...) // Aggregate components error from machines at KCP level aggregateConditionsFromMachinesToKCP(aggregateConditionsFromMachinesToKCPInput{ @@ -298,11 +314,17 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1 return currentMembers, nil } -func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Member, kcpErrors []string) []string { - // NOTE: We run this check only if we actually know the list of members, otherwise the first for loop - // could generate a false negative when reporting missing etcd members. +func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeList, members []*etcd.Member) (bool, []string) { + membersAndMachinesAreMatching := true + var kcpErrors []string + + // If it failed to get members, consider the check failed in case there is at least a machine already provisioned + // (tolerate if we fail getting members when the cluster is provisioning the first machine). if members == nil { - return kcpErrors + if len(controlPlane.Machines.Filter(collections.HasNode())) > 0 { + membersAndMachinesAreMatching = false + } + return membersAndMachinesAreMatching, nil } // Check Machine -> Etcd member. @@ -318,6 +340,8 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Membe } } if !found { + // Surface there is a machine without etcd member on machine's EtcdMemberHealthy condition. + // The same info will also surface into the EtcdClusterHealthy condition on kcp. 
conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Missing etcd member") v1beta2conditions.Set(machine, metav1.Condition{ @@ -326,27 +350,57 @@ func compareMachinesAndMembers(controlPlane *ControlPlane, members []*etcd.Membe Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: fmt.Sprintf("Etcd doesn't have an etcd member for Node %s", machine.Status.NodeRef.Name), }) + + // Instead, surface there is a machine without etcd member on kcp's Available condition + // only if the machine is not deleting and the node exists by more than two minutes + // (this prevents the condition from flickering during scale up operations). + // Note: Two minutes is the time after which we expect the system to detect the new etcd member on the machine. + if machine.DeletionTimestamp.IsZero() { + oldNode := false + if nodes != nil { + for _, node := range nodes.Items { + if machine.Status.NodeRef.Name == node.Name && time.Since(node.CreationTimestamp.Time) > 2*time.Minute { + oldNode = true + } + } + } + if oldNode { + membersAndMachinesAreMatching = false + } + } } } // Check Etcd member -> Machine. for _, member := range members { found := false + hasProvisioningMachine := false for _, machine := range controlPlane.Machines { - if machine.Status.NodeRef != nil && machine.Status.NodeRef.Name == member.Name { + if machine.Status.NodeRef == nil { + hasProvisioningMachine = true + continue + } + if machine.Status.NodeRef.Name == member.Name { found = true break } } if !found { + // Surface there is an etcd member without a machine into the EtcdClusterHealthy condition on kcp. 
name := member.Name if name == "" { name = fmt.Sprintf("%d (Name not yet assigned)", member.ID) } kcpErrors = append(kcpErrors, fmt.Sprintf("Etcd member %s does not have a corresponding Machine", name)) + + // Instead, surface there is an etcd member without a machine on kcp's Available condition + // only if there are no provisioning machines (this prevents the condition from flickering during scale up operations). + if !hasProvisioningMachine { + membersAndMachinesAreMatching = false + } } } - return kcpErrors + return membersAndMachinesAreMatching, kcpErrors } // UpdateStaticPodConditions is responsible for updating machine conditions reflecting the status of all the control plane @@ -372,7 +426,6 @@ func (w *Workload) UpdateStaticPodConditions(ctx context.Context, controlPlane * } // NOTE: this fun uses control plane nodes from the workload cluster as a source of truth for the current state. - // TODO: integrate this with clustercache / handle the grace period controlPlaneNodes, err := w.getControlPlaneNodes(ctx) if err != nil { for i := range controlPlane.Machines { diff --git a/controlplane/kubeadm/internal/workload_cluster_conditions_test.go b/controlplane/kubeadm/internal/workload_cluster_conditions_test.go index e79c538089f2..df92a36891c9 100644 --- a/controlplane/kubeadm/internal/workload_cluster_conditions_test.go +++ b/controlplane/kubeadm/internal/workload_cluster_conditions_test.go @@ -19,6 +19,7 @@ package internal import ( "fmt" "testing" + "time" . "github.com/onsi/gomega" "github.com/pkg/errors" @@ -42,15 +43,19 @@ import ( func TestUpdateEtcdConditions(t *testing.T) { tests := []struct { - name string - kcp *controlplanev1.KubeadmControlPlane - machines []*clusterv1.Machine - injectClient client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error. 
- injectEtcdClientGenerator etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error. - expectedKCPCondition *clusterv1.Condition - expectedKCPV1Beta2Condition *metav1.Condition - expectedMachineConditions map[string]clusterv1.Conditions - expectedMachineV1Beta2Conditions map[string][]metav1.Condition + name string + kcp *controlplanev1.KubeadmControlPlane + machines []*clusterv1.Machine + injectClient client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error. + injectEtcdClientGenerator etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error. + expectedKCPCondition *clusterv1.Condition + expectedKCPV1Beta2Condition *metav1.Condition + expectedMachineConditions map[string]clusterv1.Conditions + expectedMachineV1Beta2Conditions map[string][]metav1.Condition + expectedEtcdMembers []string + expectedEtcdMembersAgreeOnMemberList bool + expectedEtcdMembersAgreeOnClusterID bool + expectedEtcdMembersAndMachinesAreMatching bool }{ { name: "if list nodes return an error should report all the conditions Unknown", @@ -77,6 +82,9 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, Message: "Failed to get the Node hosting the etcd member"}, }, }, + expectedEtcdMembersAgreeOnMemberList: false, // without reading nodes, we can not make assumptions. + expectedEtcdMembersAgreeOnClusterID: false, // without reading nodes, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // without reading nodes, we can not make assumptions. 
}, { name: "If there are provisioning machines, a node without machine should be ignored in v1beta1, reported in v1beta2", @@ -103,6 +111,9 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, Message: "Node does not exist"}, }, }, + expectedEtcdMembersAgreeOnMemberList: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAgreeOnClusterID: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // without reading members, we can not make assumptions. }, { name: "If there are no provisioning machines, a node without machine should be reported as False condition at KCP level", @@ -119,6 +130,9 @@ func TestUpdateEtcdConditions(t *testing.T) { Reason: controlplanev1.KubeadmControlPlaneEtcdClusterNotHealthyV1Beta2Reason, Message: "Control plane Node n1 does not have a corresponding Machine", }, + expectedEtcdMembersAgreeOnMemberList: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAgreeOnClusterID: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // without reading members, we can not make assumptions. }, { name: "failure creating the etcd client should report unknown condition", @@ -150,6 +164,9 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, Message: "Failed to connect to the etcd Pod on the n1 Node: failed to get client for node"}, }, }, + expectedEtcdMembersAgreeOnMemberList: false, // failure in reading members, we can not make assumptions. 
+ expectedEtcdMembersAgreeOnClusterID: false, // failure in reading members, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // failure in reading members, we can not make assumptions. }, { name: "etcd client reporting status errors should be reflected into a false condition", @@ -186,6 +203,9 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: "Etcd reports errors: some errors"}, }, }, + expectedEtcdMembersAgreeOnMemberList: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAgreeOnClusterID: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // without reading members, we can not make assumptions. }, { name: "failure listing members should report false condition in v1beta1, unknown in v1beta2", @@ -222,6 +242,9 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason, Message: "Failed to get answer from the etcd member on the n1 Node: failed to get list of members for etcd cluster: failed to list members"}, }, }, + expectedEtcdMembersAgreeOnMemberList: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAgreeOnClusterID: false, // without reading members, we can not make assumptions. + expectedEtcdMembersAndMachinesAreMatching: false, // without reading members, we can not make assumptions. 
}, { name: "an etcd member with alarms should report false condition", @@ -267,6 +290,10 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: "Etcd reports alarms: NOSPACE"}, }, }, + expectedEtcdMembers: []string{"n1"}, + expectedEtcdMembersAgreeOnMemberList: true, + expectedEtcdMembersAgreeOnClusterID: true, + expectedEtcdMembersAndMachinesAreMatching: true, }, { name: "etcd members with different Cluster ID should report false condition", @@ -349,6 +376,10 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: "Etcd member has cluster ID 2, but all previously seen etcd members have cluster ID 1"}, }, }, + expectedEtcdMembers: []string{"n1", "n2"}, + expectedEtcdMembersAgreeOnMemberList: true, + expectedEtcdMembersAgreeOnClusterID: false, + expectedEtcdMembersAndMachinesAreMatching: false, }, { name: "etcd members with different member list should report false condition", @@ -431,6 +462,10 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: "The etcd member hosted on this Machine reports the cluster is composed by [n2 n3], but all previously seen etcd members are reporting [n1 n2]"}, }, }, + expectedEtcdMembers: []string{"n1", "n2"}, + expectedEtcdMembersAgreeOnMemberList: false, + expectedEtcdMembersAgreeOnClusterID: true, + expectedEtcdMembersAndMachinesAreMatching: false, }, { name: "a machine without a member should report false condition", @@ -495,6 +530,10 @@ 
func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason, Message: "Etcd doesn't have an etcd member for Node n2"}, }, }, + expectedEtcdMembers: []string{"n1"}, + expectedEtcdMembersAgreeOnMemberList: true, + expectedEtcdMembersAgreeOnClusterID: true, + expectedEtcdMembersAndMachinesAreMatching: false, }, { name: "healthy etcd members should report true", @@ -576,6 +615,10 @@ func TestUpdateEtcdConditions(t *testing.T) { {Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition, Status: metav1.ConditionTrue, Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Reason, Message: ""}, }, }, + expectedEtcdMembers: []string{"n1", "n2"}, + expectedEtcdMembersAgreeOnMemberList: true, + expectedEtcdMembersAgreeOnClusterID: true, + expectedEtcdMembersAndMachinesAreMatching: true, }, { name: "External etcd should set a condition at KCP level for v1beta1, not for v1beta2", @@ -590,8 +633,11 @@ func TestUpdateEtcdConditions(t *testing.T) { }, }, }, - expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition), - expectedKCPV1Beta2Condition: nil, + expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition), + expectedKCPV1Beta2Condition: nil, + expectedEtcdMembersAgreeOnMemberList: false, + expectedEtcdMembersAgreeOnClusterID: false, + expectedEtcdMembersAndMachinesAreMatching: false, }, } for _, tt := range tests { @@ -623,6 +669,16 @@ func TestUpdateEtcdConditions(t *testing.T) { g.Expect(m.GetConditions()).To(conditions.MatchConditions(tt.expectedMachineConditions[m.Name]), "unexpected conditions for Machine %s", m.Name) g.Expect(m.GetV1Beta2Conditions()).To(v1beta2conditions.MatchConditions(tt.expectedMachineV1Beta2Conditions[m.Name], v1beta2conditions.IgnoreLastTransitionTime(true)), 
"unexpected conditions for Machine %s", m.Name) } + + g.Expect(controlPane.EtcdMembersAgreeOnMemberList).To(Equal(tt.expectedEtcdMembersAgreeOnMemberList), "EtcdMembersAgreeOnMemberList does not match") + g.Expect(controlPane.EtcdMembersAgreeOnClusterID).To(Equal(tt.expectedEtcdMembersAgreeOnClusterID), "EtcdMembersAgreeOnClusterID does not match") + g.Expect(controlPane.EtcdMembersAndMachinesAreMatching).To(Equal(tt.expectedEtcdMembersAndMachinesAreMatching), "EtcdMembersAndMachinesAreMatching does not match") + + var membersNames []string + for _, m := range controlPane.EtcdMembers { + membersNames = append(membersNames, m.Name) + } + g.Expect(membersNames).To(Equal(tt.expectedEtcdMembers)) }) } } @@ -1620,3 +1676,180 @@ func TestAggregateV1Beta2ConditionsFromMachinesToKCP(t *testing.T) { }) } } + +func TestCompareMachinesAndMembers(t *testing.T) { + tests := []struct { + name string + controlPlane *ControlPlane + nodes *corev1.NodeList + members []*etcd.Member + expectMembersAndMachinesAreMatching bool + expectKCPErrors []string + }{ + { + name: "true if the list of members is empty and there are no provisioned machines", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines(fakeMachine("m1")), + }, + members: nil, + nodes: nil, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: nil, + }, + { + name: "false if the list of members is empty and there are provisioned machines", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines(fakeMachine("m1", withNodeRef("m1"))), + }, + members: nil, + nodes: nil, + expectMembersAndMachinesAreMatching: false, + expectKCPErrors: nil, + }, + { + name: "true if the list of members match machines", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2", withNodeRef("m2")), + ), + }, 
+ members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + }, + nodes: nil, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: nil, + }, + { + name: "true if there is a machine without a member but at least a machine is still provisioning", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2", withNodeRef("m2")), + fakeMachine("m3"), // m3 is still provisioning + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + // m3 is missing + }, + nodes: nil, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: nil, + }, + { + name: "true if there is a machine without a member but node on this machine does not exist yet", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2", withNodeRef("m2")), + fakeMachine("m3", withNodeRef("m3")), + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + // m3 is missing + }, + nodes: &corev1.NodeList{Items: []corev1.Node{ + // m3 is missing + }}, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: nil, + }, + { + name: "true if there is a machine without a member but node on this machine has been just created", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2", withNodeRef("m2")), + fakeMachine("m3", withNodeRef("m3")), + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + // m3 is missing + }, + nodes: &corev1.NodeList{Items: []corev1.Node{ + {ObjectMeta: metav1.ObjectMeta{Name: "m3", CreationTimestamp: metav1.Time{Time: time.Now().Add(-110 * time.Second)}}}, // m3 is just provisioned + }}, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: nil, + }, + { + name: "false if there is 
a machine without a member and node on this machine is old", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2", withNodeRef("m2")), + fakeMachine("m3", withNodeRef("m3")), + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + // m3 is missing + }, + nodes: &corev1.NodeList{Items: []corev1.Node{ + {ObjectMeta: metav1.ObjectMeta{Name: "m3", CreationTimestamp: metav1.Time{Time: time.Now().Add(-10 * time.Minute)}}}, // m3 is old + }}, + expectMembersAndMachinesAreMatching: false, + expectKCPErrors: nil, + }, + { + name: "false if there is a member without a machine", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + // m2 is missing + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + }, + nodes: nil, + expectMembersAndMachinesAreMatching: false, + expectKCPErrors: []string{"Etcd member m2 does not have a corresponding Machine"}, + }, + { + name: "true if there is a member without a machine while a machine is still provisioning ", + controlPlane: &ControlPlane{ + KCP: &controlplanev1.KubeadmControlPlane{}, + Machines: collections.FromMachines( + fakeMachine("m1", withNodeRef("m1")), + fakeMachine("m2"), // m2 still provisioning + ), + }, + members: []*etcd.Member{ + {Name: "m1"}, + {Name: "m2"}, + }, + nodes: nil, + expectMembersAndMachinesAreMatching: true, + expectKCPErrors: []string{"Etcd member m2 does not have a corresponding Machine"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + got, gotErrors := compareMachinesAndMembers(tt.controlPlane, tt.nodes, tt.members) + + g.Expect(got).To(Equal(tt.expectMembersAndMachinesAreMatching)) + g.Expect(gotErrors).To(Equal(tt.expectKCPErrors)) + }) + } +}