Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add v1beta2 available condition to KCP #11383

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,32 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneAvailableV1Beta2Condition is true if KubeadmControlPlane is not being deleted, `CertificatesAvailable` is true,
// at least one Machine has healthy control plane components, and etcd has enough operational members to meet quorum requirements.
// More specifically, considering how kubeadm lays out components:
// - Kubernetes API server, scheduler and controller manager health is inferred by the status of
// the corresponding Pods hosted on each machine.
// - In case of managed etcd, also a healthy etcd Pod and a healthy etcd member must exist on the same
// machine with the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control
// plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server,
// scheduler and controller manager on the same machine will be impacted).
// - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped.
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition

// KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the
// etcd cluster hosted on KubeadmControlPlane controlled machines.
KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason

// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available.
KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason

// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available.
KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason
)

// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept
// KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept
// requests. This information is usually used as a signal for starting all the provisioning operations that
// depend on a functional API server, but do not require a full HA control plane to exist.
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized"
Expand Down
11 changes: 11 additions & 0 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controllers/external"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/failuredomains"
"sigs.k8s.io/cluster-api/util/patch"
Expand Down Expand Up @@ -58,6 +59,16 @@ type ControlPlane struct {
KubeadmConfigs map[string]*bootstrapv1.KubeadmConfig
InfraResources map[string]*unstructured.Unstructured

// EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; the additional info below
// also comes from the same func.
// NOTE: This info is computed based on what KCP was able to collect during inspection (e.g. if on a 3 CP
// control plane one etcd member is down, this info is based on the answers collected from two members only).
// NOTE: This info is specifically designed for computing KCP's Available condition.
EtcdMembers []*etcd.Member
EtcdMembersAgreeOnMemberList bool
EtcdMembersAgreeOnClusterID bool
EtcdMembersAndMachinesAreMatching bool

managementCluster ManagementCluster
workloadCluster WorkloadCluster

Expand Down
138 changes: 137 additions & 1 deletion controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
Expand Down Expand Up @@ -168,7 +169,7 @@ func (r *KubeadmControlPlaneReconciler) updateV1Beta2Status(ctx context.Context,
setMachinesUpToDateCondition(ctx, controlPlane.KCP, controlPlane.Machines)
setRemediatingCondition(ctx, controlPlane.KCP, controlPlane.MachinesToBeRemediatedByKCP(), controlPlane.UnhealthyMachines())
setDeletingCondition(ctx, controlPlane.KCP, controlPlane.DeletingReason, controlPlane.DeletingMessage)
// TODO: Available
setAvailableCondition(ctx, controlPlane.KCP, controlPlane.IsEtcdManaged(), controlPlane.EtcdMembers, controlPlane.EtcdMembersAgreeOnMemberList, controlPlane.EtcdMembersAgreeOnClusterID, controlPlane.EtcdMembersAndMachinesAreMatching, controlPlane.Machines)
}

func setReplicas(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, machines collections.Machines) {
Expand Down Expand Up @@ -441,6 +442,141 @@ func setDeletingCondition(_ context.Context, kcp *controlplanev1.KubeadmControlP
})
}

// setAvailableCondition sets the v1beta2 Available condition on the given KubeadmControlPlane.
//
// The condition is evaluated in this order:
//   - False if kcp.Status.Initialized is not set.
//   - When etcd is managed: Unknown if etcdMembers is nil; False if the etcd members disagree on the
//     member list or on the cluster ID, or if members and machines are not matching.
//   - True if the KCP has no deletionTimestamp, the CertificatesAvailable condition is true, at least one
//     machine reports healthy control plane components and, when etcd is managed, the number of machines
//     with a healthy etcd member reaches quorum.
//   - False otherwise, with a message listing every requirement that is not met, joined by ";".
//
// The context parameter is unused.
func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
	// markNotAvailable records Available=False with the NotAvailable reason and the given message.
	markNotAvailable := func(message string) {
		v1beta2conditions.Set(kcp, metav1.Condition{
			Type:    controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
			Status:  metav1.ConditionFalse,
			Reason:  controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
			Message: message,
		})
	}

	if !kcp.Status.Initialized {
		markNotAvailable("Control plane not yet initialized")
		return
	}

	// With managed etcd, the information collected while inspecting etcd must be present and consistent
	// before machine conditions are taken into account.
	if etcdIsManaged {
		switch {
		case etcdMembers == nil:
			v1beta2conditions.Set(kcp, metav1.Condition{
				Type:    controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
				Status:  metav1.ConditionUnknown,
				Reason:  controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
				Message: "Failed to get etcd members",
			})
			return
		case !etcdMembersAgreeOnMemberList:
			markNotAvailable("At least one etcd member reports a list of etcd members different than the list reported by other members")
			return
		case !etcdMembersAgreeOnClusterID:
			markNotAvailable("At least one etcd member reports a cluster ID different than the cluster ID reported by other members")
			return
		case !etcdMembersAndMachinesAreMatching:
			markNotAvailable("The list of etcd members does not match the list of Machines and Nodes")
			return
		}
	}

	// Availability is derived from machine conditions, which at this stage already surface the status of
	// the etcd member and of all the control plane pods hosted on every machine.
	// Note: quorum is intentionally computed from the number of etcd members (integer division), because
	// etcd members might not match with machines, e.g. while provisioning a new machine.
	requiredEtcdMembers := len(etcdMembers)/2 + 1
	healthyControlPlanes, healthyEtcdMembers := 0, 0
	for _, m := range machines {
		k8sComponentsHealthy := v1beta2conditions.IsTrue(m, controlplanev1.KubeadmControlPlaneMachineAPIServerPodHealthyV1Beta2Condition) &&
			v1beta2conditions.IsTrue(m, controlplanev1.KubeadmControlPlaneMachineControllerManagerPodHealthyV1Beta2Condition) &&
			v1beta2conditions.IsTrue(m, controlplanev1.KubeadmControlPlaneMachineSchedulerPodHealthyV1Beta2Condition)

		// With external etcd, only the K8s control plane components on this machine are considered.
		if !etcdIsManaged {
			if k8sComponentsHealthy {
				healthyControlPlanes++
			}
			continue
		}

		// With managed etcd, kubeadm lays out control plane components so that:
		// - the API server on a machine only connects to the local etcd member
		// - controller manager and scheduler on a machine connect to the local API server (not to the control plane endpoint)
		// As a consequence, the K8s control plane on a machine is healthy only if everything on it is healthy.
		etcdMemberHealthy := v1beta2conditions.IsTrue(m, controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition)
		if etcdMemberHealthy {
			healthyEtcdMembers++
		}

		if k8sComponentsHealthy && etcdMemberHealthy &&
			v1beta2conditions.IsTrue(m, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyV1Beta2Condition) {
			healthyControlPlanes++
		}
	}

	// Each requirement is evaluated once and reused both for the decision and for the failure message.
	notDeleting := kcp.DeletionTimestamp.IsZero()
	certificatesAvailable := v1beta2conditions.IsTrue(kcp, controlplanev1.KubeadmControlPlaneCertificatesAvailableV1Beta2Condition)
	etcdHasQuorum := !etcdIsManaged || healthyEtcdMembers >= requiredEtcdMembers
	hasHealthyControlPlane := healthyControlPlanes >= 1

	if notDeleting && etcdHasQuorum && hasHealthyControlPlane && certificatesAvailable {
		v1beta2conditions.Set(kcp, metav1.Condition{
			Type:   controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
			Status: metav1.ConditionTrue,
			Reason: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Reason,
		})
		return
	}

	// At least one requirement is not met; report all of them.
	var unmet []string
	if !notDeleting {
		unmet = append(unmet, "Control plane metadata.deletionTimestamp is set")
	}
	if !certificatesAvailable {
		unmet = append(unmet, "Control plane certificates are not available")
	}
	if !etcdHasQuorum {
		var quorumMessage string
		switch healthyEtcdMembers {
		case 0:
			quorumMessage = fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", requiredEtcdMembers)
		case 1:
			quorumMessage = fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", requiredEtcdMembers)
		default:
			quorumMessage = fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", healthyEtcdMembers, requiredEtcdMembers)
		}
		unmet = append(unmet, quorumMessage)
	}
	if !hasHealthyControlPlane {
		unmet = append(unmet, "There are no Machines with healthy control plane components, at least 1 required")
	}

	markNotAvailable(strings.Join(unmet, ";"))
}

func aggregateStaleMachines(machines collections.Machines) string {
if len(machines) == 0 {
return ""
Expand Down
Loading