From ebb95c0d2dd05c3a90736cd7eb5f24b1dc43bf4a Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Fri, 3 Dec 2021 17:07:18 -0400 Subject: [PATCH 1/8] Add read-only state to pod facts --- pkg/controllers/podfacts.go | 70 ++++++++++++++++++++++++++++++-- pkg/controllers/podfacts_test.go | 50 +++++++++++++++++++++-- pkg/version/version.go | 2 + 3 files changed, 115 insertions(+), 7 deletions(-) diff --git a/pkg/controllers/podfacts.go b/pkg/controllers/podfacts.go index fadeb7f0b..7e02979d9 100644 --- a/pkg/controllers/podfacts.go +++ b/pkg/controllers/podfacts.go @@ -26,6 +26,7 @@ import ( "github.com/vertica/vertica-kubernetes/pkg/cmds" "github.com/vertica/vertica-kubernetes/pkg/names" "github.com/vertica/vertica-kubernetes/pkg/paths" + "github.com/vertica/vertica-kubernetes/pkg/version" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -76,6 +77,9 @@ type PodFact struct { // port 5433. upNode bool + // true means the node is up, but in read-only state + readOnly bool + // The vnode name that Vertica assigned to this pod. vnodeName string @@ -197,7 +201,7 @@ func (p *PodFacts) collectPodByStsIndex(ctx context.Context, vdb *vapi.VerticaDB fns := []func(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error{ p.checkIsInstalled, p.checkIsDBCreated, - p.checkIfNodeIsUp, + p.checkIfNodeIsUpAndReadOnly, p.checkEulaAcceptance, p.checkLogrotateExists, p.checkIsLogrotateWritable, @@ -333,13 +337,30 @@ func (p *PodFacts) checkIsDBCreated(ctx context.Context, vdb *vapi.VerticaDB, pf return nil } -// checkIfNodeIsUp will determine whether Vertica process is running in the pod. -func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error { +// checkIfNodeIsUpAndReadOnly will determine whether Vertica process is running +// in the pod and whether it is in read-only mode. +func (p *PodFacts) checkIfNodeIsUpAndReadOnly(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error { if pf.dbExists.IsFalse() || !pf.isPodRunning { pf.upNode = false + pf.readOnly = false return nil } + // The read-only state is a new state added in 11.0.2. So we can only query + // for it on levels 11.0.2+. Otherwise, we always treat read-only as being + // disabled. + vinf, ok := version.MakeInfo(vdb) + if ok && vinf.IsEqualOrNewer(version.NodesHaveReadOnlyStateVersion) { + return p.queryNodeStatus(ctx, pf) + } + return p.checkIfNodeIsUp(ctx, pf) +} + +// checkIfNodeIsUp will check if the Vertica is up and running in this process. +// It assumes the pod is running and the database exists. It doesn't check for +// read-only state. This exists for backwards compatibility of versions older +// than 11.0.2. +func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, pf *PodFact) error { cmd := []string{ "-c", "select 1", @@ -349,13 +370,56 @@ func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, vdb *vapi.VerticaDB, pf return err } pf.upNode = false + pf.readOnly = false } else { pf.upNode = true + pf.readOnly = false + } + + return nil +} + +// queryNodeStatus will query the nodes system table to see if the node is up +// and wether it is in read-only state. It assumes the database exists and the +// pod is running. 
+func (p *PodFacts) queryNodeStatus(ctx context.Context, pf *PodFact) error { + cmd := []string{ + "-tAc", + "select node_state, is_readonly " + + "from sessions as a, nodes as b " + + "where a.session_id = current_session() and a.node_name = b.node_name", + } + if stdout, stderr, err := p.PRunner.ExecVSQL(ctx, pf.name, names.ServerContainer, cmd...); err != nil { + if !strings.Contains(stderr, "vsql: could not connect to server:") { + return err + } + pf.upNode = false + pf.readOnly = false + } else if pf.upNode, pf.readOnly, err = parseNodeStateAndReadOnly(stdout); err != nil { + return err } return nil } +// parseNodeStateAndReadOnly will parse query output to determine if a node is +// up and read-only. +func parseNodeStateAndReadOnly(stdout string) (upNode, readOnly bool, err error) { + // The stdout comes in the form like this: + // UP|t + // This means upNode is true and readOnly is true. + lines := strings.Split(stdout, "\n") + cols := strings.Split(lines[0], "|") + const ExpectedCols = 2 + if len(cols) != ExpectedCols { + err = fmt.Errorf("expected %d columns from node query but only got %d", ExpectedCols, len(cols)) + return + } + upNode = cols[0] == "UP" + readOnly = cols[1] == "t" + return +} + // parseVerticaNodeName extract the vertica node name from the directory list func parseVerticaNodeName(stdout string) string { re := regexp.MustCompile(`(v_.+_node\d+)_data`) diff --git a/pkg/controllers/podfacts_test.go b/pkg/controllers/podfacts_test.go index 23148b0b0..a454ad86e 100644 --- a/pkg/controllers/podfacts_test.go +++ b/pkg/controllers/podfacts_test.go @@ -24,6 +24,7 @@ import ( vapi "github.com/vertica/vertica-kubernetes/api/v1beta1" "github.com/vertica/vertica-kubernetes/pkg/cmds" "github.com/vertica/vertica-kubernetes/pkg/names" + "github.com/vertica/vertica-kubernetes/pkg/version" appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/types" "yunion.io/x/pkg/tristate" @@ -182,7 +183,7 @@ var _ = Describe("podfacts", func() { } pfs := MakePodFacts(k8sClient, fpr) pf := &PodFact{name: pn, isPodRunning: true} - Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).Should(Succeed()) + Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed()) Expect(pf.upNode).Should(BeFalse()) }) @@ -198,11 +199,11 @@ var _ = Describe("podfacts", func() { } pfs := MakePodFacts(k8sClient, fpr) pf := &PodFact{name: pn, isPodRunning: true} - Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).Should(Succeed()) + Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed()) Expect(pf.upNode).Should(BeTrue()) }) - It("should fail if checkIfNodeIsUp gets an unexpected error", func() { + It("should fail if checkIfNodeIsUpAndReadOnly gets an unexpected error", func() { vdb := vapi.MakeVDB() pn := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0) fpr := &cmds.FakePodRunner{ @@ -214,7 +215,30 @@ var _ = Describe("podfacts", func() { } pfs := MakePodFacts(k8sClient, fpr) pf := &PodFact{name: pn, isPodRunning: true} - Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).ShouldNot(Succeed()) + // Run with no annotation + Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).ShouldNot(Succeed()) + // Run with 11.0.2 annotation + vdb.Annotations[vapi.VersionAnnotation] = version.NodesHaveReadOnlyStateVersion + fpr.Results[pn] = []cmds.CmdResult{{Err: errors.New("unexpected error"), Stderr: "unknown error"}} + Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).ShouldNot(Succeed()) + }) + + It("checkIfNodeIsUpAndReadOnly should check for read-only on 11.0.2 servers", func() { + vdb := vapi.MakeVDB() + 
vdb.Annotations[vapi.VersionAnnotation] = version.NodesHaveReadOnlyStateVersion + pn := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0) + fpr := &cmds.FakePodRunner{ + Results: cmds.CmdResults{ + pn: []cmds.CmdResult{ + {Stdout: "UP|t"}, + }, + }, + } + pfs := MakePodFacts(k8sClient, fpr) + pf := &PodFact{name: pn, isPodRunning: true} + Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed()) + Expect(pf.upNode).Should(BeTrue()) + Expect(pf.readOnly).Should(BeTrue()) }) It("should parse out the compat21 node name from install indicator file", func() { @@ -235,4 +259,22 @@ var _ = Describe("podfacts", func() { Expect(pf.isInstalled).Should(Equal(tristate.True)) Expect(pf.compat21NodeName).Should(Equal("node0010")) }) + + It("should parse read-only state from node query", func() { + upNode1, readOnly1, err := parseNodeStateAndReadOnly("UP|t\n") + Expect(err).Should(Succeed()) + Expect(upNode1).Should(BeTrue()) + Expect(readOnly1).Should(BeTrue()) + + upNode2, readOnly2, err := parseNodeStateAndReadOnly("UP|f\n") + Expect(err).Should(Succeed()) + Expect(upNode2).Should(BeTrue()) + Expect(readOnly2).Should(BeFalse()) + + _, _, err = parseNodeStateAndReadOnly("") + Expect(err).ShouldNot(Succeed()) + + _, _, err = parseNodeStateAndReadOnly("UP|z|garbage") + Expect(err).ShouldNot(Succeed()) + }) }) diff --git a/pkg/version/version.go b/pkg/version/version.go index e26f50ef6..9d6577b21 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -34,6 +34,8 @@ const ( // If the vertica image that we deploy is older than this then the operator // abort the reconiliation process. MinimumVersion = "v11.0.0" + // The version that added read-only state + NodesHaveReadOnlyStateVersion = "v11.0.2" ) // MakeInfo will construct an Info struct by extracting the version from the From 6c4f1a31cb58d4313de1bf07e665f967835e5c08 Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 10:20:32 -0400 Subject: [PATCH 2/8] Handle read-only nodes during restart --- pkg/controllers/podfacts.go | 29 +++++++++---- pkg/controllers/restart_reconcile.go | 50 ++++++---------------- pkg/controllers/restart_reconciler_test.go | 30 +++++-------- 3 files changed, 45 insertions(+), 64 deletions(-) diff --git a/pkg/controllers/podfacts.go b/pkg/controllers/podfacts.go index 7e02979d9..02c810215 100644 --- a/pkg/controllers/podfacts.go +++ b/pkg/controllers/podfacts.go @@ -386,8 +386,8 @@ func (p *PodFacts) queryNodeStatus(ctx context.Context, pf *PodFact) error { cmd := []string{ "-tAc", "select node_state, is_readonly " + - "from sessions as a, nodes as b " + - "where a.session_id = current_session() and a.node_name = b.node_name", + "from nodes " + + "where node_name in (select node_name from current_session)", } if stdout, stderr, err := p.PRunner.ExecVSQL(ctx, pf.name, names.ServerContainer, cmd...); err != nil { if !strings.Contains(stderr, "vsql: could not connect to server:") { @@ -531,9 +531,11 @@ func (p *PodFacts) findRunningPod() (*PodFact, bool) { // findRestartablePods returns a list of pod facts that can be restarted. // An empty list implies there are no pods that need to be restarted. +// We treat read-only nodes as being restartable because they are in the +// read-only state due to losing of cluster quorum. 
func (p *PodFacts) findRestartablePods() []*PodFact { return p.filterPods(func(v *PodFact) bool { - return !v.upNode && v.dbExists.IsTrue() && v.isPodRunning + return (!v.upNode || v.readOnly) && v.dbExists.IsTrue() && v.isPodRunning }) } @@ -618,13 +620,24 @@ func (p *PodFacts) countNotRunning() int { // getUpNodeCount returns the number of up nodes. // A pod is considered down if it doesn't have a running vertica process. func (p *PodFacts) getUpNodeCount() int { - var count = 0 - for _, v := range p.Detail { + return p.countPods(func(v *PodFact) int { if v.upNode { - count++ + return 1 } - } - return count + return 0 + }) +} + +// getUpNodeAndNotReadOnlyCount returns the number of nodes that are up and +// writable. Starting in 11.0SP2, nodes can be up but only in read-only state. +// This function filters out those *up* nodes that are in read-only state. +func (p *PodFacts) getUpNodeAndNotReadOnlyCount() int { + return p.countPods(func(v *PodFact) int { + if v.upNode && !v.readOnly { + return 1 + } + return 0 + }) } // genPodNames will generate a string of pods names given a list of pods diff --git a/pkg/controllers/restart_reconcile.go b/pkg/controllers/restart_reconcile.go index 1c6de27f6..dbd00c4d3 100644 --- a/pkg/controllers/restart_reconcile.go +++ b/pkg/controllers/restart_reconcile.go @@ -87,7 +87,8 @@ func (r *RestartReconciler) Reconcile(ctx context.Context, req *ctrl.Request) (c // admintools commands to run. Cluster operations only apply if the entire // vertica cluster is managed by k8s. We skip that if initPolicy is // ScheduleOnly. - if r.PFacts.getUpNodeCount() == 0 && r.Vdb.Spec.InitPolicy != vapi.CommunalInitPolicyScheduleOnly { + if r.PFacts.getUpNodeAndNotReadOnlyCount() == 0 && + r.Vdb.Spec.InitPolicy != vapi.CommunalInitPolicyScheduleOnly { return r.reconcileCluster(ctx) } return r.reconcileNodes(ctx) @@ -113,34 +114,25 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result, return ctrl.Result{Requeue: true}, nil } + // Kill any vertica process that may still be running. This includes a rogue + // process that is no longer communicating with spread and process for + // read-only nodes. This is needed before re_ip, as re_ip can only work if + // the database isn't running, which would be case if there are read-only + // nodes. + downPods := r.PFacts.findRestartablePods() + if err := r.killOldProcesses(ctx, downPods); err != nil { + return ctrl.Result{}, err + } + // re_ip/start_db require all pods to be running that have run the // installation. This check is done when we generate the map file // (genMapFile). - // - // We do the re-ip before checking if nodes are done as it greatly speeds - // that part up if we use the new IPs. if res, err := r.reipNodes(ctx, r.PFacts.findReIPPods(false)); err != nil || res.Requeue { return res, err } - dbDoesNotExist := !r.PFacts.doesDBExist().IsTrue() - // Some pods may not be considered down yet. Do this only if know a db - // was created. Otherwise, this could fail if run when no db exists. - if !dbDoesNotExist { - if upNodes, err := r.anyUpNodesInClusterState(ctx); err != nil || upNodes { - return ctrl.Result{Requeue: upNodes}, err - } - } - - // Kill any rogue vertica process that may still be running. Vertica thinks - // the nodes are down, so any left over process can be cleaned up. - downPods := r.PFacts.findRestartablePods() - if err := r.killOldProcesses(ctx, downPods); err != nil { - return ctrl.Result{}, err - } - // If no db, there is nothing to restart so we can exit. 
- if dbDoesNotExist { + if !r.PFacts.doesDBExist().IsTrue() { return ctrl.Result{}, nil } @@ -229,22 +221,6 @@ func (r *RestartReconciler) restartPods(ctx context.Context, pods []*PodFact) (c return ctrl.Result{Requeue: len(pods) > len(downPods)}, nil } -// anyUpNodesInClusterState will make sure there are no up nodes in the cluster state. -// The cluster state refer to the state returned from AT -t list_allnodes. If at -// least one node is up, then this returns true. -func (r *RestartReconciler) anyUpNodesInClusterState(ctx context.Context) (bool, error) { - clusterState, err := r.fetchClusterNodeStatus(ctx) - if err != nil { - return false, err - } - for _, state := range clusterState { - if state == StateUp { - return true, nil - } - } - return false, nil -} - // removePodsWithClusterUpState will see if the pods in the down list are // down according to the cluster state. This will return a new pod list with the // pods that aren't considered down removed. diff --git a/pkg/controllers/restart_reconciler_test.go b/pkg/controllers/restart_reconciler_test.go index 7e34634ee..f7af159f1 100644 --- a/pkg/controllers/restart_reconciler_test.go +++ b/pkg/controllers/restart_reconciler_test.go @@ -356,9 +356,9 @@ var _ = Describe("restart_reconciler", func() { Expect(n2).Should(Equal("DOWN")) }) - It("should avoid start_db since cluster state still says a host is up", func() { + It("should do full cluster restart if all up nodes are read-only", func() { vdb := vapi.MakeVDB() - vdb.Spec.Subclusters[0].Size = 2 + vdb.Spec.Subclusters[0].Size = 3 vdb.Spec.DBName = "db" createVdb(ctx, vdb) defer deleteVdb(ctx, vdb) @@ -367,25 +367,17 @@ var _ = Describe("restart_reconciler", func() { defer deletePods(ctx, vdb) fpr := &cmds.FakePodRunner{Results: make(cmds.CmdResults)} - pfacts := createPodFactsWithRestartNeeded(ctx, vdb, sc, fpr, []int32{0, 1}) - setVerticaNodeNameInPodFacts(vdb, sc, pfacts) - - atPod := names.GenPodName(vdb, sc, 3) - fpr.Results[atPod] = []cmds.CmdResult{ - {}, // re-ip command - {Stdout: " Node | Host | State | Version | DB \n" + - "---------------+------------+-------+-------------------------+----\n" + - " v_db_node0001 | 10.244.1.6 | UP | vertica-11.0.0.20210309 | d \n" + - " v_db_node0002 | 10.244.1.7 | DOWN | vertica-11.0.0.20210309 | d \n" + - "\n", - }, + pfacts := MakePodFacts(k8sClient, fpr) + Expect(pfacts.Collect(ctx, vdb)).Should(Succeed()) + for podIndex := int32(0); podIndex < vdb.Spec.Subclusters[0].Size; podIndex++ { + downPodNm := names.GenPodName(vdb, sc, podIndex) + pfacts.Detail[downPodNm].upNode = true + pfacts.Detail[downPodNm].readOnly = true } - act := MakeRestartReconciler(vrec, logger, vdb, fpr, pfacts) - r := act.(*RestartReconciler) - r.ATPod = atPod - Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true})) - listCmd := fpr.FindCommands("list_allnodes") + r := MakeRestartReconciler(vrec, logger, vdb, fpr, &pfacts) + Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{})) + listCmd := fpr.FindCommands("start_db") Expect(len(listCmd)).Should(Equal(1)) }) From 30c8834a28f48a7e2a9644ce3b3f1da34bc7e15c Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 10:22:42 -0400 Subject: [PATCH 3/8] restart test to include UID in path --- tests/e2e/restart/setup-vdb/base/setup-vdb.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/restart/setup-vdb/base/setup-vdb.yaml b/tests/e2e/restart/setup-vdb/base/setup-vdb.yaml index 5d33d2d35..1fd04acf8 100644 --- 
a/tests/e2e/restart/setup-vdb/base/setup-vdb.yaml +++ b/tests/e2e/restart/setup-vdb/base/setup-vdb.yaml @@ -19,7 +19,7 @@ spec: image: kustomize-vertica-image superuserPasswordSecret: su-passwd communal: - includeUIDInPath: false + includeUIDInPath: true local: requestSize: 100Mi dbName: spilchendb2 From 762121a748fa140d64bb62e0c5d6f4f9934902da Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 10:55:11 -0400 Subject: [PATCH 4/8] Add readonly status --- api/v1beta1/verticadb_types.go | 7 ++++++ pkg/controllers/restart_reconcile.go | 35 ++++++++++++++++++++-------- pkg/controllers/status_reconcile.go | 5 ++++ tests/e2e/restart/30-assert.yaml | 1 + 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/api/v1beta1/verticadb_types.go b/api/v1beta1/verticadb_types.go index 550f19bd4..21880becc 100644 --- a/api/v1beta1/verticadb_types.go +++ b/api/v1beta1/verticadb_types.go @@ -593,6 +593,10 @@ type SubclusterStatus struct { // A count of the number of pods that have a running vertica process in this subcluster. UpNodeCount int32 `json:"upNodeCount"` + // +operator-sdk:csv:customresourcedefinitions:type=status + // A count of the number of pods that are in read-only state in this subcluster. + ReadOnlyCount int32 `json:"readOnlyCount"` + // +operator-sdk:csv:customresourcedefinitions:type=status Detail []VerticaDBPodStatus `json:"detail"` } @@ -612,6 +616,9 @@ type VerticaDBPodStatus struct { // True means the vertica process is running on this pod and it can accept // connections on port 5433. UpNode bool `json:"upNode"` + // +operator-sdk:csv:customresourcedefinitions:type=status + // True means the vertica process on this pod is in read-only state + ReadOnly bool `json:"readOnly"` } //+kubebuilder:object:root=true diff --git a/pkg/controllers/restart_reconcile.go b/pkg/controllers/restart_reconcile.go index dbd00c4d3..5457b2db1 100644 --- a/pkg/controllers/restart_reconcile.go +++ b/pkg/controllers/restart_reconcile.go @@ -120,8 +120,8 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result, // the database isn't running, which would be case if there are read-only // nodes. downPods := r.PFacts.findRestartablePods() - if err := r.killOldProcesses(ctx, downPods); err != nil { - return ctrl.Result{}, err + if res, err := r.killOldProcesses(ctx, downPods); err != nil { + return res, err } // re_ip/start_db require all pods to be running that have run the @@ -196,8 +196,8 @@ func (r *RestartReconciler) restartPods(ctx context.Context, pods []*PodFact) (c vnodeList := genRestartVNodeList(downPods) ipList := genRestartIPList(downPods) - if err := r.killOldProcesses(ctx, downPods); err != nil { - return ctrl.Result{}, err + if res, err := r.killOldProcesses(ctx, downPods); res.Requeue || err != nil { + return res, err } debugDumpAdmintoolsConf(ctx, r.PRunner, r.ATPod) @@ -402,18 +402,33 @@ func genRestartIPList(downPods []*PodFact) []string { // killOldProcesses will remove any running vertica processes. At this point, // we have determined the nodes are down, so we are cleaning up so that it -// doesn't impact the restart. -func (r *RestartReconciler) killOldProcesses(ctx context.Context, pods []*PodFact) error { +// doesn't impact the restart. This may include killed a pod that is in the +// read-only state. For this reason, we requeue the iteration if anything is +// killed so that status is updated before starting a restart; this is done for +// the benefit of PD purposes and stability in the restart test. 
+func (r *RestartReconciler) killOldProcesses(ctx context.Context, pods []*PodFact) (ctrl.Result, error) { + killedAtLeastOnePid := false for _, pod := range pods { + const KillMarker = "Killing process" cmd := []string{ - "bash", "-c", "for pid in $(pgrep ^vertica$); do kill -n SIGKILL $pid; done", + "bash", "-c", + fmt.Sprintf("for pid in $(pgrep ^vertica$); do echo \"%s $pid\"; kill -n SIGKILL $pid; done", KillMarker), } // Avoid all errors since the process may not even be running - if _, _, err := r.PRunner.ExecInPod(ctx, pod.name, names.ServerContainer, cmd...); err != nil { - return err + if stdout, _, err := r.PRunner.ExecInPod(ctx, pod.name, names.ServerContainer, cmd...); err != nil { + return ctrl.Result{}, err + } else if strings.Contains(stdout, KillMarker) { + killedAtLeastOnePid = true } } - return nil + if killedAtLeastOnePid { + // We are going to requeue if killed at least one process. This is for + // the benefit of the status reconciler, so that we don't treat it as + // an up node anymore. + r.Log.Info("Requeue. Killed at least one vertica process.") + return ctrl.Result{Requeue: true}, nil + } + return ctrl.Result{}, nil } // genRestartNodeCmd returns the command to run to restart a pod diff --git a/pkg/controllers/status_reconcile.go b/pkg/controllers/status_reconcile.go index 72f141de5..c124ee3fd 100644 --- a/pkg/controllers/status_reconcile.go +++ b/pkg/controllers/status_reconcile.go @@ -109,6 +109,7 @@ func (s *StatusReconciler) calculateSubclusterStatus(ctx context.Context, sc *va continue } curStat.Detail[podIndex].UpNode = pf.upNode + curStat.Detail[podIndex].ReadOnly = pf.readOnly // We can only reliably update the status for running pods. Skip those // that we couldn't figure out to preserve their state. if !pf.isInstalled.IsNone() { @@ -125,6 +126,7 @@ func (s *StatusReconciler) calculateSubclusterStatus(ctx context.Context, sc *va curStat.InstallCount = 0 curStat.AddedToDBCount = 0 curStat.UpNodeCount = 0 + curStat.ReadOnlyCount = 0 for _, v := range curStat.Detail { if v.Installed { curStat.InstallCount++ @@ -135,6 +137,9 @@ func (s *StatusReconciler) calculateSubclusterStatus(ctx context.Context, sc *va if v.UpNode { curStat.UpNodeCount++ } + if v.ReadOnly { + curStat.ReadOnlyCount++ + } } return nil } diff --git a/tests/e2e/restart/30-assert.yaml b/tests/e2e/restart/30-assert.yaml index c30ef3fbb..6a58efde4 100644 --- a/tests/e2e/restart/30-assert.yaml +++ b/tests/e2e/restart/30-assert.yaml @@ -20,3 +20,4 @@ status: - installCount: 3 addedToDBCount: 3 upNodeCount: 3 + readOnlyCount: 0 From a954b7995688cb2e908054cef372a4be778bb645 Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 11:32:50 -0400 Subject: [PATCH 5/8] Move restart test to restart-sanity --- pkg/controllers/restart_reconcile.go | 2 +- .../{restart => restart-sanity}/00-create-communal-creds.yaml | 0 tests/e2e/{restart => restart-sanity}/06-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/06-deploy-operator.yaml | 0 tests/e2e/{restart => restart-sanity}/07-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/07-password-secret.yaml | 0 tests/e2e/{restart => restart-sanity}/10-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/10-setup.yaml | 0 tests/e2e/{restart => restart-sanity}/12-assert.yaml | 0 .../e2e/{restart => restart-sanity}/12-wait-for-create-db.yaml | 0 tests/e2e/{restart => restart-sanity}/15-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/15-kill-one-pod.yaml | 0 tests/e2e/{restart => restart-sanity}/20-assert.yaml | 0 tests/e2e/{restart => 
restart-sanity}/20-wait-for-restart.yaml | 0 tests/e2e/{restart => restart-sanity}/25-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/25-kill-two-pods.yaml | 0 tests/e2e/{restart => restart-sanity}/30-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/30-wait-for-restart.yaml | 0 tests/e2e/{restart => restart-sanity}/35-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/35-kill-three-pods.yaml | 0 tests/e2e/{restart => restart-sanity}/40-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/40-wait-for-restart.yaml | 0 tests/e2e/{restart => restart-sanity}/45-errors.yaml | 0 .../e2e/{restart => restart-sanity}/45-uninstall-operator.yaml | 0 tests/e2e/{restart => restart-sanity}/50-assert.yaml | 0 tests/e2e/{restart => restart-sanity}/50-delete-crd.yaml | 0 tests/e2e/{restart => restart-sanity}/50-errors.yaml | 0 tests/e2e/{restart => restart-sanity}/README.txt | 0 .../setup-vdb/base/kustomization.yaml | 0 .../{restart => restart-sanity}/setup-vdb/base/setup-vdb.yaml | 0 30 files changed, 1 insertion(+), 1 deletion(-) rename tests/e2e/{restart => restart-sanity}/00-create-communal-creds.yaml (100%) rename tests/e2e/{restart => restart-sanity}/06-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/06-deploy-operator.yaml (100%) rename tests/e2e/{restart => restart-sanity}/07-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/07-password-secret.yaml (100%) rename tests/e2e/{restart => restart-sanity}/10-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/10-setup.yaml (100%) rename tests/e2e/{restart => restart-sanity}/12-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/12-wait-for-create-db.yaml (100%) rename tests/e2e/{restart => restart-sanity}/15-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/15-kill-one-pod.yaml (100%) rename tests/e2e/{restart => restart-sanity}/20-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/20-wait-for-restart.yaml (100%) rename tests/e2e/{restart => restart-sanity}/25-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/25-kill-two-pods.yaml (100%) rename tests/e2e/{restart => restart-sanity}/30-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/30-wait-for-restart.yaml (100%) rename tests/e2e/{restart => restart-sanity}/35-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/35-kill-three-pods.yaml (100%) rename tests/e2e/{restart => restart-sanity}/40-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/40-wait-for-restart.yaml (100%) rename tests/e2e/{restart => restart-sanity}/45-errors.yaml (100%) rename tests/e2e/{restart => restart-sanity}/45-uninstall-operator.yaml (100%) rename tests/e2e/{restart => restart-sanity}/50-assert.yaml (100%) rename tests/e2e/{restart => restart-sanity}/50-delete-crd.yaml (100%) rename tests/e2e/{restart => restart-sanity}/50-errors.yaml (100%) rename tests/e2e/{restart => restart-sanity}/README.txt (100%) rename tests/e2e/{restart => restart-sanity}/setup-vdb/base/kustomization.yaml (100%) rename tests/e2e/{restart => restart-sanity}/setup-vdb/base/setup-vdb.yaml (100%) diff --git a/pkg/controllers/restart_reconcile.go b/pkg/controllers/restart_reconcile.go index 5457b2db1..147933b99 100644 --- a/pkg/controllers/restart_reconcile.go +++ b/pkg/controllers/restart_reconcile.go @@ -120,7 +120,7 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result, // the database isn't running, which would be case if there are read-only // nodes. 
downPods := r.PFacts.findRestartablePods() - if res, err := r.killOldProcesses(ctx, downPods); err != nil { + if res, err := r.killOldProcesses(ctx, downPods); res.Requeue || err != nil { return res, err } diff --git a/tests/e2e/restart/00-create-communal-creds.yaml b/tests/e2e/restart-sanity/00-create-communal-creds.yaml similarity index 100% rename from tests/e2e/restart/00-create-communal-creds.yaml rename to tests/e2e/restart-sanity/00-create-communal-creds.yaml diff --git a/tests/e2e/restart/06-assert.yaml b/tests/e2e/restart-sanity/06-assert.yaml similarity index 100% rename from tests/e2e/restart/06-assert.yaml rename to tests/e2e/restart-sanity/06-assert.yaml diff --git a/tests/e2e/restart/06-deploy-operator.yaml b/tests/e2e/restart-sanity/06-deploy-operator.yaml similarity index 100% rename from tests/e2e/restart/06-deploy-operator.yaml rename to tests/e2e/restart-sanity/06-deploy-operator.yaml diff --git a/tests/e2e/restart/07-assert.yaml b/tests/e2e/restart-sanity/07-assert.yaml similarity index 100% rename from tests/e2e/restart/07-assert.yaml rename to tests/e2e/restart-sanity/07-assert.yaml diff --git a/tests/e2e/restart/07-password-secret.yaml b/tests/e2e/restart-sanity/07-password-secret.yaml similarity index 100% rename from tests/e2e/restart/07-password-secret.yaml rename to tests/e2e/restart-sanity/07-password-secret.yaml diff --git a/tests/e2e/restart/10-assert.yaml b/tests/e2e/restart-sanity/10-assert.yaml similarity index 100% rename from tests/e2e/restart/10-assert.yaml rename to tests/e2e/restart-sanity/10-assert.yaml diff --git a/tests/e2e/restart/10-setup.yaml b/tests/e2e/restart-sanity/10-setup.yaml similarity index 100% rename from tests/e2e/restart/10-setup.yaml rename to tests/e2e/restart-sanity/10-setup.yaml diff --git a/tests/e2e/restart/12-assert.yaml b/tests/e2e/restart-sanity/12-assert.yaml similarity index 100% rename from tests/e2e/restart/12-assert.yaml rename to tests/e2e/restart-sanity/12-assert.yaml diff --git a/tests/e2e/restart/12-wait-for-create-db.yaml b/tests/e2e/restart-sanity/12-wait-for-create-db.yaml similarity index 100% rename from tests/e2e/restart/12-wait-for-create-db.yaml rename to tests/e2e/restart-sanity/12-wait-for-create-db.yaml diff --git a/tests/e2e/restart/15-assert.yaml b/tests/e2e/restart-sanity/15-assert.yaml similarity index 100% rename from tests/e2e/restart/15-assert.yaml rename to tests/e2e/restart-sanity/15-assert.yaml diff --git a/tests/e2e/restart/15-kill-one-pod.yaml b/tests/e2e/restart-sanity/15-kill-one-pod.yaml similarity index 100% rename from tests/e2e/restart/15-kill-one-pod.yaml rename to tests/e2e/restart-sanity/15-kill-one-pod.yaml diff --git a/tests/e2e/restart/20-assert.yaml b/tests/e2e/restart-sanity/20-assert.yaml similarity index 100% rename from tests/e2e/restart/20-assert.yaml rename to tests/e2e/restart-sanity/20-assert.yaml diff --git a/tests/e2e/restart/20-wait-for-restart.yaml b/tests/e2e/restart-sanity/20-wait-for-restart.yaml similarity index 100% rename from tests/e2e/restart/20-wait-for-restart.yaml rename to tests/e2e/restart-sanity/20-wait-for-restart.yaml diff --git a/tests/e2e/restart/25-assert.yaml b/tests/e2e/restart-sanity/25-assert.yaml similarity index 100% rename from tests/e2e/restart/25-assert.yaml rename to tests/e2e/restart-sanity/25-assert.yaml diff --git a/tests/e2e/restart/25-kill-two-pods.yaml b/tests/e2e/restart-sanity/25-kill-two-pods.yaml similarity index 100% rename from tests/e2e/restart/25-kill-two-pods.yaml rename to tests/e2e/restart-sanity/25-kill-two-pods.yaml diff 
--git a/tests/e2e/restart/30-assert.yaml b/tests/e2e/restart-sanity/30-assert.yaml similarity index 100% rename from tests/e2e/restart/30-assert.yaml rename to tests/e2e/restart-sanity/30-assert.yaml diff --git a/tests/e2e/restart/30-wait-for-restart.yaml b/tests/e2e/restart-sanity/30-wait-for-restart.yaml similarity index 100% rename from tests/e2e/restart/30-wait-for-restart.yaml rename to tests/e2e/restart-sanity/30-wait-for-restart.yaml diff --git a/tests/e2e/restart/35-assert.yaml b/tests/e2e/restart-sanity/35-assert.yaml similarity index 100% rename from tests/e2e/restart/35-assert.yaml rename to tests/e2e/restart-sanity/35-assert.yaml diff --git a/tests/e2e/restart/35-kill-three-pods.yaml b/tests/e2e/restart-sanity/35-kill-three-pods.yaml similarity index 100% rename from tests/e2e/restart/35-kill-three-pods.yaml rename to tests/e2e/restart-sanity/35-kill-three-pods.yaml diff --git a/tests/e2e/restart/40-assert.yaml b/tests/e2e/restart-sanity/40-assert.yaml similarity index 100% rename from tests/e2e/restart/40-assert.yaml rename to tests/e2e/restart-sanity/40-assert.yaml diff --git a/tests/e2e/restart/40-wait-for-restart.yaml b/tests/e2e/restart-sanity/40-wait-for-restart.yaml similarity index 100% rename from tests/e2e/restart/40-wait-for-restart.yaml rename to tests/e2e/restart-sanity/40-wait-for-restart.yaml diff --git a/tests/e2e/restart/45-errors.yaml b/tests/e2e/restart-sanity/45-errors.yaml similarity index 100% rename from tests/e2e/restart/45-errors.yaml rename to tests/e2e/restart-sanity/45-errors.yaml diff --git a/tests/e2e/restart/45-uninstall-operator.yaml b/tests/e2e/restart-sanity/45-uninstall-operator.yaml similarity index 100% rename from tests/e2e/restart/45-uninstall-operator.yaml rename to tests/e2e/restart-sanity/45-uninstall-operator.yaml diff --git a/tests/e2e/restart/50-assert.yaml b/tests/e2e/restart-sanity/50-assert.yaml similarity index 100% rename from tests/e2e/restart/50-assert.yaml rename to tests/e2e/restart-sanity/50-assert.yaml diff --git a/tests/e2e/restart/50-delete-crd.yaml b/tests/e2e/restart-sanity/50-delete-crd.yaml similarity index 100% rename from tests/e2e/restart/50-delete-crd.yaml rename to tests/e2e/restart-sanity/50-delete-crd.yaml diff --git a/tests/e2e/restart/50-errors.yaml b/tests/e2e/restart-sanity/50-errors.yaml similarity index 100% rename from tests/e2e/restart/50-errors.yaml rename to tests/e2e/restart-sanity/50-errors.yaml diff --git a/tests/e2e/restart/README.txt b/tests/e2e/restart-sanity/README.txt similarity index 100% rename from tests/e2e/restart/README.txt rename to tests/e2e/restart-sanity/README.txt diff --git a/tests/e2e/restart/setup-vdb/base/kustomization.yaml b/tests/e2e/restart-sanity/setup-vdb/base/kustomization.yaml similarity index 100% rename from tests/e2e/restart/setup-vdb/base/kustomization.yaml rename to tests/e2e/restart-sanity/setup-vdb/base/kustomization.yaml diff --git a/tests/e2e/restart/setup-vdb/base/setup-vdb.yaml b/tests/e2e/restart-sanity/setup-vdb/base/setup-vdb.yaml similarity index 100% rename from tests/e2e/restart/setup-vdb/base/setup-vdb.yaml rename to tests/e2e/restart-sanity/setup-vdb/base/setup-vdb.yaml From 2d173477fd42e243f0570829dbd674bcd371a802 Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 11:45:28 -0400 Subject: [PATCH 6/8] Apply review comment --- pkg/controllers/restart_reconcile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controllers/restart_reconcile.go b/pkg/controllers/restart_reconcile.go index 147933b99..cda2bf3aa 100644 
--- a/pkg/controllers/restart_reconcile.go +++ b/pkg/controllers/restart_reconcile.go @@ -402,7 +402,7 @@ func genRestartIPList(downPods []*PodFact) []string { // killOldProcesses will remove any running vertica processes. At this point, // we have determined the nodes are down, so we are cleaning up so that it -// doesn't impact the restart. This may include killed a pod that is in the +// doesn't impact the restart. This may include killing a pod that is in the // read-only state. For this reason, we requeue the iteration if anything is // killed so that status is updated before starting a restart; this is done for // the benefit of PD purposes and stability in the restart test. From b0dfdc0949674271788c82ddbfe87ae879ce2b3b Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 12:05:43 -0400 Subject: [PATCH 7/8] Add changie --- changes/unreleased/Fixed-20211206-120312.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 changes/unreleased/Fixed-20211206-120312.yaml diff --git a/changes/unreleased/Fixed-20211206-120312.yaml b/changes/unreleased/Fixed-20211206-120312.yaml new file mode 100755 index 000000000..6d229eddc --- /dev/null +++ b/changes/unreleased/Fixed-20211206-120312.yaml @@ -0,0 +1,5 @@ +kind: Fixed +body: Restart of a cluster that has nodes in read-only state. This is needed to run the operator with Vertica version 11.0.2 or newer. +time: 2021-12-06T12:03:12.188403858-04:00 +custom: + Issue: "113" From 63fe66746ec728172e03bb83c810332033d9ae02 Mon Sep 17 00:00:00 2001 From: Matt Spilchen Date: Mon, 6 Dec 2021 16:00:06 -0400 Subject: [PATCH 8/8] Apply review comments --- pkg/controllers/podfacts.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/podfacts.go b/pkg/controllers/podfacts.go index 02c810215..bf99d2e7d 100644 --- a/pkg/controllers/podfacts.go +++ b/pkg/controllers/podfacts.go @@ -370,11 +370,12 @@ func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, pf *PodFact) error { return err } pf.upNode = false - pf.readOnly = false } else { pf.upNode = true - pf.readOnly = false } + // This is called for server versions that don't have read-only state. So + // read-only will always be false. + pf.readOnly = false return nil }
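
The queryNodeStatus change in patch 1 leans on vsql's -tAc output format, where rows come back as pipe-separated columns such as "UP|t". The standalone Go sketch below (file and function names here are illustrative, not the operator's actual code) shows roughly how that output maps onto the upNode/readOnly pair and why empty or malformed rows are rejected:

// readonly_parse_sketch.go — a minimal, self-contained sketch (not the
// operator's real helper) of parsing the "node_state|is_readonly" row that
// the vsql query in patch 1 returns.
package main

import (
	"fmt"
	"strings"
)

// parseNodeState parses one row of "node_state|is_readonly" output,
// e.g. "UP|t\n". Anything other than exactly two columns is treated as
// malformed, which also covers the empty-output case.
func parseNodeState(stdout string) (upNode, readOnly bool, err error) {
	const expectedCols = 2
	row := strings.Split(strings.Split(stdout, "\n")[0], "|")
	if len(row) != expectedCols {
		return false, false, fmt.Errorf("expected %d columns, got %d", expectedCols, len(row))
	}
	return row[0] == "UP", row[1] == "t", nil
}

func main() {
	for _, out := range []string{"UP|t\n", "UP|f\n", "", "UP|z|garbage"} {
		up, ro, err := parseNodeState(out)
		fmt.Printf("%-14q up=%v readOnly=%v err=%v\n", out, up, ro, err)
	}
}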
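
Patches 2 and 4 change two selection rules: read-only nodes no longer count as "up" when deciding whether a full-cluster restart path is needed, and they do count as restartable. Here is a minimal sketch of those two predicates using a stripped-down stand-in for PodFact (the struct and function names below are assumed for illustration, not the operator's real types):

// restart_selection_sketch.go — an illustrative sketch, not the operator's
// code, of the read-only-aware counting and filtering added in patches 2 and 4.
package main

import "fmt"

type podFact struct {
	name         string
	isPodRunning bool
	dbExists     bool
	upNode       bool
	readOnly     bool
}

// upAndWritableCount mirrors the idea behind getUpNodeAndNotReadOnlyCount:
// read-only nodes do not count toward the "cluster is already up" check that
// lets the reconciler skip a cluster-level restart.
func upAndWritableCount(pods []podFact) int {
	n := 0
	for _, p := range pods {
		if p.upNode && !p.readOnly {
			n++
		}
	}
	return n
}

// restartablePods mirrors the idea behind findRestartablePods: a pod
// qualifies when its database exists and the pod is running, and it is
// either down or stuck in read-only mode.
func restartablePods(pods []podFact) []podFact {
	var out []podFact
	for _, p := range pods {
		if (!p.upNode || p.readOnly) && p.dbExists && p.isPodRunning {
			out = append(out, p)
		}
	}
	return out
}

func main() {
	cluster := []podFact{
		{name: "pod-0", isPodRunning: true, dbExists: true, upNode: true, readOnly: true},
		{name: "pod-1", isPodRunning: true, dbExists: true, upNode: true, readOnly: true},
		{name: "pod-2", isPodRunning: true, dbExists: true, upNode: false},
	}
	fmt.Println("up and writable:", upAndWritableCount(cluster)) // 0 -> take the full-cluster restart path
	for _, p := range restartablePods(cluster) {
		fmt.Println("restart candidate:", p.name) // all three pods qualify
	}
}

Treating read-only pods as restartable is the point of the change: as the comments in patch 2 note, a node sits in read-only state because cluster quorum was lost, so killing and restarting those nodes is how the cluster gets back to a writable state.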
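
The killOldProcesses rewrite in patch 4 detects whether anything was actually terminated by echoing a marker string before each kill and checking the command's stdout for it; a hit triggers a requeue so the status reconciler records the node as down before start_db runs. A rough, self-contained sketch of that signalling follows (helper names are hypothetical):

// kill_marker_sketch.go — a sketch of the requeue signal from patch 4;
// the command string matches the patch, the surrounding helpers are
// illustrative only.
package main

import (
	"fmt"
	"strings"
)

const killMarker = "Killing process"

// buildKillCmd returns the bash one-liner run in each pod; echoing the
// marker before each kill is what lets the caller detect that at least
// one vertica process was terminated.
func buildKillCmd() []string {
	return []string{
		"bash", "-c",
		fmt.Sprintf("for pid in $(pgrep ^vertica$); do echo \"%s $pid\"; kill -n SIGKILL $pid; done", killMarker),
	}
}

// shouldRequeue reports whether the reconcile iteration should be requeued,
// i.e. whether stdout shows that any process was killed.
func shouldRequeue(stdout string) bool {
	return strings.Contains(stdout, killMarker)
}

func main() {
	fmt.Println(buildKillCmd())
	fmt.Println("requeue:", shouldRequeue("Killing process 1234\n")) // true
	fmt.Println("requeue:", shouldRequeue(""))                       // false
}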