Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restart to handle read-only Vertica nodes #113

Merged
merged 8 commits into from
Dec 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions api/v1beta1/verticadb_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,10 @@ type SubclusterStatus struct {
// A count of the number of pods that have a running vertica process in this subcluster.
UpNodeCount int32 `json:"upNodeCount"`

// +operator-sdk:csv:customresourcedefinitions:type=status
// A count of the number of pods that are in read-only state in this subcluster.
ReadOnlyCount int32 `json:"readOnlyCount"`

// +operator-sdk:csv:customresourcedefinitions:type=status
Detail []VerticaDBPodStatus `json:"detail"`
}
Expand All @@ -612,6 +616,9 @@ type VerticaDBPodStatus struct {
// True means the vertica process is running on this pod and it can accept
// connections on port 5433.
UpNode bool `json:"upNode"`
// +operator-sdk:csv:customresourcedefinitions:type=status
// True means the vertica process on this pod is in read-only state
ReadOnly bool `json:"readOnly"`
}

//+kubebuilder:object:root=true
Expand Down
5 changes: 5 additions & 0 deletions changes/unreleased/Fixed-20211206-120312.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
kind: Fixed
body: Restart of a cluster that has nodes in read-only state. This is needed to run the operator with Vertica version 11.0.2 or newer.
time: 2021-12-06T12:03:12.188403858-04:00
custom:
Issue: "113"
96 changes: 87 additions & 9 deletions pkg/controllers/podfacts.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/vertica/vertica-kubernetes/pkg/cmds"
"github.com/vertica/vertica-kubernetes/pkg/names"
"github.com/vertica/vertica-kubernetes/pkg/paths"
"github.com/vertica/vertica-kubernetes/pkg/version"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -76,6 +77,9 @@ type PodFact struct {
// port 5433.
upNode bool

// true means the node is up, but in read-only state
readOnly bool

// The vnode name that Vertica assigned to this pod.
vnodeName string

Expand Down Expand Up @@ -197,7 +201,7 @@ func (p *PodFacts) collectPodByStsIndex(ctx context.Context, vdb *vapi.VerticaDB
fns := []func(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error{
p.checkIsInstalled,
p.checkIsDBCreated,
p.checkIfNodeIsUp,
p.checkIfNodeIsUpAndReadOnly,
p.checkEulaAcceptance,
p.checkLogrotateExists,
p.checkIsLogrotateWritable,
Expand Down Expand Up @@ -333,13 +337,30 @@ func (p *PodFacts) checkIsDBCreated(ctx context.Context, vdb *vapi.VerticaDB, pf
return nil
}

// checkIfNodeIsUp will determine whether Vertica process is running in the pod.
func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error {
// checkIfNodeIsUpAndReadOnly fills in pf.upNode and pf.readOnly for a pod.
//
// When the database is known not to exist, or the pod is not running, both
// facts are set to false and no command is run in the pod. Otherwise the check
// is delegated based on the server version: versions that know about the
// read-only state are asked for it directly, while older versions (or an
// unknown version) use the plain connectivity check.
func (p *PodFacts) checkIfNodeIsUpAndReadOnly(ctx context.Context, vdb *vapi.VerticaDB, pf *PodFact) error {
	if pf.dbExists.IsFalse() || !pf.isPodRunning {
		pf.upNode, pf.readOnly = false, false
		return nil
	}

	// Read-only is a node state introduced in 11.0.2, so it can only be
	// queried on 11.0.2+. When the version cannot be determined, or is older,
	// fall back to the check that always reports read-only as disabled.
	if vinf, ok := version.MakeInfo(vdb); !ok || !vinf.IsEqualOrNewer(version.NodesHaveReadOnlyStateVersion) {
		return p.checkIfNodeIsUp(ctx, pf)
	}
	return p.queryNodeStatus(ctx, pf)
}

// checkIfNodeIsUp will check if Vertica is up and running in this pod.
// It assumes the pod is running and the database exists. It doesn't check for
// read-only state. This exists for backwards compatibility with versions older
// than 11.0.2.
func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, pf *PodFact) error {
cmd := []string{
"-c",
"select 1",
Expand All @@ -352,10 +373,54 @@ func (p *PodFacts) checkIfNodeIsUp(ctx context.Context, vdb *vapi.VerticaDB, pf
} else {
pf.upNode = true
}
// This is called for server versions that don't have read-only state. So
// read-only will always be false.
pf.readOnly = false

return nil
}

// queryNodeStatus runs a vsql query against the nodes system table, from
// inside the pod, to learn whether the node is up and whether it is in
// read-only state. The results are stored in pf.upNode and pf.readOnly.
//
// It assumes the database exists and the pod is running. A failure to connect
// to the server is not an error: the node is simply recorded as down. Any
// other command failure, or output that cannot be parsed, is returned.
func (p *PodFacts) queryNodeStatus(ctx context.Context, pf *PodFact) error {
	cmd := []string{
		"-tAc",
		"select node_state, is_readonly " +
			"from nodes " +
			"where node_name in (select node_name from current_session)",
	}

	stdout, stderr, err := p.PRunner.ExecVSQL(ctx, pf.name, names.ServerContainer, cmd...)
	if err == nil {
		// The query ran; the facts come straight from its output.
		pf.upNode, pf.readOnly, err = parseNodeStateAndReadOnly(stdout)
		return err
	}

	// Only a connection failure is treated as "node is down"; everything
	// else is an unexpected error for the caller to handle.
	if !strings.Contains(stderr, "vsql: could not connect to server:") {
		return err
	}
	pf.upNode = false
	pf.readOnly = false
	return nil
}

// parseNodeStateAndReadOnly parses the output of the node status query to
// determine whether a node is up and whether it is in read-only state.
//
// Only the first line of stdout is examined. It must hold exactly two
// pipe-separated columns, node_state and is_readonly, for example:
//
//	UP|t
//
// which yields upNode == true and readOnly == true. Whitespace around the
// line and around each column is ignored, so output with CRLF line endings
// ("UP|t\r\n") is parsed the same as output with plain newlines.
//
// An error is returned, with both results false, when the first line does not
// contain exactly two columns. This includes empty output.
func parseNodeStateAndReadOnly(stdout string) (upNode, readOnly bool, err error) {
	const expectedCols = 2

	// Isolate the first line; anything after it is ignored.
	firstLine := stdout
	if nl := strings.IndexByte(stdout, '\n'); nl >= 0 {
		firstLine = stdout[:nl]
	}

	cols := strings.Split(strings.TrimSpace(firstLine), "|")
	if len(cols) != expectedCols {
		// The count may be too low or too high, so don't claim "only".
		return false, false, fmt.Errorf("expected %d columns from node query but got %d", expectedCols, len(cols))
	}

	upNode = strings.TrimSpace(cols[0]) == "UP"
	readOnly = strings.TrimSpace(cols[1]) == "t"
	return upNode, readOnly, nil
}

// parseVerticaNodeName extract the vertica node name from the directory list
func parseVerticaNodeName(stdout string) string {
re := regexp.MustCompile(`(v_.+_node\d+)_data`)
Expand Down Expand Up @@ -467,9 +532,11 @@ func (p *PodFacts) findRunningPod() (*PodFact, bool) {

// findRestartablePods returns a list of pod facts that can be restarted.
// An empty list implies there are no pods that need to be restarted.
// Read-only nodes are treated as restartable because they enter the
// read-only state when the cluster loses quorum.
func (p *PodFacts) findRestartablePods() []*PodFact {
return p.filterPods(func(v *PodFact) bool {
return !v.upNode && v.dbExists.IsTrue() && v.isPodRunning
return (!v.upNode || v.readOnly) && v.dbExists.IsTrue() && v.isPodRunning
})
}

Expand Down Expand Up @@ -554,13 +621,24 @@ func (p *PodFacts) countNotRunning() int {
// getUpNodeCount returns the number of up nodes.
// A pod is considered down if it doesn't have a running vertica process.
func (p *PodFacts) getUpNodeCount() int {
var count = 0
for _, v := range p.Detail {
return p.countPods(func(v *PodFact) int {
if v.upNode {
count++
return 1
}
}
return count
return 0
})
}

// getUpNodeAndNotReadOnlyCount returns how many nodes are both up and
// writable. Starting in 11.0SP2 a node can be up yet only in read-only state;
// such nodes are excluded from this count.
func (p *PodFacts) getUpNodeAndNotReadOnlyCount() int {
	upAndWritable := func(pod *PodFact) int {
		if !pod.upNode || pod.readOnly {
			return 0
		}
		return 1
	}
	return p.countPods(upAndWritable)
}

// genPodNames will generate a string of pods names given a list of pods
Expand Down
50 changes: 46 additions & 4 deletions pkg/controllers/podfacts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
vapi "github.com/vertica/vertica-kubernetes/api/v1beta1"
"github.com/vertica/vertica-kubernetes/pkg/cmds"
"github.com/vertica/vertica-kubernetes/pkg/names"
"github.com/vertica/vertica-kubernetes/pkg/version"
appsv1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/types"
"yunion.io/x/pkg/tristate"
Expand Down Expand Up @@ -182,7 +183,7 @@ var _ = Describe("podfacts", func() {
}
pfs := MakePodFacts(k8sClient, fpr)
pf := &PodFact{name: pn, isPodRunning: true}
Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).Should(Succeed())
Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed())
Expect(pf.upNode).Should(BeFalse())
})

Expand All @@ -198,11 +199,11 @@ var _ = Describe("podfacts", func() {
}
pfs := MakePodFacts(k8sClient, fpr)
pf := &PodFact{name: pn, isPodRunning: true}
Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).Should(Succeed())
Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed())
Expect(pf.upNode).Should(BeTrue())
})

It("should fail if checkIfNodeIsUp gets an unexpected error", func() {
It("should fail if checkIfNodeIsUpAndReadOnly gets an unexpected error", func() {
vdb := vapi.MakeVDB()
pn := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0)
fpr := &cmds.FakePodRunner{
Expand All @@ -214,7 +215,30 @@ var _ = Describe("podfacts", func() {
}
pfs := MakePodFacts(k8sClient, fpr)
pf := &PodFact{name: pn, isPodRunning: true}
Expect(pfs.checkIfNodeIsUp(ctx, vdb, pf)).ShouldNot(Succeed())
// Run with no annotation
Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).ShouldNot(Succeed())
// Run with 11.0.2 annotation
vdb.Annotations[vapi.VersionAnnotation] = version.NodesHaveReadOnlyStateVersion
fpr.Results[pn] = []cmds.CmdResult{{Err: errors.New("unexpected error"), Stderr: "unknown error"}}
Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).ShouldNot(Succeed())
})

// Verifies that a VerticaDB annotated with the first version that has the
// read-only state gets both upNode and readOnly populated from query output.
It("checkIfNodeIsUpAndReadOnly should check for read-only on 11.0.2 servers", func() {
vdb := vapi.MakeVDB()
// The version annotation is what selects the read-only aware query path.
vdb.Annotations[vapi.VersionAnnotation] = version.NodesHaveReadOnlyStateVersion
pn := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0)
// The fake runner hands back a single canned result for the pod: a node that
// is UP ("UP") and read-only ("t").
fpr := &cmds.FakePodRunner{
Results: cmds.CmdResults{
pn: []cmds.CmdResult{
{Stdout: "UP|t"},
},
},
}
pfs := MakePodFacts(k8sClient, fpr)
pf := &PodFact{name: pn, isPodRunning: true}
Expect(pfs.checkIfNodeIsUpAndReadOnly(ctx, vdb, pf)).Should(Succeed())
Expect(pf.upNode).Should(BeTrue())
Expect(pf.readOnly).Should(BeTrue())
})

It("should parse out the compat21 node name from install indicator file", func() {
Expand All @@ -235,4 +259,22 @@ var _ = Describe("podfacts", func() {
Expect(pf.isInstalled).Should(Equal(tristate.True))
Expect(pf.compat21NodeName).Should(Equal("node0010"))
})

// Exercises parseNodeStateAndReadOnly directly with well-formed and malformed
// query output.
It("should parse read-only state from node query", func() {
// Up and read-only.
upNode1, readOnly1, err := parseNodeStateAndReadOnly("UP|t\n")
Expect(err).Should(Succeed())
Expect(upNode1).Should(BeTrue())
Expect(readOnly1).Should(BeTrue())

// Up and writable.
upNode2, readOnly2, err := parseNodeStateAndReadOnly("UP|f\n")
Expect(err).Should(Succeed())
Expect(upNode2).Should(BeTrue())
Expect(readOnly2).Should(BeFalse())

// Empty output does not have the two expected columns.
_, _, err = parseNodeStateAndReadOnly("")
Expect(err).ShouldNot(Succeed())

// Three columns is also rejected.
_, _, err = parseNodeStateAndReadOnly("UP|z|garbage")
Expect(err).ShouldNot(Succeed())
})
})
Loading