Skip to content

Commit

Permalink
Fix timing that causes db add node before install (#411)
Browse files Browse the repository at this point in the history
There is a small timing window with the operator where we would attempt
to do an add node before a pod has run install. This isn't a huge
problem as the add node would fail and do things in the correct order in
the subsequent reconcile iteration. But fixing this will cut down on the
amount of noise in the operator log.

The fix is to check for the install state when we run add node. If we
find a pod with a missing db and it doesn't have an install, we will
requeue rather than attempting the add node.
  • Loading branch information
spilchen authored May 30, 2023
1 parent b465d7c commit 90d4181
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 79 deletions.
5 changes: 5 additions & 0 deletions changes/unreleased/Fixed-20230529-084329.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
kind: Fixed
body: Fix timing that causes db add node before install
time: 2023-05-29T08:43:29.63332563-03:00
custom:
Issue: "411"
46 changes: 36 additions & 10 deletions pkg/controllers/vdb/dbaddnode_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package vdb

import (
"context"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -73,21 +74,46 @@ func (d *DBAddNodeReconciler) Reconcile(ctx context.Context, req *ctrl.Request)
return ctrl.Result{}, nil
}

// findAddNodePods will return a list of pods facts that require add node.
// It will only return pods if all of the ones with a missing DB can run add node.
// If at least one pod is found that is unavailable for add node, the search
// will abort. The list will be ordered by pod index.
func (d *DBAddNodeReconciler) findAddNodePods(scName string) ([]*PodFact, ctrl.Result) {
podList := []*PodFact{}
for _, v := range d.PFacts.Detail {
if v.subclusterName != scName {
continue
}
if !v.dbExists {
if !v.isPodRunning || !v.isInstalled {
// We want to group all of the add nodes in a single admintools call.
// Doing so limits the impact on any running queries. So if there is at
// least one pod that cannot run add node, we requeue until that pod is
// available before we proceed with the admintools call.
d.Log.Info("Requeue add node because some pods were not available", "pod", v.name, "isPodRunning",
v.isPodRunning, "installed", v.isInstalled)
return nil, ctrl.Result{Requeue: true}
}
podList = append(podList, v)
}
}
// Return an ordered list by pod index for easier debugging
sort.Slice(podList, func(i, j int) bool {
return podList[i].dnsName < podList[j].dnsName
})
return podList, ctrl.Result{}
}

// reconcileSubcluster will reconcile a single subcluster. Add node will be
// triggered if we determine that it hasn't been run.
func (d *DBAddNodeReconciler) reconcileSubcluster(ctx context.Context, sc *vapi.Subcluster) (ctrl.Result, error) {
addNodePods, somePodsNotRunning := d.PFacts.findPodsWithMissingDB(sc.Name)

// We want to group all of the add nodes in a single admintools call.
// Doing so limits the impact on any running queries. So if there is at
// least one pod with unknown state, we requeue until that pod is
// running before we proceed with the admintools call.
if somePodsNotRunning {
d.Log.Info("Requeue add node because some pods were not running")
return ctrl.Result{Requeue: true}, nil
addNodePods, res := d.findAddNodePods(sc.Name)
if verrors.IsReconcileAborted(res, nil) {
return res, nil
}
if len(addNodePods) > 0 {
res, err := d.runAddNode(ctx, addNodePods)
var err error
res, err = d.runAddNode(ctx, addNodePods)
return res, err
}
return ctrl.Result{}, nil
Expand Down
9 changes: 8 additions & 1 deletion pkg/controllers/vdb/dbaddnode_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,20 @@ var _ = Describe("dbaddnode_reconcile", func() {
pfacts.Detail[podWithNoDB].dbExists = false
pfacts.Detail[podWithNoDB].upNode = false
pfacts.Detail[podWithNoDB].isPodRunning = false
pfacts.Detail[podWithNoDB].isInstalled = false
r := MakeDBAddNodeReconciler(vdbRec, logger, vdb, fpr, pfacts)
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
lastCall := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(0))

// Retry reconcile but make pod running
// Retry reconcile but make pod running. This should still fail because
// install hasn't completed.
pfacts.Detail[podWithNoDB].isPodRunning = true
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(0))

pfacts.Detail[podWithNoDB].isInstalled = true
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{}))
lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(1))
Expand Down
26 changes: 0 additions & 26 deletions pkg/controllers/vdb/podfacts.go
Original file line number Diff line number Diff line change
Expand Up @@ -778,32 +778,6 @@ func (p *PodFacts) doesDBExist() bool {
return false
}

// findPodsWithMisstingDB will return a list of pods facts that have a missing DB.
// It will only return pods that are running and that match the given
// subcluster. If no pods are found an empty list is returned. The list will be
// ordered by pod index. We also return a bool indicating if at least one pod
// that has a missing DB wasn't running.
func (p *PodFacts) findPodsWithMissingDB(scName string) ([]*PodFact, bool) {
podsHaveMissingDBAndNotRunning := false
hostList := []*PodFact{}
for _, v := range p.Detail {
if v.subclusterName != scName {
continue
}
if !v.dbExists {
if !v.isPodRunning {
podsHaveMissingDBAndNotRunning = true
}
hostList = append(hostList, v)
}
}
// Return an ordered list by pod index for easier debugging
sort.Slice(hostList, func(i, j int) bool {
return hostList[i].dnsName < hostList[j].dnsName
})
return hostList, podsHaveMissingDBAndNotRunning
}

// findPodToRunVsql returns the name of the pod we will exec into in
// order to run vsql
// Will return false for second parameter if no pod could be found.
Expand Down
42 changes: 0 additions & 42 deletions pkg/controllers/vdb/podfacts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,48 +100,6 @@ var _ = Describe("podfacts", func() {
Expect(pf.doesDBExist()).Should(BeTrue())
})

It("should verify findPodsWithMissingDB return codes", func() {
pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{})
pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{dbExists: true, subclusterName: "sc1", isPodRunning: true}
pods, somePodsNotRunning := pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(0))
Expect(somePodsNotRunning).Should(Equal(false))
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: false, subclusterName: "sc1", isPodRunning: true}
pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(1))
Expect(somePodsNotRunning).Should(Equal(false))
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: false, subclusterName: "sc2", isPodRunning: false}
pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc2")
Expect(len(pods)).Should(Equal(1))
Expect(somePodsNotRunning).Should(Equal(true))
})

It("should verify return of findPodsWithMissingDB", func() {
pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{})
pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{
dnsName: "p1", subclusterName: "sc1", dbExists: true,
}
pf.Detail[types.NamespacedName{Name: "p2"}] = &PodFact{
dnsName: "p2", subclusterName: "sc1", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{
dnsName: "p3", subclusterName: "sc1", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{
dnsName: "p4", subclusterName: "sc2", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p5"}] = &PodFact{
dnsName: "p5", subclusterName: "sc2", dbExists: false,
}
pods, _ := pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(2))
Expect(pods[0].dnsName).Should(Equal("p2"))
pods, _ = pf.findPodsWithMissingDB("sc2")
Expect(len(pods)).Should(Equal(2))
Expect(pods[0].dnsName).Should(Equal("p4"))
Expect(pods[1].dnsName).Should(Equal("p5"))
})

It("should verify return of countNotReadOnlyWithOldImage", func() {
const OldImage = "image:v1"
const NewImage = "image:v2"
Expand Down

0 comments on commit 90d4181

Please sign in to comment.