Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix timing that causes db add node before install #411

Merged
merged 2 commits into from
May 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changes/unreleased/Fixed-20230529-084329.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
kind: Fixed
body: Fix timing that causes db add node before install
time: 2023-05-29T08:43:29.63332563-03:00
custom:
Issue: "411"
46 changes: 36 additions & 10 deletions pkg/controllers/vdb/dbaddnode_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package vdb

import (
"context"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -73,21 +74,46 @@ func (d *DBAddNodeReconciler) Reconcile(ctx context.Context, req *ctrl.Request)
return ctrl.Result{}, nil
}

// findAddNodePods will return a list of pods facts that require add node.
// It will only return pods if all of the ones with a missing DB can run add node.
// If at least one pod is found that is unavailable for add node, the search
// will abort. The list will be ordered by pod index.
func (d *DBAddNodeReconciler) findAddNodePods(scName string) ([]*PodFact, ctrl.Result) {
podList := []*PodFact{}
for _, v := range d.PFacts.Detail {
if v.subclusterName != scName {
continue
}
if !v.dbExists {
if !v.isPodRunning || !v.isInstalled {
// We want to group all of the add nodes in a single admintools call.
// Doing so limits the impact on any running queries. So if there is at
// least one pod that cannot run add node, we requeue until that pod is
// available before we proceed with the admintools call.
d.Log.Info("Requeue add node because some pods were not available", "pod", v.name, "isPodRunning",
v.isPodRunning, "installed", v.isInstalled)
return nil, ctrl.Result{Requeue: true}
}
podList = append(podList, v)
}
}
// Return an ordered list by pod index for easier debugging
sort.Slice(podList, func(i, j int) bool {
return podList[i].dnsName < podList[j].dnsName
})
return podList, ctrl.Result{}
}

// reconcileSubcluster will reconcile a single subcluster. Add node will be
// triggered if we determine that it hasn't been run.
func (d *DBAddNodeReconciler) reconcileSubcluster(ctx context.Context, sc *vapi.Subcluster) (ctrl.Result, error) {
addNodePods, somePodsNotRunning := d.PFacts.findPodsWithMissingDB(sc.Name)

// We want to group all of the add nodes in a single admintools call.
// Doing so limits the impact on any running queries. So if there is at
// least one pod with unknown state, we requeue until that pod is
// running before we proceed with the admintools call.
if somePodsNotRunning {
d.Log.Info("Requeue add node because some pods were not running")
return ctrl.Result{Requeue: true}, nil
addNodePods, res := d.findAddNodePods(sc.Name)
if verrors.IsReconcileAborted(res, nil) {
return res, nil
}
if len(addNodePods) > 0 {
res, err := d.runAddNode(ctx, addNodePods)
var err error
res, err = d.runAddNode(ctx, addNodePods)
return res, err
}
return ctrl.Result{}, nil
Expand Down
9 changes: 8 additions & 1 deletion pkg/controllers/vdb/dbaddnode_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,20 @@ var _ = Describe("dbaddnode_reconcile", func() {
pfacts.Detail[podWithNoDB].dbExists = false
pfacts.Detail[podWithNoDB].upNode = false
pfacts.Detail[podWithNoDB].isPodRunning = false
pfacts.Detail[podWithNoDB].isInstalled = false
r := MakeDBAddNodeReconciler(vdbRec, logger, vdb, fpr, pfacts)
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
lastCall := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(0))

// Retry reconcile but make pod running
// Retry reconcile but make pod running. This should still fail because
// install hasn't completed.
pfacts.Detail[podWithNoDB].isPodRunning = true
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(0))

pfacts.Detail[podWithNoDB].isInstalled = true
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{}))
lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(1))
Expand Down
26 changes: 0 additions & 26 deletions pkg/controllers/vdb/podfacts.go
Original file line number Diff line number Diff line change
Expand Up @@ -778,32 +778,6 @@ func (p *PodFacts) doesDBExist() bool {
return false
}

// findPodsWithMisstingDB will return a list of pods facts that have a missing DB.
// It will only return pods that are running and that match the given
// subcluster. If no pods are found an empty list is returned. The list will be
// ordered by pod index. We also return a bool indicating if at least one pod
// that has a missing DB wasn't running.
func (p *PodFacts) findPodsWithMissingDB(scName string) ([]*PodFact, bool) {
podsHaveMissingDBAndNotRunning := false
hostList := []*PodFact{}
for _, v := range p.Detail {
if v.subclusterName != scName {
continue
}
if !v.dbExists {
if !v.isPodRunning {
podsHaveMissingDBAndNotRunning = true
}
hostList = append(hostList, v)
}
}
// Return an ordered list by pod index for easier debugging
sort.Slice(hostList, func(i, j int) bool {
return hostList[i].dnsName < hostList[j].dnsName
})
return hostList, podsHaveMissingDBAndNotRunning
}

// findPodToRunVsql returns the name of the pod we will exec into in
// order to run vsql
// Will return false for second parameter if no pod could be found.
Expand Down
42 changes: 0 additions & 42 deletions pkg/controllers/vdb/podfacts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,48 +100,6 @@ var _ = Describe("podfacts", func() {
Expect(pf.doesDBExist()).Should(BeTrue())
})

It("should verify findPodsWithMissingDB return codes", func() {
pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{})
pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{dbExists: true, subclusterName: "sc1", isPodRunning: true}
pods, somePodsNotRunning := pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(0))
Expect(somePodsNotRunning).Should(Equal(false))
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: false, subclusterName: "sc1", isPodRunning: true}
pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(1))
Expect(somePodsNotRunning).Should(Equal(false))
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: false, subclusterName: "sc2", isPodRunning: false}
pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc2")
Expect(len(pods)).Should(Equal(1))
Expect(somePodsNotRunning).Should(Equal(true))
})

It("should verify return of findPodsWithMissingDB", func() {
pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{})
pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{
dnsName: "p1", subclusterName: "sc1", dbExists: true,
}
pf.Detail[types.NamespacedName{Name: "p2"}] = &PodFact{
dnsName: "p2", subclusterName: "sc1", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{
dnsName: "p3", subclusterName: "sc1", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{
dnsName: "p4", subclusterName: "sc2", dbExists: false,
}
pf.Detail[types.NamespacedName{Name: "p5"}] = &PodFact{
dnsName: "p5", subclusterName: "sc2", dbExists: false,
}
pods, _ := pf.findPodsWithMissingDB("sc1")
Expect(len(pods)).Should(Equal(2))
Expect(pods[0].dnsName).Should(Equal("p2"))
pods, _ = pf.findPodsWithMissingDB("sc2")
Expect(len(pods)).Should(Equal(2))
Expect(pods[0].dnsName).Should(Equal("p4"))
Expect(pods[1].dnsName).Should(Equal("p5"))
})

It("should verify return of countNotReadOnlyWithOldImage", func() {
const OldImage = "image:v1"
const NewImage = "image:v2"
Expand Down