diff --git a/changes/unreleased/Fixed-20230529-084329.yaml b/changes/unreleased/Fixed-20230529-084329.yaml new file mode 100644 index 000000000..cbae9b4a2 --- /dev/null +++ b/changes/unreleased/Fixed-20230529-084329.yaml @@ -0,0 +1,5 @@ +kind: Fixed +body: Fix timing that causes db add node before install +time: 2023-05-29T08:43:29.63332563-03:00 +custom: + Issue: "411" diff --git a/pkg/controllers/vdb/dbaddnode_reconciler.go b/pkg/controllers/vdb/dbaddnode_reconciler.go index 980973aa9..2477e46b7 100644 --- a/pkg/controllers/vdb/dbaddnode_reconciler.go +++ b/pkg/controllers/vdb/dbaddnode_reconciler.go @@ -17,6 +17,7 @@ package vdb import ( "context" + "sort" "strings" "time" @@ -73,21 +74,46 @@ func (d *DBAddNodeReconciler) Reconcile(ctx context.Context, req *ctrl.Request) return ctrl.Result{}, nil } +// findAddNodePods will return a list of pods facts that require add node. +// It will only return pods if all of the ones with a missing DB can run add node. +// If at least one pod is found that is unavailable for add node, the search +// will abort. The list will be ordered by pod index. +func (d *DBAddNodeReconciler) findAddNodePods(scName string) ([]*PodFact, ctrl.Result) { + podList := []*PodFact{} + for _, v := range d.PFacts.Detail { + if v.subclusterName != scName { + continue + } + if !v.dbExists { + if !v.isPodRunning || !v.isInstalled { + // We want to group all of the add nodes in a single admintools call. + // Doing so limits the impact on any running queries. So if there is at + // least one pod that cannot run add node, we requeue until that pod is + // available before we proceed with the admintools call. + d.Log.Info("Requeue add node because some pods were not available", "pod", v.name, "isPodRunning", + v.isPodRunning, "installed", v.isInstalled) + return nil, ctrl.Result{Requeue: true} + } + podList = append(podList, v) + } + } + // Return an ordered list by pod index for easier debugging + sort.Slice(podList, func(i, j int) bool { + return podList[i].dnsName < podList[j].dnsName + }) + return podList, ctrl.Result{} +} + // reconcileSubcluster will reconcile a single subcluster. Add node will be // triggered if we determine that it hasn't been run. func (d *DBAddNodeReconciler) reconcileSubcluster(ctx context.Context, sc *vapi.Subcluster) (ctrl.Result, error) { - addNodePods, somePodsNotRunning := d.PFacts.findPodsWithMissingDB(sc.Name) - - // We want to group all of the add nodes in a single admintools call. - // Doing so limits the impact on any running queries. So if there is at - // least one pod with unknown state, we requeue until that pod is - // running before we proceed with the admintools call. - if somePodsNotRunning { - d.Log.Info("Requeue add node because some pods were not running") - return ctrl.Result{Requeue: true}, nil + addNodePods, res := d.findAddNodePods(sc.Name) + if verrors.IsReconcileAborted(res, nil) { + return res, nil } if len(addNodePods) > 0 { - res, err := d.runAddNode(ctx, addNodePods) + var err error + res, err = d.runAddNode(ctx, addNodePods) return res, err } return ctrl.Result{}, nil diff --git a/pkg/controllers/vdb/dbaddnode_reconciler_test.go b/pkg/controllers/vdb/dbaddnode_reconciler_test.go index fd9dcb63e..ab9e5d18c 100644 --- a/pkg/controllers/vdb/dbaddnode_reconciler_test.go +++ b/pkg/controllers/vdb/dbaddnode_reconciler_test.go @@ -149,13 +149,20 @@ var _ = Describe("dbaddnode_reconcile", func() { pfacts.Detail[podWithNoDB].dbExists = false pfacts.Detail[podWithNoDB].upNode = false pfacts.Detail[podWithNoDB].isPodRunning = false + pfacts.Detail[podWithNoDB].isInstalled = false r := MakeDBAddNodeReconciler(vdbRec, logger, vdb, fpr, pfacts) Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true})) lastCall := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node") Expect(len(lastCall)).Should(Equal(0)) - // Retry reconcile but make pod running + // Retry reconcile but make pod running. This should still fail because + // install hasn't completed. pfacts.Detail[podWithNoDB].isPodRunning = true + Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true})) + lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node") + Expect(len(lastCall)).Should(Equal(0)) + + pfacts.Detail[podWithNoDB].isInstalled = true Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{})) lastCall = fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node") Expect(len(lastCall)).Should(Equal(1)) diff --git a/pkg/controllers/vdb/podfacts.go b/pkg/controllers/vdb/podfacts.go index 4ef9483ef..aa597c016 100644 --- a/pkg/controllers/vdb/podfacts.go +++ b/pkg/controllers/vdb/podfacts.go @@ -778,32 +778,6 @@ func (p *PodFacts) doesDBExist() bool { return false } -// findPodsWithMisstingDB will return a list of pods facts that have a missing DB. -// It will only return pods that are running and that match the given -// subcluster. If no pods are found an empty list is returned. The list will be -// ordered by pod index. We also return a bool indicating if at least one pod -// that has a missing DB wasn't running. -func (p *PodFacts) findPodsWithMissingDB(scName string) ([]*PodFact, bool) { - podsHaveMissingDBAndNotRunning := false - hostList := []*PodFact{} - for _, v := range p.Detail { - if v.subclusterName != scName { - continue - } - if !v.dbExists { - if !v.isPodRunning { - podsHaveMissingDBAndNotRunning = true - } - hostList = append(hostList, v) - } - } - // Return an ordered list by pod index for easier debugging - sort.Slice(hostList, func(i, j int) bool { - return hostList[i].dnsName < hostList[j].dnsName - }) - return hostList, podsHaveMissingDBAndNotRunning -} - // findPodToRunVsql returns the name of the pod we will exec into in // order to run vsql // Will return false for second parameter if no pod could be found. diff --git a/pkg/controllers/vdb/podfacts_test.go b/pkg/controllers/vdb/podfacts_test.go index 27a0928f3..a341412ce 100644 --- a/pkg/controllers/vdb/podfacts_test.go +++ b/pkg/controllers/vdb/podfacts_test.go @@ -100,48 +100,6 @@ var _ = Describe("podfacts", func() { Expect(pf.doesDBExist()).Should(BeTrue()) }) - It("should verify findPodsWithMissingDB return codes", func() { - pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{}) - pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{dbExists: true, subclusterName: "sc1", isPodRunning: true} - pods, somePodsNotRunning := pf.findPodsWithMissingDB("sc1") - Expect(len(pods)).Should(Equal(0)) - Expect(somePodsNotRunning).Should(Equal(false)) - pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: false, subclusterName: "sc1", isPodRunning: true} - pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc1") - Expect(len(pods)).Should(Equal(1)) - Expect(somePodsNotRunning).Should(Equal(false)) - pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: false, subclusterName: "sc2", isPodRunning: false} - pods, somePodsNotRunning = pf.findPodsWithMissingDB("sc2") - Expect(len(pods)).Should(Equal(1)) - Expect(somePodsNotRunning).Should(Equal(true)) - }) - - It("should verify return of findPodsWithMissingDB", func() { - pf := MakePodFacts(vdbRec, &cmds.FakePodRunner{}) - pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{ - dnsName: "p1", subclusterName: "sc1", dbExists: true, - } - pf.Detail[types.NamespacedName{Name: "p2"}] = &PodFact{ - dnsName: "p2", subclusterName: "sc1", dbExists: false, - } - pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{ - dnsName: "p3", subclusterName: "sc1", dbExists: false, - } - pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{ - dnsName: "p4", subclusterName: "sc2", dbExists: false, - } - pf.Detail[types.NamespacedName{Name: "p5"}] = &PodFact{ - dnsName: "p5", subclusterName: "sc2", dbExists: false, - } - pods, _ := pf.findPodsWithMissingDB("sc1") - Expect(len(pods)).Should(Equal(2)) - Expect(pods[0].dnsName).Should(Equal("p2")) - pods, _ = pf.findPodsWithMissingDB("sc2") - Expect(len(pods)).Should(Equal(2)) - Expect(pods[0].dnsName).Should(Equal("p4")) - Expect(pods[1].dnsName).Should(Equal("p5")) - }) - It("should verify return of countNotReadOnlyWithOldImage", func() { const OldImage = "image:v1" const NewImage = "image:v2"