Online image change: changes for new admintools options in 11.1.0 (#142)
- When running admintools -t start_db, we need to run it from one of the
  primary nodes. It won't work from a read-only node that isn't being
  restarted.
- When calling re_ip, use the --force option. This option is new in 11.1.0,
  so we need conditional logic to know when we can use it.
- When calling start_db, we pass the list of hosts to start. This option
  first appeared in 11.0.1, so as with re_ip, we need conditional logic to
  know when we can use it. A sketch of this version-gating pattern follows
  below.
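
As an aside, here is a minimal, self-contained sketch of the version-gating
pattern the last two bullets describe. The atLeast helper, the database name,
and the file path are hypothetical stand-ins, not the operator's actual API;
the real checks use version.MakeInfo and IsEqualOrNewer inside genReIPCommand
and genStartDBCommand in the diff below.

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // atLeast reports whether dotted numeric version a is >= version b
    // (e.g. "11.1.0" >= "11.0.2"). It is a stand-in for the operator's
    // version.Info.IsEqualOrNewer check.
    func atLeast(a, b string) bool {
        as, bs := strings.Split(a, "."), strings.Split(b, ".")
        for i := 0; i < len(as) && i < len(bs); i++ {
            ai, _ := strconv.Atoi(as[i])
            bi, _ := strconv.Atoi(bs[i])
            if ai != bi {
                return ai > bi
            }
        }
        return len(as) >= len(bs)
    }

    // genReIPCmd builds the admintools re_ip arguments, adding --force only
    // when the server is 11.1.0 or newer.
    func genReIPCmd(serverVersion string) []string {
        cmd := []string{"-t", "re_ip", "--file=/tmp/map.txt", "--noprompt"}
        if atLeast(serverVersion, "11.1.0") {
            cmd = append(cmd, "--force")
        }
        return cmd
    }

    // genStartDBCmd builds the start_db arguments, passing --hosts only when
    // the server is 11.0.1 or newer.
    func genStartDBCmd(serverVersion string, downHosts []string) []string {
        cmd := []string{"-t", "start_db", "--database=vertdb", "--noprompt"}
        if atLeast(serverVersion, "11.0.1") {
            cmd = append(cmd, "--hosts", strings.Join(downHosts, ","))
        }
        return cmd
    }

    func main() {
        fmt.Println(genReIPCmd("11.0.2"))                          // no --force
        fmt.Println(genReIPCmd("11.1.0"))                          // has --force
        fmt.Println(genStartDBCmd("11.1.0", []string{"10.0.0.1"})) // has --hosts
    }

Keying the gate off the server version means older servers are never handed
flags they don't understand.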

One thing that isn't related to the title of this PR is some new logic needed
in DBAddNodeReconciler. That reconciler will now requeue if some pods aren't
yet ready. This is needed so that the upgrade properly waits for the transient
subcluster to scale out. Prior to this change, it was possible for the image
change to go ahead and restart the primaries before the transient was up. This
should be solved now.
spilchen authored Jan 19, 2022
1 parent e7d2b2e commit fc5bc85
Showing 11 changed files with 170 additions and 54 deletions.
14 changes: 10 additions & 4 deletions pkg/controllers/dbaddnode_reconcile.go
@@ -75,11 +75,17 @@ func (d *DBAddNodeReconciler) Reconcile(ctx context.Context, req *ctrl.Request)
 // reconcileSubcluster will reconcile a single subcluster. Add node will be
 // triggered if we determine that it hasn't been run.
 func (d *DBAddNodeReconciler) reconcileSubcluster(ctx context.Context, sc *vapi.Subcluster) (ctrl.Result, error) {
-    if exists := d.PFacts.anyPodsMissingDB(sc.Name); exists.IsTrue() {
-        res, err := d.runAddNode(ctx, sc)
-        if err != nil || res.Requeue {
-            return res, err
+    if missingDB, unknownState := d.PFacts.anyPodsMissingDB(sc.Name); missingDB || unknownState {
+        if missingDB {
+            res, err := d.runAddNode(ctx, sc)
+            if err != nil || res.Requeue {
+                return res, err
+            }
         }
+        if unknownState {
+            d.Log.Info("Requeue because some pods were not running")
+        }
+        return ctrl.Result{Requeue: unknownState}, nil
     }
     return ctrl.Result{}, nil
 }
22 changes: 22 additions & 0 deletions pkg/controllers/dbaddnode_reconcile_test.go
@@ -134,4 +134,26 @@ var _ = Describe("dbaddnode_reconcile", func() {
         atCmd = fpr.FindCommands("select rebalance_shards('defaultsubcluster')")
         Expect(len(atCmd)).Should(Equal(0))
     })
+
+    It("should add node and requeue if one pod is missing db and another pod isn't running", func() {
+        vdb := vapi.MakeVDB()
+        vdb.Spec.Subclusters[0].Size = 3
+        createPods(ctx, vdb, AllPodsRunning)
+        defer deletePods(ctx, vdb)
+
+        fpr := &cmds.FakePodRunner{Results: make(cmds.CmdResults)}
+        pfacts := MakePodFacts(k8sClient, fpr)
+        Expect(pfacts.Collect(ctx, vdb)).Should(Succeed())
+        // Mark a specific pod as not having a db.
+        podWithNoDB := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 1)
+        pfacts.Detail[podWithNoDB].dbExists = tristate.False
+        pfacts.Detail[podWithNoDB].upNode = false
+        // Mark a specific pod as having an unknown state.
+        podInUnknownState := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 2)
+        pfacts.Detail[podInUnknownState].dbExists = tristate.None
+        r := MakeDBAddNodeReconciler(vrec, logger, vdb, fpr, &pfacts)
+        Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
+        lastCall := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
+        Expect(len(lastCall)).Should(Equal(1))
+    })
 })
2 changes: 1 addition & 1 deletion pkg/controllers/dbaddsubcluster_reconcile.go
@@ -64,7 +64,7 @@ func (d *DBAddSubclusterReconciler) Reconcile(ctx context.Context, req *ctrl.Req
 
 // addMissingSubclusters will compare subclusters passed in and create any missing ones
 func (d *DBAddSubclusterReconciler) addMissingSubclusters(ctx context.Context, scs []vapi.Subcluster) (ctrl.Result, error) {
-    atPod, ok := d.PFacts.findPodToRunAdmintools()
+    atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
     if !ok || !atPod.upNode {
         d.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
         return ctrl.Result{Requeue: true}, nil
2 changes: 1 addition & 1 deletion pkg/controllers/dbremovenode_reconcile.go
@@ -110,7 +110,7 @@ func (d *DBRemoveNodeReconciler) removeNodesInSubcluster(ctx context.Context, sc
     podsToRemove, requeueNeeded := d.findPodsSuitableForScaleDown(sc, startPodIndex, endPodIndex)
     if len(podsToRemove) > 0 {
         cmd := d.genCmdRemoveNode(podsToRemove)
-        atPod, ok := d.PFacts.findPodToRunAdmintools()
+        atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
         if !ok {
             // Requeue since we couldn't find a running pod
             d.Log.Info("Requeue since we could not find a pod to run admintools")
2 changes: 1 addition & 1 deletion pkg/controllers/dbremovesubcluster_reconcile.go
@@ -70,7 +70,7 @@ func (d *DBRemoveSubclusterReconciler) removeExtraSubclusters(ctx context.Contex
     }
 
     if len(subclusters) > 0 {
-        atPod, ok := d.PFacts.findPodToRunAdmintools()
+        atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
         if !ok || !atPod.upNode {
             d.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
             return ctrl.Result{Requeue: true}, nil
2 changes: 1 addition & 1 deletion pkg/controllers/init_db.go
@@ -78,7 +78,7 @@ func (g *GenericDatabaseInitializer) checkAndRunInit(ctx context.Context) (ctrl.
 // runInit will physically setup the database.
 // Depending on g.initializer, this will either do create_db or revive_db.
 func (g *GenericDatabaseInitializer) runInit(ctx context.Context) (ctrl.Result, error) {
-    atPodFact, ok := g.PFacts.findPodToRunAdmintools()
+    atPodFact, ok := g.PFacts.findPodToRunAdmintoolsOffline()
     if !ok {
         // Could not find a runable pod to run from.
         return ctrl.Result{Requeue: true}, nil
40 changes: 26 additions & 14 deletions pkg/controllers/podfacts.go
@@ -452,24 +452,25 @@ func (p *PodFacts) doesDBExist() tristate.TriState {
     return returnOnFail
 }
 
-// anyPodsMissingDB will check whether each pod is added to the database
-// It is a tristate return:
-// - we return True if at least one runable pod doesn't have a database
-// - we return False if all pods have a database
-// - we return None if at least one pod we couldn't determine its state
-func (p *PodFacts) anyPodsMissingDB(scName string) tristate.TriState {
-    returnOnFail := tristate.False
+// anyPodsMissingDB will check whether each pod is added to the database.
+// It returns two states:
+// - missingDB is true if at least one running pod has a missing DB
+// - unknownState is true if there is at least one pod for which we could not
+//   determine whether the DB was there -- due to the pod not running
+func (p *PodFacts) anyPodsMissingDB(scName string) (missingDB, unknownState bool) {
+    missingDB = false
+    unknownState = false
     for _, v := range p.Detail {
         if v.subcluster != scName {
             continue
         }
-        if v.dbExists.IsFalse() && v.isPodRunning {
-            return tristate.True
+        if v.dbExists.IsFalse() {
+            missingDB = true
         } else if v.dbExists.IsNone() {
-            returnOnFail = tristate.None
+            unknownState = true
         }
     }
-    return returnOnFail
+    return
 }
 
 // findPodsWithMisstingDB will return a list of pods facts that have a missing DB
@@ -505,10 +506,10 @@ func (p *PodFacts) findPodToRunVsql() (*PodFact, bool) {
     return &PodFact{}, false
 }
 
-// findPodToRunAdmintools returns the name of the pod we will exec into into
-// order to run admintools
+// findPodToRunAdmintoolsAny returns the name of the pod we will exec into in
+// order to run admintools.
 // Will return false for second parameter if no pod could be found.
-func (p *PodFacts) findPodToRunAdmintools() (*PodFact, bool) {
+func (p *PodFacts) findPodToRunAdmintoolsAny() (*PodFact, bool) {
     // Our preference for the pod is as follows:
     // - up and not read-only
     // - up and read-only
@@ -531,6 +532,17 @@ func (p *PodFacts) findPodToRunAdmintools() (*PodFact, bool) {
     return &PodFact{}, false
 }
 
+// findPodToRunAdmintoolsOffline will return a pod to run an offline admintools
+// command. If nothing is found, the second parameter returned will be false.
+func (p *PodFacts) findPodToRunAdmintoolsOffline() (*PodFact, bool) {
+    for _, v := range p.Detail {
+        if v.isInstalled.IsTrue() && v.isPodRunning && !v.upNode {
+            return v, true
+        }
+    }
+    return &PodFact{}, false
+}
+
 // findRunningPod returns the first running pod. If no pods are running, this
 // return false.
 func (p *PodFacts) findRunningPod() (*PodFact, bool) {
20 changes: 14 additions & 6 deletions pkg/controllers/podfacts_test.go
@@ -129,13 +129,21 @@ var _ = Describe("podfacts", func() {
     It("should verify anyPodsMissingDB return codes", func() {
         pf := MakePodFacts(k8sClient, &cmds.FakePodRunner{})
         pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{dbExists: tristate.True, subcluster: "sc1"}
-        Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.False))
+        missingDB, unknownState := pf.anyPodsMissingDB("sc1")
+        Expect(missingDB).Should(Equal(false))
+        Expect(unknownState).Should(Equal(false))
         pf.Detail[types.NamespacedName{Name: "p2"}] = &PodFact{dbExists: tristate.None, subcluster: "sc1"}
-        Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.None))
-        pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: tristate.False, isPodRunning: true, subcluster: "sc1"}
-        Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.True))
-        pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: tristate.True, isPodRunning: true, subcluster: "sc2"}
-        Expect(pf.anyPodsMissingDB("sc2")).Should(Equal(tristate.False))
+        missingDB, unknownState = pf.anyPodsMissingDB("sc1")
+        Expect(missingDB).Should(Equal(false))
+        Expect(unknownState).Should(Equal(true))
+        pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: tristate.False, subcluster: "sc1"}
+        missingDB, unknownState = pf.anyPodsMissingDB("sc1")
+        Expect(missingDB).Should(Equal(true))
+        Expect(unknownState).Should(Equal(true))
+        pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: tristate.False, subcluster: "sc2"}
+        missingDB, unknownState = pf.anyPodsMissingDB("sc2")
+        Expect(missingDB).Should(Equal(true))
+        Expect(unknownState).Should(Equal(false))
     })
 
     It("should verify return of findPodsWithMissingDB", func() {
79 changes: 57 additions & 22 deletions pkg/controllers/restart_reconcile.go
@@ -29,6 +29,7 @@ import (
     "github.com/vertica/vertica-kubernetes/pkg/names"
     "github.com/vertica/vertica-kubernetes/pkg/paths"
     "github.com/vertica/vertica-kubernetes/pkg/status"
+    "github.com/vertica/vertica-kubernetes/pkg/version"
     corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
     ctrl "sigs.k8s.io/controller-runtime"
@@ -111,7 +112,10 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result,
         return ctrl.Result{Requeue: true}, nil
     }
 
-    if ok := r.setATPod(); !ok {
+    // Find an AT pod. You must run with a pod that has no vertica process running.
+    // This is needed to be able to start the primaries when secondary read-only
+    // nodes could be running.
+    if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsOffline); !ok {
         r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
         return ctrl.Result{Requeue: true}, nil
     }
@@ -139,7 +143,7 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result,
         return ctrl.Result{}, nil
     }
 
-    return r.restartCluster(ctx)
+    return r.restartCluster(ctx, downPods)
 }
 
 // reconcileNodes will handle a subset of the pods. It will try to restart any
@@ -153,7 +157,7 @@ func (r *RestartReconciler) reconcileNodes(ctx context.Context) (ctrl.Result, er
     // can't be restarted.
     downPods := r.PFacts.findRestartablePods(r.RestartReadOnly, false)
     if len(downPods) > 0 {
-        if ok := r.setATPod(); !ok {
+        if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsAny); !ok {
             r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
             return ctrl.Result{Requeue: true}, nil
         }
@@ -175,7 +179,7 @@ func (r *RestartReconciler) reconcileNodes(ctx context.Context) (ctrl.Result, er
     // have been installed but not yet added to a database.
     reIPPods := r.PFacts.findReIPPods(true)
     if len(reIPPods) > 0 {
-        if ok := r.setATPod(); !ok {
+        if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsAny); !ok {
             r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
             return ctrl.Result{Requeue: true}, nil
         }
@@ -345,7 +349,7 @@ func (r *RestartReconciler) reipNodes(ctx context.Context, pods []*PodFact) (ctr
     // Prior to calling re_ip, dump out the state of admintools.conf for PD purposes
     debugDumpAdmintoolsConf(ctx, r.PRunner, r.ATPod)
 
-    cmd = genReIPCommand()
+    cmd = r.genReIPCommand()
     if _, _, err := r.PRunner.ExecAdmintools(ctx, r.ATPod, names.ServerContainer, cmd...); err != nil {
         // Log an event as failure to re_ip means we won't be able to bring up the database.
         r.VRec.EVRec.Event(r.Vdb, corev1.EventTypeWarning, events.ReipFailed,
@@ -361,18 +365,8 @@
 
 // restartCluster will call admintools -t start_db
 // It is assumed that the cluster has already run re_ip.
-func (r *RestartReconciler) restartCluster(ctx context.Context) (ctrl.Result, error) {
-    cmd := []string{
-        "-t", "start_db",
-        "--database=" + r.Vdb.Spec.DBName,
-        "--noprompt",
-    }
-    if r.Vdb.Spec.IgnoreClusterLease {
-        cmd = append(cmd, "--ignore-cluster-lease")
-    }
-    if r.Vdb.Spec.RestartTimeout != 0 {
-        cmd = append(cmd, fmt.Sprintf("--timeout=%d", r.Vdb.Spec.RestartTimeout))
-    }
+func (r *RestartReconciler) restartCluster(ctx context.Context, downPods []*PodFact) (ctrl.Result, error) {
+    cmd := r.genStartDBCommand(downPods)
     r.VRec.EVRec.Event(r.Vdb, corev1.EventTypeNormal, events.ClusterRestartStarted,
         "Calling 'admintools -t start_db' to restart the cluster")
     start := time.Now()
@@ -536,19 +530,60 @@ func genMapFileUploadCmd(mapFileContents []string) []string {
 }
 
 // genReIPCommand will return the command to run for the re_ip command
-func genReIPCommand() []string {
-    return []string{
+func (r *RestartReconciler) genReIPCommand() []string {
+    cmd := []string{
         "-t", "re_ip",
         "--file=" + AdminToolsMapFile,
         "--noprompt",
     }
+
+    // In 11.1, we added a --force option to re_ip to allow us to run it while
+    // some nodes are up. This was done to support doing a re_ip while there
+    // are read-only secondary nodes.
+    vinf, ok := version.MakeInfo(r.Vdb)
+    if ok && vinf.IsEqualOrNewer(version.ReIPAllowedWithUpNodesVersion) {
+        cmd = append(cmd, "--force")
+    }
+
+    return cmd
+}
+
+// genStartDBCommand will return the command for start_db
+func (r *RestartReconciler) genStartDBCommand(downPods []*PodFact) []string {
+    cmd := []string{
+        "-t", "start_db",
+        "--database=" + r.Vdb.Spec.DBName,
+        "--noprompt",
+    }
+    if r.Vdb.Spec.IgnoreClusterLease {
+        cmd = append(cmd, "--ignore-cluster-lease")
+    }
+    if r.Vdb.Spec.RestartTimeout != 0 {
+        cmd = append(cmd, fmt.Sprintf("--timeout=%d", r.Vdb.Spec.RestartTimeout))
+    }
+
+    // In some versions, we can include a list of hosts to start. This
+    // parameter becomes important for online upgrade as we use it to start
+    // the primaries while the secondaries are in read-only.
+    vinf, ok := version.MakeInfo(r.Vdb)
+    if ok && vinf.IsEqualOrNewer(version.StartDBAcceptsHostListVersion) {
+        hostNames := []string{}
+        for _, pod := range downPods {
+            hostNames = append(hostNames, pod.podIP)
+        }
+        cmd = append(cmd, "--hosts", strings.Join(hostNames, ","))
+    }
+
+    return cmd
+}
 
-// setATPod will set r.ATPod if not already set
-func (r *RestartReconciler) setATPod() bool {
+// setATPod will set r.ATPod if not already set.
+// The caller can indicate whether it must be run from a pod that is currently
+// not running the vertica daemon.
+func (r *RestartReconciler) setATPod(findFunc func() (*PodFact, bool)) bool {
     // If we haven't done so already, figure out the pod to run admintools from.
     if r.ATPod == (types.NamespacedName{}) {
-        atPod, ok := r.PFacts.findPodToRunAdmintools()
+        atPod, ok := findFunc()
         if !ok {
             return false
         }
37 changes: 33 additions & 4 deletions pkg/controllers/restart_reconciler_test.go
@@ -26,6 +26,7 @@ import (
     vapi "github.com/vertica/vertica-kubernetes/api/v1beta1"
     "github.com/vertica/vertica-kubernetes/pkg/cmds"
     "github.com/vertica/vertica-kubernetes/pkg/names"
+    "github.com/vertica/vertica-kubernetes/pkg/version"
     corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
     ctrl "sigs.k8s.io/controller-runtime"
@@ -360,7 +361,7 @@ var _ = Describe("restart_reconciler", func() {
         Expect(n2).Should(Equal("DOWN"))
     })
 
-    It("should do full cluster restart if all up nodes are read-only", func() {
+    It("should do full cluster restart if none of the nodes are UP and not read-only", func() {
         vdb := vapi.MakeVDB()
         vdb.Spec.Subclusters[0].Size = 3
         vdb.Spec.DBName = "db"
@@ -375,8 +376,10 @@ var _ = Describe("restart_reconciler", func() {
         Expect(pfacts.Collect(ctx, vdb)).Should(Succeed())
         for podIndex := int32(0); podIndex < vdb.Spec.Subclusters[0].Size; podIndex++ {
             downPodNm := names.GenPodName(vdb, sc, podIndex)
-            pfacts.Detail[downPodNm].upNode = true
-            pfacts.Detail[downPodNm].readOnly = true
+            // At least one pod needs to be totally offline. Cannot have all of them read-only.
+            pfacts.Detail[downPodNm].upNode = podIndex != 0
+            pfacts.Detail[downPodNm].readOnly = podIndex != 0
+            pfacts.Detail[downPodNm].isInstalled = tristate.True
         }
 
         r := MakeRestartReconciler(vrec, logger, vdb, fpr, &pfacts, RestartProcessReadOnly)
@@ -434,7 +437,7 @@ var _ = Describe("restart_reconciler", func() {
         act := MakeRestartReconciler(vrec, logger, vdb, fpr, pfacts, RestartProcessReadOnly)
         r := act.(*RestartReconciler)
         r.ATPod = atPod
-        Expect(r.restartCluster(ctx)).Should(Equal(ctrl.Result{}))
+        Expect(r.restartCluster(ctx, []*PodFact{})).Should(Equal(ctrl.Result{}))
         restart := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "start_db")
         Expect(len(restart)).Should(Equal(1))
         Expect(restart[0].Command).Should(ContainElements("--ignore-cluster-lease"))
@@ -562,4 +565,30 @@ var _ = Describe("restart_reconciler", func() {
         restart := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "restart_node")
         Expect(len(restart)).Should(Equal(0))
     })
+
+    It("should use --force option in reip if on version that supports it", func() {
+        vdb := vapi.MakeVDB()
+        fpr := &cmds.FakePodRunner{}
+        act := MakeRestartReconciler(vrec, logger, vdb, fpr, &PodFacts{}, RestartProcessReadOnly)
+        r := act.(*RestartReconciler)
+        vdb.Annotations[vapi.VersionAnnotation] = version.MinimumVersion
+        Expect(r.genReIPCommand()).ShouldNot(ContainElement("--force"))
+        vdb.Annotations[vapi.VersionAnnotation] = version.ReIPAllowedWithUpNodesVersion
+        Expect(r.genReIPCommand()).Should(ContainElement("--force"))
+    })
+
+    It("should use --hosts option in start_db if on version that supports it", func() {
+        vdb := vapi.MakeVDB()
+        fpr := &cmds.FakePodRunner{}
+        act := MakeRestartReconciler(vrec, logger, vdb, fpr, &PodFacts{}, RestartProcessReadOnly)
+        r := act.(*RestartReconciler)
+        downPods := []*PodFact{
+            {podIP: "9.10.1.1"},
+            {podIP: "9.10.1.2"},
+        }
+        vdb.Annotations[vapi.VersionAnnotation] = version.MinimumVersion
+        Expect(r.genStartDBCommand(downPods)).ShouldNot(ContainElement("--hosts"))
+        vdb.Annotations[vapi.VersionAnnotation] = version.StartDBAcceptsHostListVersion
+        Expect(r.genStartDBCommand(downPods)).Should(ContainElements("--hosts", "9.10.1.1,9.10.1.2"))
+    })
 })