Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Online image change: changes for new admintools options in 11.1.0 #142

Merged
merged 5 commits into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions pkg/controllers/dbaddnode_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,17 @@ func (d *DBAddNodeReconciler) Reconcile(ctx context.Context, req *ctrl.Request)
// reconcileSubcluster will reconcile a single subcluster. Add node will be
// triggered if we determine that it hasn't been run.
func (d *DBAddNodeReconciler) reconcileSubcluster(ctx context.Context, sc *vapi.Subcluster) (ctrl.Result, error) {
if exists := d.PFacts.anyPodsMissingDB(sc.Name); exists.IsTrue() {
res, err := d.runAddNode(ctx, sc)
if err != nil || res.Requeue {
return res, err
if missingDB, unknownState := d.PFacts.anyPodsMissingDB(sc.Name); missingDB || unknownState {
if missingDB {
res, err := d.runAddNode(ctx, sc)
if err != nil || res.Requeue {
return res, err
}
}
if unknownState {
d.Log.Info("Requeue because some pods were not running")
}
return ctrl.Result{Requeue: unknownState}, nil
}
return ctrl.Result{}, nil
}
Expand Down
22 changes: 22 additions & 0 deletions pkg/controllers/dbaddnode_reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,26 @@ var _ = Describe("dbaddnode_reconcile", func() {
atCmd = fpr.FindCommands("select rebalance_shards('defaultsubcluster')")
Expect(len(atCmd)).Should(Equal(0))
})

It("should add node and requeue if one pod is missing db and another pod isn't running", func() {
vdb := vapi.MakeVDB()
vdb.Spec.Subclusters[0].Size = 3
createPods(ctx, vdb, AllPodsRunning)
defer deletePods(ctx, vdb)

fpr := &cmds.FakePodRunner{Results: make(cmds.CmdResults)}
pfacts := MakePodFacts(k8sClient, fpr)
Expect(pfacts.Collect(ctx, vdb)).Should(Succeed())
// Make a specific pod as not having a db.
podWithNoDB := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 1)
pfacts.Detail[podWithNoDB].dbExists = tristate.False
pfacts.Detail[podWithNoDB].upNode = false
// Make a specific pod as having an unknown state
podInUnknownState := names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 2)
pfacts.Detail[podInUnknownState].dbExists = tristate.None
r := MakeDBAddNodeReconciler(vrec, logger, vdb, fpr, &pfacts)
Expect(r.Reconcile(ctx, &ctrl.Request{})).Should(Equal(ctrl.Result{Requeue: true}))
lastCall := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "db_add_node")
Expect(len(lastCall)).Should(Equal(1))
})
})
2 changes: 1 addition & 1 deletion pkg/controllers/dbaddsubcluster_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func (d *DBAddSubclusterReconciler) Reconcile(ctx context.Context, req *ctrl.Req

// addMissingSubclusters will compare subclusters passed in and create any missing ones
func (d *DBAddSubclusterReconciler) addMissingSubclusters(ctx context.Context, scs []vapi.Subcluster) (ctrl.Result, error) {
atPod, ok := d.PFacts.findPodToRunAdmintools()
atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
if !ok || !atPod.upNode {
d.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
return ctrl.Result{Requeue: true}, nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/dbremovenode_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (d *DBRemoveNodeReconciler) removeNodesInSubcluster(ctx context.Context, sc
podsToRemove, requeueNeeded := d.findPodsSuitableForScaleDown(sc, startPodIndex, endPodIndex)
if len(podsToRemove) > 0 {
cmd := d.genCmdRemoveNode(podsToRemove)
atPod, ok := d.PFacts.findPodToRunAdmintools()
atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
if !ok {
// Requeue since we couldn't find a running pod
d.Log.Info("Requeue since we could not find a pod to run admintools")
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/dbremovesubcluster_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func (d *DBRemoveSubclusterReconciler) removeExtraSubclusters(ctx context.Contex
}

if len(subclusters) > 0 {
atPod, ok := d.PFacts.findPodToRunAdmintools()
atPod, ok := d.PFacts.findPodToRunAdmintoolsAny()
if !ok || !atPod.upNode {
d.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
return ctrl.Result{Requeue: true}, nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/init_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ func (g *GenericDatabaseInitializer) checkAndRunInit(ctx context.Context) (ctrl.
// runInit will physically setup the database.
// Depending on g.initializer, this will either do create_db or revive_db.
func (g *GenericDatabaseInitializer) runInit(ctx context.Context) (ctrl.Result, error) {
atPodFact, ok := g.PFacts.findPodToRunAdmintools()
atPodFact, ok := g.PFacts.findPodToRunAdmintoolsOffline()
if !ok {
// Could not find a runable pod to run from.
return ctrl.Result{Requeue: true}, nil
Expand Down
40 changes: 26 additions & 14 deletions pkg/controllers/podfacts.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,24 +452,25 @@ func (p *PodFacts) doesDBExist() tristate.TriState {
return returnOnFail
}

// anyPodsMissingDB will check whether each pod is added to the database
// It is a tristate return:
// - we return True if at least one runable pod doesn't have a database
// - we return False if all pods have a database
// - we return None if at least one pod we couldn't determine its state
func (p *PodFacts) anyPodsMissingDB(scName string) tristate.TriState {
returnOnFail := tristate.False
// anyPodsMissingDB will check whether each pod is added to the database.
// It returns two states:
// - missingDB is true if at least one pod was running and had a missing DB
// - unknownState is true if, for at least one pod, we could not determine
//   whether the DB exists -- because the pod was not running
func (p *PodFacts) anyPodsMissingDB(scName string) (missingDB, unknownState bool) {
missingDB = false
unknownState = false
for _, v := range p.Detail {
if v.subcluster != scName {
continue
}
if v.dbExists.IsFalse() && v.isPodRunning {
return tristate.True
if v.dbExists.IsFalse() {
missingDB = true
} else if v.dbExists.IsNone() {
returnOnFail = tristate.None
unknownState = true
}
}
return returnOnFail
return
}

// findPodsWithMissingDB will return a list of pod facts that have a missing DB
Expand Down Expand Up @@ -505,10 +506,10 @@ func (p *PodFacts) findPodToRunVsql() (*PodFact, bool) {
return &PodFact{}, false
}

// findPodToRunAdmintools returns the name of the pod we will exec into into
// order to run admintools
// findPodToRunAdmintoolsAny returns the name of the pod we will exec into in
// order to run admintools.
// Will return false for second parameter if no pod could be found.
func (p *PodFacts) findPodToRunAdmintools() (*PodFact, bool) {
func (p *PodFacts) findPodToRunAdmintoolsAny() (*PodFact, bool) {
// Our preference for the pod is as follows:
// - up and not read-only
// - up and read-only
Expand All @@ -531,6 +532,17 @@ func (p *PodFacts) findPodToRunAdmintools() (*PodFact, bool) {
return &PodFact{}, false
}

// findPodToRunAdmintoolsOffline will return a pod to run an offline admintools
// command. A candidate pod is one whose container is running and has the
// install done (isInstalled is true) but does not have the vertica daemon up
// (!upNode). If nothing is found, the second parameter returned will be false.
func (p *PodFacts) findPodToRunAdmintoolsOffline() (*PodFact, bool) {
	// p.Detail is a map, so which of several matching pods gets returned is
	// not deterministic across calls.
	for _, v := range p.Detail {
		if v.isInstalled.IsTrue() && v.isPodRunning && !v.upNode {
			return v, true
		}
	}
	// No suitable pod found; return an empty fact so callers never get nil.
	return &PodFact{}, false
}

// findRunningPod returns the first running pod. If no pods are running, this
// return false.
func (p *PodFacts) findRunningPod() (*PodFact, bool) {
Expand Down
20 changes: 14 additions & 6 deletions pkg/controllers/podfacts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,21 @@ var _ = Describe("podfacts", func() {
It("should verify anyPodsMissingDB return codes", func() {
pf := MakePodFacts(k8sClient, &cmds.FakePodRunner{})
pf.Detail[types.NamespacedName{Name: "p1"}] = &PodFact{dbExists: tristate.True, subcluster: "sc1"}
Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.False))
missingDB, unknownState := pf.anyPodsMissingDB("sc1")
Expect(missingDB).Should(Equal(false))
Expect(unknownState).Should(Equal(false))
pf.Detail[types.NamespacedName{Name: "p2"}] = &PodFact{dbExists: tristate.None, subcluster: "sc1"}
Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.None))
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: tristate.False, isPodRunning: true, subcluster: "sc1"}
Expect(pf.anyPodsMissingDB("sc1")).Should(Equal(tristate.True))
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: tristate.True, isPodRunning: true, subcluster: "sc2"}
Expect(pf.anyPodsMissingDB("sc2")).Should(Equal(tristate.False))
missingDB, unknownState = pf.anyPodsMissingDB("sc1")
Expect(missingDB).Should(Equal(false))
Expect(unknownState).Should(Equal(true))
pf.Detail[types.NamespacedName{Name: "p3"}] = &PodFact{dbExists: tristate.False, subcluster: "sc1"}
missingDB, unknownState = pf.anyPodsMissingDB("sc1")
Expect(missingDB).Should(Equal(true))
Expect(unknownState).Should(Equal(true))
pf.Detail[types.NamespacedName{Name: "p4"}] = &PodFact{dbExists: tristate.False, subcluster: "sc2"}
missingDB, unknownState = pf.anyPodsMissingDB("sc2")
Expect(missingDB).Should(Equal(true))
Expect(unknownState).Should(Equal(false))
})

It("should verify return of findPodsWithMissingDB", func() {
Expand Down
79 changes: 57 additions & 22 deletions pkg/controllers/restart_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/vertica/vertica-kubernetes/pkg/names"
"github.com/vertica/vertica-kubernetes/pkg/paths"
"github.com/vertica/vertica-kubernetes/pkg/status"
"github.com/vertica/vertica-kubernetes/pkg/version"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -111,7 +112,10 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result,
return ctrl.Result{Requeue: true}, nil
}

if ok := r.setATPod(); !ok {
// Find an AT pod. You must run with a pod that has no vertica process running.
// This is needed to be able to start the primaries when secondary read-only
// nodes could be running.
if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsOffline); !ok {
r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
return ctrl.Result{Requeue: true}, nil
}
Expand Down Expand Up @@ -139,7 +143,7 @@ func (r *RestartReconciler) reconcileCluster(ctx context.Context) (ctrl.Result,
return ctrl.Result{}, nil
}

return r.restartCluster(ctx)
return r.restartCluster(ctx, downPods)
}

// reconcileNodes will handle a subset of the pods. It will try to restart any
Expand All @@ -153,7 +157,7 @@ func (r *RestartReconciler) reconcileNodes(ctx context.Context) (ctrl.Result, er
// can't be restarted.
downPods := r.PFacts.findRestartablePods(r.RestartReadOnly, false)
if len(downPods) > 0 {
if ok := r.setATPod(); !ok {
if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsAny); !ok {
r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
return ctrl.Result{Requeue: true}, nil
}
Expand All @@ -175,7 +179,7 @@ func (r *RestartReconciler) reconcileNodes(ctx context.Context) (ctrl.Result, er
// have been installed but not yet added to a database.
reIPPods := r.PFacts.findReIPPods(true)
if len(reIPPods) > 0 {
if ok := r.setATPod(); !ok {
if ok := r.setATPod(r.PFacts.findPodToRunAdmintoolsAny); !ok {
r.Log.Info("No pod found to run admintools from. Requeue reconciliation.")
return ctrl.Result{Requeue: true}, nil
}
Expand Down Expand Up @@ -345,7 +349,7 @@ func (r *RestartReconciler) reipNodes(ctx context.Context, pods []*PodFact) (ctr
// Prior to calling re_ip, dump out the state of admintools.conf for PD purposes
debugDumpAdmintoolsConf(ctx, r.PRunner, r.ATPod)

cmd = genReIPCommand()
cmd = r.genReIPCommand()
if _, _, err := r.PRunner.ExecAdmintools(ctx, r.ATPod, names.ServerContainer, cmd...); err != nil {
// Log an event as failure to re_ip means we won't be able to bring up the database.
r.VRec.EVRec.Event(r.Vdb, corev1.EventTypeWarning, events.ReipFailed,
Expand All @@ -361,18 +365,8 @@ func (r *RestartReconciler) reipNodes(ctx context.Context, pods []*PodFact) (ctr

// restartCluster will call admintools -t start_db
// It is assumed that the cluster has already run re_ip.
func (r *RestartReconciler) restartCluster(ctx context.Context) (ctrl.Result, error) {
cmd := []string{
"-t", "start_db",
"--database=" + r.Vdb.Spec.DBName,
"--noprompt",
}
if r.Vdb.Spec.IgnoreClusterLease {
cmd = append(cmd, "--ignore-cluster-lease")
}
if r.Vdb.Spec.RestartTimeout != 0 {
cmd = append(cmd, fmt.Sprintf("--timeout=%d", r.Vdb.Spec.RestartTimeout))
}
func (r *RestartReconciler) restartCluster(ctx context.Context, downPods []*PodFact) (ctrl.Result, error) {
cmd := r.genStartDBCommand(downPods)
r.VRec.EVRec.Event(r.Vdb, corev1.EventTypeNormal, events.ClusterRestartStarted,
"Calling 'admintools -t start_db' to restart the cluster")
start := time.Now()
Expand Down Expand Up @@ -536,19 +530,60 @@ func genMapFileUploadCmd(mapFileContents []string) []string {
}

// genReIPCommand will return the command to run for the re_ip command
func genReIPCommand() []string {
return []string{
func (r *RestartReconciler) genReIPCommand() []string {
cmd := []string{
"-t", "re_ip",
"--file=" + AdminToolsMapFile,
"--noprompt",
}

// In 11.1, we added a --force option to re_ip to allow us to run it while
// some nodes are up. This was done to support doing a reip while there are
// read-only secondary nodes.
vinf, ok := version.MakeInfo(r.Vdb)
if ok && vinf.IsEqualOrNewer(version.ReIPAllowedWithUpNodesVersion) {
cmd = append(cmd, "--force")
}

return cmd
}

// genStartDBCommand will return the admintools command line for start_db.
// downPods is the list of pods that are currently down; on server versions
// that accept a host list, their IPs are passed via --hosts so only those
// nodes are started.
func (r *RestartReconciler) genStartDBCommand(downPods []*PodFact) []string {
	cmd := []string{
		"-t", "start_db",
		"--database=" + r.Vdb.Spec.DBName,
		"--noprompt",
	}
	if r.Vdb.Spec.IgnoreClusterLease {
		cmd = append(cmd, "--ignore-cluster-lease")
	}
	// A zero RestartTimeout means rely on the admintools default timeout.
	if r.Vdb.Spec.RestartTimeout != 0 {
		cmd = append(cmd, fmt.Sprintf("--timeout=%d", r.Vdb.Spec.RestartTimeout))
	}

	// In some versions, we can include a list of hosts to start. This
	// parameter becomes important for online upgrade as we use this to start
	// the primaries while the secondaries are in read-only mode.
	vinf, ok := version.MakeInfo(r.Vdb)
	if ok && vinf.IsEqualOrNewer(version.StartDBAcceptsHostListVersion) {
		// Pre-size the slice to avoid repeated growth while appending.
		hostNames := make([]string, 0, len(downPods))
		for _, pod := range downPods {
			hostNames = append(hostNames, pod.podIP)
		}
		cmd = append(cmd, "--hosts", strings.Join(hostNames, ","))
	}

	return cmd
}

// setATPod will set r.ATPod if not already set
func (r *RestartReconciler) setATPod() bool {
// setATPod will set r.ATPod if not already set.
// Caller can indicate whether there is a requirement that it must be run from a
// pod that is currently not running the vertica daemon.
func (r *RestartReconciler) setATPod(findFunc func() (*PodFact, bool)) bool {
// If we haven't done so already, figure out the pod to run admintools from.
if r.ATPod == (types.NamespacedName{}) {
atPod, ok := r.PFacts.findPodToRunAdmintools()
atPod, ok := findFunc()
if !ok {
return false
}
Expand Down
37 changes: 33 additions & 4 deletions pkg/controllers/restart_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
vapi "github.com/vertica/vertica-kubernetes/api/v1beta1"
"github.com/vertica/vertica-kubernetes/pkg/cmds"
"github.com/vertica/vertica-kubernetes/pkg/names"
"github.com/vertica/vertica-kubernetes/pkg/version"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -360,7 +361,7 @@ var _ = Describe("restart_reconciler", func() {
Expect(n2).Should(Equal("DOWN"))
})

It("should do full cluster restart if all up nodes are read-only", func() {
It("should do full cluster restart if none of the nodes are UP and not read-only", func() {
vdb := vapi.MakeVDB()
vdb.Spec.Subclusters[0].Size = 3
vdb.Spec.DBName = "db"
Expand All @@ -375,8 +376,10 @@ var _ = Describe("restart_reconciler", func() {
Expect(pfacts.Collect(ctx, vdb)).Should(Succeed())
for podIndex := int32(0); podIndex < vdb.Spec.Subclusters[0].Size; podIndex++ {
downPodNm := names.GenPodName(vdb, sc, podIndex)
pfacts.Detail[downPodNm].upNode = true
pfacts.Detail[downPodNm].readOnly = true
// At least one pod needs to be totally offline. Cannot have all of them read-only.
pfacts.Detail[downPodNm].upNode = podIndex != 0
pfacts.Detail[downPodNm].readOnly = podIndex != 0
pfacts.Detail[downPodNm].isInstalled = tristate.True
}

r := MakeRestartReconciler(vrec, logger, vdb, fpr, &pfacts, RestartProcessReadOnly)
Expand Down Expand Up @@ -434,7 +437,7 @@ var _ = Describe("restart_reconciler", func() {
act := MakeRestartReconciler(vrec, logger, vdb, fpr, pfacts, RestartProcessReadOnly)
r := act.(*RestartReconciler)
r.ATPod = atPod
Expect(r.restartCluster(ctx)).Should(Equal(ctrl.Result{}))
Expect(r.restartCluster(ctx, []*PodFact{})).Should(Equal(ctrl.Result{}))
restart := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "start_db")
Expect(len(restart)).Should(Equal(1))
Expect(restart[0].Command).Should(ContainElements("--ignore-cluster-lease"))
Expand Down Expand Up @@ -562,4 +565,30 @@ var _ = Describe("restart_reconciler", func() {
restart := fpr.FindCommands("/opt/vertica/bin/admintools", "-t", "restart_node")
Expect(len(restart)).Should(Equal(0))
})

It("should use --force option in reip if on version that supports it", func() {
vdb := vapi.MakeVDB()
fpr := &cmds.FakePodRunner{}
act := MakeRestartReconciler(vrec, logger, vdb, fpr, &PodFacts{}, RestartProcessReadOnly)
r := act.(*RestartReconciler)
vdb.Annotations[vapi.VersionAnnotation] = version.MinimumVersion
Expect(r.genReIPCommand()).ShouldNot(ContainElement("--force"))
vdb.Annotations[vapi.VersionAnnotation] = version.ReIPAllowedWithUpNodesVersion
Expect(r.genReIPCommand()).Should(ContainElement("--force"))
})

It("should use --hosts option in start_db if on version that supports it", func() {
vdb := vapi.MakeVDB()
fpr := &cmds.FakePodRunner{}
act := MakeRestartReconciler(vrec, logger, vdb, fpr, &PodFacts{}, RestartProcessReadOnly)
r := act.(*RestartReconciler)
downPods := []*PodFact{
{podIP: "9.10.1.1"},
{podIP: "9.10.1.2"},
}
vdb.Annotations[vapi.VersionAnnotation] = version.MinimumVersion
Expect(r.genStartDBCommand(downPods)).ShouldNot(ContainElement("--hosts"))
vdb.Annotations[vapi.VersionAnnotation] = version.StartDBAcceptsHostListVersion
Expect(r.genStartDBCommand(downPods)).Should(ContainElements("--hosts", "9.10.1.1,9.10.1.2"))
})
})
Loading