Skip to content

Commit

Permalink
Fix operator install issues during upgrade in vcluster mode (vertica#517
Browse files Browse the repository at this point in the history
)

When trying to upgrade and scale up at the same time, the operator got
stuck at `OfflineUpgradeReconciler.InstallReconciler`, which prevented it
from moving on to the next actors to perform the scale up. To fix that,
we relaxed `InstallReconciler` so that it no longer waits for all pods (in
podfacts) to do the install before moving to the next actor.
  • Loading branch information
roypaulin authored Sep 25, 2023
1 parent 7064306 commit 1a8dceb
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 17 deletions.
20 changes: 6 additions & 14 deletions pkg/controllers/vdb/install_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,12 @@ func (d *InstallReconciler) Reconcile(ctx context.Context, _ *ctrl.Request) (ctr
}

// installForVClusterOps will go through the install phase for vclusterOps.
// It only generates the http certs and requeue if at least one pod has not done the install
// It only generates the http certs.
func (d *InstallReconciler) installForVClusterOps(ctx context.Context) (ctrl.Result, error) {
hasUninstalledPods, err := d.generateHTTPCerts(ctx)
err := d.generateHTTPCerts(ctx)
if err != nil {
return ctrl.Result{}, err
}
if hasUninstalledPods {
// We do not proceed to the next actor until
// all pods have done the install
d.Log.Info("Requeue reconcile cycle because not all nodes have done the install for vclusterOps")
return ctrl.Result{Requeue: true}, nil
}
return ctrl.Result{}, nil
}

Expand Down Expand Up @@ -191,8 +185,7 @@ func (d *InstallReconciler) createConfigDirsIfNecessary(ctx context.Context) err

// generateHTTPCerts will generate the necessary config file to be able to start and
// communicate with the Vertica's https server.
func (d *InstallReconciler) generateHTTPCerts(ctx context.Context) (bool, error) {
installedPodCount := 0
func (d *InstallReconciler) generateHTTPCerts(ctx context.Context) error {
for _, p := range d.PFacts.Detail {
if !p.isPodRunning {
continue
Expand All @@ -202,20 +195,19 @@ func (d *InstallReconciler) generateHTTPCerts(ctx context.Context) (bool, error)
secretName := names.GenNamespacedName(d.Vdb, d.Vdb.Spec.HTTPServerTLSSecret)
fname, err := frwt.GenConf(ctx, d.VRec.Client, secretName)
if err != nil {
return false, errors.Wrap(err, fmt.Sprintf("failed generating the %s file", paths.HTTPTLSConfFileName))
return errors.Wrap(err, fmt.Sprintf("failed generating the %s file", paths.HTTPTLSConfFileName))
}
_, _, err = d.PRunner.CopyToPod(ctx, p.name, names.ServerContainer, fname,
fmt.Sprintf("%s/%s", paths.HTTPTLSConfDir, paths.HTTPTLSConfFileName))
_ = os.Remove(fname)
if err != nil {
return false, errors.Wrap(err, fmt.Sprintf("failed to copy %s to the pod %s", fname, p.name))
return errors.Wrap(err, fmt.Sprintf("failed to copy %s to the pod %s", fname, p.name))
}
// Invalidate the pod facts cache since it's out of date due to the https generation
d.PFacts.Invalidate()
}
installedPodCount++
}
return installedPodCount != len(d.PFacts.Detail), nil
return nil
}

// getInstallTargets finds the list of hosts/pods that we need to initialize the config for
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/vdb/install_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ var _ = Describe("k8s/install_reconcile_test", func() {
pfact := MakePodFacts(vdbRec, fpr)
Expect(pfact.Collect(ctx, vdb)).Should(Succeed())
pfact.Detail[names.GenPodName(vdb, sc, 1)].isPodRunning = false
cmds := reconcileAndFindHTTPTLSConfFileName(ctx, vdb, fpr, &pfact, true)
cmds := reconcileAndFindHTTPTLSConfFileName(ctx, vdb, fpr, &pfact, false)
Expect(len(cmds)).Should(Equal(int(vdb.Spec.Subclusters[0].Size) - 1))
})

Expand Down
2 changes: 1 addition & 1 deletion scripts/setup-kustomize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -750,4 +750,4 @@ done
for tdir in e2e-leg-4/pvc-expansion/verify-pvc-change
do
create_volume_expansion_overlay $tdir volume-expansion-enabled volume-expansion-disabled
done
done
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ spec:
size: 2
# Set requeueTime since we are intentionally failing. This prevents the
# exponential backoff kicking in, which can cause the test to timeout.
requeueTime: 5
requeueTime: 5

0 comments on commit 1a8dceb

Please sign in to comment.