diff --git a/api/v1beta1/verticadb_types.go b/api/v1beta1/verticadb_types.go index 02b147db9..e7cf8905f 100644 --- a/api/v1beta1/verticadb_types.go +++ b/api/v1beta1/verticadb_types.go @@ -146,14 +146,6 @@ type VerticaDBSpec struct { // running with a Vertica version that supports read-only subclusters. ImageChangePolicy ImageChangePolicyType `json:"imageChangePolicy"` - // +kubebuilder:validation:Optional - // +operator-sdk:csv:customresourcedefinitions:type=spec,xDescriptors="urn:alm:descriptor:com.tectonic.ui:hidden" - // When doing an online image change, we utilize a transient subcluster to - // serve traffic while one of the other subclusters restart. This is the - // size of that subcluster. This subcluster is created at the beginning of - // the online image change and is removed when that process cleans up. - TransientSubclusterSize int `json:"transientSubclusterSize,omitempty"` - // +kubebuilder:validation:Optional // +operator-sdk:csv:customresourcedefinitions:type=spec,xDescriptors={"urn:alm:descriptor:com.tectonic.ui:fieldDependency:initPolicy:Revive","urn:alm:descriptor:com.tectonic.ui:advanced"} // This specifies the order of nodes when doing a revive. Each entry @@ -199,6 +191,15 @@ type VerticaDBSpec struct { //+operator-sdk:csv:customresourcedefinitions:type=spec Subclusters []Subcluster `json:"subclusters"` + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec,xDescriptors="urn:alm:descriptor:com.tectonic.ui:hidden" + // When doing an online image change, we utilize a transient subcluster to + // serve traffic while one of the other subclusters restarts. This is the + // template used to create that subcluster. This subcluster is created at the + // beginning of the online image change and is removed when that process + // cleans up. + TransientSubclusterTemplate Subcluster `json:"transientSubclusterTemplate,omitempty"` + // +kubebuilder:default:="1" // +kubebuilder:validation:Optional // +operator-sdk:csv:customresourcedefinitions:type=spec,xDescriptors={"urn:alm:descriptor:com.tectonic.ui:select:0","urn:alm:descriptor:com.tectonic.ui:select:1","urn:alm:descriptor:com.tectonic.ui:advanced"} diff --git a/pkg/controllers/builder.go b/pkg/controllers/builder.go index fbd9e694c..b73012d7c 100644 --- a/pkg/controllers/builder.go +++ b/pkg/controllers/builder.go @@ -29,8 +29,8 @@ import ( ) const ( - SuperuserPasswordPath = "superuser-passwd" - TransientSubclusterName = "transient" + SuperuserPasswordPath = "superuser-passwd" + DefaultTransientSubclusterName = "transient" ) // buildExtSvc creates desired spec for the external service. 
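A rough sketch of how a user might populate the new field (the values here are hypothetical; an empty name or a non-positive size falls back to the defaults wired up in the builder.go helpers below):

    vdb := vapi.MakeVDB()
    vdb.Spec.TransientSubclusterTemplate = vapi.Subcluster{
        Name: "transient-pool", // hypothetical; "" falls back to DefaultTransientSubclusterName
        Size: 1,                // values <= 0 fall back to a size of 1
    }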
@@ -586,21 +586,37 @@ func getK8sAffinity(a vapi.Affinity) *corev1.Affinity { // buildTransientSubcluster creates a temporary read-only subcluster based on an // existing subcluster -func buildTransientSubcluster(sc *vapi.Subcluster, imageOverride string) *vapi.Subcluster { +func buildTransientSubcluster(vdb *vapi.VerticaDB, sc *vapi.Subcluster, imageOverride string) *vapi.Subcluster { return &vapi.Subcluster{ - Name: TransientSubclusterName, - Size: 1, + Name: transientSubclusterName(vdb), + Size: transientSubclusterSize(vdb), IsTransient: true, ImageOverride: imageOverride, IsPrimary: false, - NodeSelector: sc.NodeSelector, - Affinity: sc.Affinity, - PriorityClassName: sc.PriorityClassName, - Tolerations: sc.Tolerations, - Resources: sc.Resources, + NodeSelector: vdb.Spec.TransientSubclusterTemplate.NodeSelector, + Affinity: vdb.Spec.TransientSubclusterTemplate.Affinity, + PriorityClassName: vdb.Spec.TransientSubclusterTemplate.PriorityClassName, + Tolerations: vdb.Spec.TransientSubclusterTemplate.Tolerations, + Resources: vdb.Spec.TransientSubclusterTemplate.Resources, ServiceType: sc.ServiceType, ServiceName: sc.GetServiceName(), NodePort: sc.NodePort, ExternalIPs: sc.ExternalIPs, } } + +// transientSubclusterName returns the name of the transient subcluster +func transientSubclusterName(vdb *vapi.VerticaDB) string { + if vdb.Spec.TransientSubclusterTemplate.Name == "" { + return DefaultTransientSubclusterName + } + return vdb.Spec.TransientSubclusterTemplate.Name +} + +// transientSubclusterSize returns the size of the transient subcluster. +func transientSubclusterSize(vdb *vapi.VerticaDB) int32 { + if vdb.Spec.TransientSubclusterTemplate.Size > 0 { + return vdb.Spec.TransientSubclusterTemplate.Size + } + return 1 +} diff --git a/pkg/controllers/imagechange.go b/pkg/controllers/imagechange.go index dbf48806b..7728b2af2 100644 --- a/pkg/controllers/imagechange.go +++ b/pkg/controllers/imagechange.go @@ -161,8 +161,8 @@ func (i *ImageChangeManager) setImageChangeStatus(ctx context.Context, msg strin } // updateImageInStatefulSets will change the image in each of the statefulsets. -// Caller can indicate whether primary or secondary types change. -func (i *ImageChangeManager) updateImageInStatefulSets(ctx context.Context, chgPrimary, chgSecondary bool) (int, ctrl.Result, error) { +// This changes the images in all subclusters except any transient ones. +func (i *ImageChangeManager) updateImageInStatefulSets(ctx context.Context) (int, ctrl.Result, error) { numStsChanged := 0 // Count to keep track of the number of statefulsets updated // We use FindExisting for the finder because we only want to work with sts @@ -176,12 +176,6 @@ func (i *ImageChangeManager) updateImageInStatefulSets(ctx context.Context, chgP for inx := range stss.Items { sts := &stss.Items[inx] - if !chgPrimary && sts.Labels[SubclusterTypeLabel] == vapi.PrimarySubclusterType { - continue - } - if !chgSecondary && sts.Labels[SubclusterTypeLabel] == vapi.SecondarySubclusterType { - continue - } isTransient, err := strconv.ParseBool(sts.Labels[SubclusterTransientLabel]) if err != nil { return numStsChanged, ctrl.Result{}, err } @@ -190,33 +184,44 @@ func (i *ImageChangeManager) updateImageInStatefulSets(ctx context.Context, chgP continue } - // Skip the statefulset if it already has the proper image. 
- if sts.Spec.Template.Spec.Containers[names.ServerContainerIndex].Image != i.Vdb.Spec.Image { - i.Log.Info("Updating image in old statefulset", "name", sts.ObjectMeta.Name) + if stsUpdated, err := i.updateImageInStatefulSet(ctx, sts); err != nil { + return numStsChanged, ctrl.Result{}, err + } else if stsUpdated { err = i.setImageChangeStatus(ctx, "Rescheduling pods with new image name") if err != nil { return numStsChanged, ctrl.Result{}, err } - sts.Spec.Template.Spec.Containers[names.ServerContainerIndex].Image = i.Vdb.Spec.Image - // We change the update strategy to OnDelete. We don't want the k8s - // sts controller to interphere and do a rolling update after the - // update has completed. We don't explicitly change this back. The - // ObjReconciler will handle it for us. - sts.Spec.UpdateStrategy.Type = appsv1.OnDeleteStatefulSetStrategyType - err = i.VRec.Client.Update(ctx, sts) - if err != nil { - return numStsChanged, ctrl.Result{}, err - } numStsChanged++ } } return numStsChanged, ctrl.Result{}, nil } +// updateImageInStatefulSet will update the image in the given statefulset. It +// returns true if the image was changed. +func (i *ImageChangeManager) updateImageInStatefulSet(ctx context.Context, sts *appsv1.StatefulSet) (bool, error) { + stsUpdated := false + // Skip the statefulset if it already has the proper image. + if sts.Spec.Template.Spec.Containers[names.ServerContainerIndex].Image != i.Vdb.Spec.Image { + i.Log.Info("Updating image in old statefulset", "name", sts.ObjectMeta.Name) + sts.Spec.Template.Spec.Containers[names.ServerContainerIndex].Image = i.Vdb.Spec.Image + // We change the update strategy to OnDelete. We don't want the k8s + // sts controller to interfere and do a rolling update after the + // update has completed. We don't explicitly change this back. The + // ObjReconciler will handle it for us. + sts.Spec.UpdateStrategy.Type = appsv1.OnDeleteStatefulSetStrategyType + if err := i.VRec.Client.Update(ctx, sts); err != nil { + return false, err + } + stsUpdated = true + } + return stsUpdated, nil +} + // deletePodsRunningOldImage will delete pods that have the old image. It will return the // number of pods that were deleted. Callers can control whether to delete pods -// just for the primary or primary/secondary. -func (i *ImageChangeManager) deletePodsRunningOldImage(ctx context.Context, delSecondary bool) (int, ctrl.Result, error) { +// for a specific subcluster or all -- passing an empty string for scName will delete all. +func (i *ImageChangeManager) deletePodsRunningOldImage(ctx context.Context, scName string) (int, error) { numPodsDeleted := 0 // Tracks the number of pods that were deleted // We use FindExisting for the finder because we only want to work with pods @@ -225,16 +230,15 @@ func (i *ImageChangeManager) deletePodsRunningOldImage(ctx context.Context, delS // doesn't take effect until after the image change. 
pods, err := i.Finder.FindPods(ctx, FindExisting) if err != nil { - return numPodsDeleted, ctrl.Result{}, err + return numPodsDeleted, err } for inx := range pods.Items { pod := &pods.Items[inx] - // We aren't deleting secondary pods, so we only continue if the pod is - // for a primary - if !delSecondary { - scType, ok := pod.Labels[SubclusterTypeLabel] - if ok && scType != vapi.PrimarySubclusterType { + // If scName was passed in, we only delete for a specific subcluster + if scName != "" { + scNameFromLabel, ok := pod.Labels[SubclusterNameLabel] + if ok && scNameFromLabel != scName { continue } } @@ -244,12 +248,12 @@ func (i *ImageChangeManager) deletePodsRunningOldImage(ctx context.Context, delS i.Log.Info("Deleting pod that had old image", "name", pod.ObjectMeta.Name) err = i.VRec.Client.Delete(ctx, pod) if err != nil { - return numPodsDeleted, ctrl.Result{}, err + return numPodsDeleted, err } numPodsDeleted++ } } - return numPodsDeleted, ctrl.Result{}, nil + return numPodsDeleted, nil } // onlineImageChangeAllowed returns true if image change must be done online diff --git a/pkg/controllers/imagechange_test.go b/pkg/controllers/imagechange_test.go index bafa76db8..cec1f93f4 100644 --- a/pkg/controllers/imagechange_test.go +++ b/pkg/controllers/imagechange_test.go @@ -30,6 +30,8 @@ import ( var _ = Describe("imagechange", func() { ctx := context.Background() + const OldImage = "old-image" + const NewImage = "new-image-1" It("should correctly pick the image change type", func() { vdb := vapi.MakeVDB() @@ -76,10 +78,7 @@ var _ = Describe("imagechange", func() { Expect(mgr.IsImageChangeNeeded(ctx)).Should(Equal(false)) }) - It("should change the image of just the primaries or just secondaries", func() { - const OldImage = "old-image" - const NewImage1 = "new-image-1" - const NewImage2 = "new-image-2" + It("should change the image of both primaries and secondaries", func() { vdb := vapi.MakeVDB() vdb.Spec.Image = OldImage vdb.Spec.Subclusters = []vapi.Subcluster{ @@ -88,64 +87,70 @@ var _ = Describe("imagechange", func() { } createPods(ctx, vdb, AllPodsRunning) defer deletePods(ctx, vdb) - vdb.Spec.Image = NewImage1 + vdb.Spec.Image = NewImage createVdb(ctx, vdb) defer deleteVdb(ctx, vdb) mgr := MakeImageChangeManager(vrec, logger, vdb, vapi.OfflineImageChangeInProgress, func(vdb *vapi.VerticaDB) bool { return true }) Expect(mgr.IsImageChangeNeeded(ctx)).Should(Equal(true)) - stsChange, res, err := mgr.updateImageInStatefulSets(ctx, true, false) + stsChange, res, err := mgr.updateImageInStatefulSets(ctx) Expect(err).Should(Succeed()) Expect(res).Should(Equal(ctrl.Result{})) - Expect(stsChange).Should(Equal(1)) + Expect(stsChange).Should(Equal(2)) sts := &appsv1.StatefulSet{} Expect(k8sClient.Get(ctx, names.GenStsName(vdb, &vdb.Spec.Subclusters[0]), sts)).Should(Succeed()) - Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(NewImage1)) + Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(NewImage)) Expect(k8sClient.Get(ctx, names.GenStsName(vdb, &vdb.Spec.Subclusters[1]), sts)).Should(Succeed()) - Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(OldImage)) + Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(NewImage)) + }) - vdb.Spec.Image = NewImage2 - Expect(k8sClient.Update(ctx, vdb)).Should(Succeed()) + It("should delete pods of all subclusters", func() { + vdb := vapi.MakeVDB() + vdb.Spec.Subclusters = []vapi.Subcluster{ + {Name: "sc1", Size: 1, 
IsPrimary: true}, + {Name: "sc2", Size: 1, IsPrimary: false}, + } + createPods(ctx, vdb, AllPodsRunning) + defer deletePods(ctx, vdb) + vdb.Spec.Image = NewImage // Change image to force pod deletion - stsChange, res, err = mgr.updateImageInStatefulSets(ctx, false, true) + mgr := MakeImageChangeManager(vrec, logger, vdb, vapi.OfflineImageChangeInProgress, + func(vdb *vapi.VerticaDB) bool { return true }) + numPodsDeleted, err := mgr.deletePodsRunningOldImage(ctx, "") // pods from all subclusters Expect(err).Should(Succeed()) - Expect(res).Should(Equal(ctrl.Result{})) - Expect(stsChange).Should(Equal(1)) + Expect(numPodsDeleted).Should(Equal(2)) - Expect(k8sClient.Get(ctx, names.GenStsName(vdb, &vdb.Spec.Subclusters[0]), sts)).Should(Succeed()) - Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(NewImage1)) - Expect(k8sClient.Get(ctx, names.GenStsName(vdb, &vdb.Spec.Subclusters[1]), sts)).Should(Succeed()) - Expect(sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image).Should(Equal(NewImage2)) + pod := &corev1.Pod{} + Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0), pod)).ShouldNot(Succeed()) + Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[1], 0), pod)).ShouldNot(Succeed()) }) - It("should delete pods of primaries only", func() { + It("should delete pods of a specific subcluster", func() { vdb := vapi.MakeVDB() vdb.Spec.Subclusters = []vapi.Subcluster{ - {Name: "sc1", Size: 1, IsPrimary: true}, + {Name: "sc1", Size: 1, IsPrimary: false}, {Name: "sc2", Size: 1, IsPrimary: false}, } createPods(ctx, vdb, AllPodsRunning) defer deletePods(ctx, vdb) - vdb.Spec.Image = "new-image" // Change image to force pod deletion + vdb.Spec.Image = NewImage // Change image to force pod deletion mgr := MakeImageChangeManager(vrec, logger, vdb, vapi.OfflineImageChangeInProgress, func(vdb *vapi.VerticaDB) bool { return true }) - numPodsDeleted, res, err := mgr.deletePodsRunningOldImage(ctx, false) // pods from primaries only + numPodsDeleted, err := mgr.deletePodsRunningOldImage(ctx, vdb.Spec.Subclusters[1].Name) Expect(err).Should(Succeed()) - Expect(res).Should(Equal(ctrl.Result{})) Expect(numPodsDeleted).Should(Equal(1)) pod := &corev1.Pod{} - Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0), pod)).ShouldNot(Succeed()) - Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[1], 0), pod)).Should(Succeed()) + Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0), pod)).Should(Succeed()) + Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[1], 0), pod)).ShouldNot(Succeed()) - numPodsDeleted, res, err = mgr.deletePodsRunningOldImage(ctx, true) // pods from secondary and primaries + numPodsDeleted, err = mgr.deletePodsRunningOldImage(ctx, vdb.Spec.Subclusters[0].Name) Expect(err).Should(Succeed()) - Expect(res).Should(Equal(ctrl.Result{})) Expect(numPodsDeleted).Should(Equal(1)) - Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[1], 0), pod)).ShouldNot(Succeed()) + Expect(k8sClient.Get(ctx, names.GenPodName(vdb, &vdb.Spec.Subclusters[0], 0), pod)).ShouldNot(Succeed()) }) }) diff --git a/pkg/controllers/labels_annotations.go b/pkg/controllers/labels_annotations.go index 895cf054f..24755dc02 100644 --- a/pkg/controllers/labels_annotations.go +++ b/pkg/controllers/labels_annotations.go @@ -34,12 +34,18 @@ const ( // makeSubclusterLabels returns the labels added for the subcluster func makeSubclusterLabels(sc 
*vapi.Subcluster) map[string]string { - return map[string]string{ + m := map[string]string{ SubclusterNameLabel: sc.Name, SubclusterTypeLabel: sc.GetType(), - SubclusterSvcNameLabel: sc.GetServiceName(), SubclusterTransientLabel: strconv.FormatBool(sc.IsTransient), } + // Transient subclusters never have the service name label set. At various + // points of the image change, a transient subcluster will accept traffic + // from all of the subclusters. + if !sc.IsTransient { + m[SubclusterSvcNameLabel] = sc.GetServiceName() + } + return m } // makeOperatorLabels returns the labels that all objects created by this operator will have @@ -109,10 +115,13 @@ func makeSvcSelectorLabels(vdb *vapi.VerticaDB, sc *vapi.Subcluster) map[string] VDBInstanceLabel: vdb.Name, } if sc != nil { - m[SubclusterSvcNameLabel] = sc.GetServiceName() - // This label is here to ensure service object routes to - // primary/secondary or the transient, but never both - m[SubclusterTransientLabel] = strconv.FormatBool(sc.IsTransient) + if sc.IsTransient { + // This label is here to ensure the service object routes to + // primary/secondary or the transient, but never both + m[SubclusterTransientLabel] = strconv.FormatBool(sc.IsTransient) + } else { + m[SubclusterSvcNameLabel] = sc.GetServiceName() + } } return m } diff --git a/pkg/controllers/obj_reconcile_test.go b/pkg/controllers/obj_reconcile_test.go index 0a550b374..ed68bb596 100644 --- a/pkg/controllers/obj_reconcile_test.go +++ b/pkg/controllers/obj_reconcile_test.go @@ -565,7 +565,7 @@ var _ = Describe("obj_reconcile", func() { svc1 := &corev1.Service{} Expect(k8sClient.Get(ctx, nm, svc1)).Should(Succeed()) - standby := buildTransientSubcluster(sc, "") + standby := buildTransientSubcluster(vdb, sc, "") pfacts := MakePodFacts(k8sClient, &cmds.FakePodRunner{}) actor := MakeObjReconciler(vrec, logger, vdb, &pfacts) objr := actor.(*ObjReconciler) diff --git a/pkg/controllers/offlineimagechange_reconcile.go b/pkg/controllers/offlineimagechange_reconcile.go index 40af3d853..0fda8169f 100644 --- a/pkg/controllers/offlineimagechange_reconcile.go +++ b/pkg/controllers/offlineimagechange_reconcile.go @@ -139,7 +139,7 @@ func (o *OfflineImageChangeReconciler) stopCluster(ctx context.Context) (ctrl.Re // Since there will be processing after to delete the pods so that they come up // with the new image. func (o *OfflineImageChangeReconciler) updateImageInStatefulSets(ctx context.Context) (ctrl.Result, error) { - numStsChanged, res, err := o.Manager.updateImageInStatefulSets(ctx, true, true) + numStsChanged, res, err := o.Manager.updateImageInStatefulSets(ctx) if numStsChanged > 0 { o.PFacts.Invalidate() } @@ -151,11 +151,11 @@ // the sts is OnDelete. Deleting the pods ensures they get rescheduled with the // new image. func (o *OfflineImageChangeReconciler) deletePods(ctx context.Context) (ctrl.Result, error) { - numPodsDeleted, res, err := o.Manager.deletePodsRunningOldImage(ctx, true) + numPodsDeleted, err := o.Manager.deletePodsRunningOldImage(ctx, "") if numPodsDeleted > 0 { o.PFacts.Invalidate() } - return res, err + return ctrl.Result{}, err } // checkForNewPods will check to ensure at least one pod exists with the new image. 
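Taken together, the two label changes above are what let a subcluster's service object be pointed either at its own pods or at the transient pods. A rough sketch of the resulting selectors (illustrative values; it assumes GetServiceName() falls back to the subcluster name):

    regular := &vapi.Subcluster{Name: "sc1"}
    transient := &vapi.Subcluster{Name: "transient", IsTransient: true}
    // Selects on the service-name label; transient pods never carry it, so a
    // regular subcluster's service won't pick them up by default:
    //   makeSvcSelectorLabels(vdb, regular)   => {..., SubclusterSvcNameLabel: "sc1"}
    // Selects on the transient label only, so a service can be temporarily
    // pointed at the transient pods during the image change:
    //   makeSvcSelectorLabels(vdb, transient) => {..., SubclusterTransientLabel: "true"}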
diff --git a/pkg/controllers/onlineimagechange_reconcile_test.go b/pkg/controllers/onlineimagechange_reconcile_test.go index 383ba1978..9346c51de 100644 --- a/pkg/controllers/onlineimagechange_reconcile_test.go +++ b/pkg/controllers/onlineimagechange_reconcile_test.go @@ -71,7 +71,7 @@ var _ = Describe("onlineimagechange_reconcile", func() { fscs := fetchVdb.Spec.Subclusters Expect(len(fscs)).Should(Equal(4)) // orig + 1 transient - Expect(fscs[3].Name).Should(Equal(TransientSubclusterName)) + Expect(fscs[3].Name).Should(Equal(DefaultTransientSubclusterName)) Expect(r.loadSubclusterState(ctx)).Should(Equal(ctrl.Result{})) // Collect state again for new pods/sts diff --git a/pkg/controllers/onlineimagechange_reconciler.go b/pkg/controllers/onlineimagechange_reconciler.go index 4f86bef09..04f07a1eb 100644 --- a/pkg/controllers/onlineimagechange_reconciler.go +++ b/pkg/controllers/onlineimagechange_reconciler.go @@ -18,10 +18,12 @@ package controllers import ( "context" "fmt" + "strconv" "github.com/go-logr/logr" vapi "github.com/vertica/vertica-kubernetes/api/v1beta1" "github.com/vertica/vertica-kubernetes/pkg/cmds" + appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" ctrl "sigs.k8s.io/controller-runtime" @@ -68,28 +70,14 @@ func (o *OnlineImageChangeReconciler) Reconcile(ctx context.Context, req *ctrl.R o.installTransientNodes, o.addTransientSubcluster, o.addTransientNodes, - // Reroute all traffic from primary subclusters to the transient - o.rerouteClientTrafficToTransient, - // Drain all connections from the primary subcluster. This waits for - // connections that were established before traffic was routed to the - // transient. - o.drainPrimaries, - // Change the image in each of the primary subclusters. - o.changeImageInPrimaries, - // Restart the pods of the primary subclusters. + // Handle restart of the primary subclusters o.restartPrimaries, - // Reroute all traffic from transient subcluster back to the primaries - o.rerouteClientTrafficToPrimaries, - // Drain all connections from the transient subcluster to prepare it - // for being removed. - o.drainTransient, + // Handle restart of secondary subclusters + o.restartSecondaries, // Will cleanup the transient subcluster now that the primaries are back up. o.removeTransientSubclusters, o.uninstallTransientNodes, o.deleteTransientSts, - // With the primaries back up, we can do a "rolling upgrade" style of - // update for the secondary subclusters. - o.startRollingUpgradeOfSecondarySubclusters, // Clean up the condition and event recording for a completed image change o.Manager.finishImageChange, } @@ -168,71 +156,132 @@ func (o *OnlineImageChangeReconciler) addTransientNodes(ctx context.Context) (ct return actor.Reconcile(ctx, &ctrl.Request{}) } -// rerouteClientTrafficToTransient will update the service objects for each of the -// primary subclusters so that they are routed to the transient subcluster. -func (o *OnlineImageChangeReconciler) rerouteClientTrafficToTransient(ctx context.Context) (ctrl.Result, error) { - if o.skipTransientSetup() { - return ctrl.Result{}, nil +// restartPrimaries will handle the image change on all of the primaries. 
+func (o *OnlineImageChangeReconciler) restartPrimaries(ctx context.Context) (ctrl.Result, error) { + stss, err := o.Finder.FindStatefulSets(ctx, FindExisting) + if err != nil { + return ctrl.Result{}, err } - o.Log.Info("starting client traffic routing to transient") - err := o.routeClientTraffic(ctx, func(sc *vapi.Subcluster) bool { return sc.IsTransient }) - return ctrl.Result{}, err + // We bring all primaries offline before we start to bring any of them back up. + primaries := []*appsv1.StatefulSet{} + + for i := range stss.Items { + sts := &stss.Items[i] + matches := false + if matches, err = o.isMatchingSubclusterType(sts, vapi.PrimarySubclusterType); err != nil { + return ctrl.Result{}, err + } else if !matches { + continue + } + primaries = append(primaries, sts) + + err = o.takeSubclusterOffline(ctx, sts) + if err != nil { + return ctrl.Result{}, err + } + } + + for i := range primaries { + res, err := o.bringSubclusterOnline(ctx, primaries[i]) + if res.Requeue || err != nil { + return res, err + } + } + + return ctrl.Result{}, nil } -// drainPrimaries will only succeed if the primary subclusters are already down -// or have no active connections. All traffic to the primaries get routed to -// the transient subcluster, so this step waits for any connection that were -// established before the transient was created. -func (o *OnlineImageChangeReconciler) drainPrimaries(ctx context.Context) (ctrl.Result, error) { +// restartSecondaries will restart all of the secondaries, temporarily +// rerouting traffic to the transient while it does the restart. +func (o *OnlineImageChangeReconciler) restartSecondaries(ctx context.Context) (ctrl.Result, error) { + stss, err := o.Finder.FindStatefulSets(ctx, FindExisting) + if err != nil { + return ctrl.Result{}, err + } + // We do one subcluster at a time for secondaries. We can do this + // differently than the primaries because the secondaries aren't needed to form + // the cluster, so it can be done in a piecemeal fashion. + for i := range stss.Items { + sts := &stss.Items[i] + if matches, err := o.isMatchingSubclusterType(sts, vapi.SecondarySubclusterType); err != nil { + return ctrl.Result{}, err + } else if !matches { + continue + } + + err := o.takeSubclusterOffline(ctx, sts) + if err != nil { + return ctrl.Result{}, err + } + res, err := o.bringSubclusterOnline(ctx, sts) + if res.Requeue || err != nil { + return res, err + } + } return ctrl.Result{}, nil } -// changeImageInPrimaries will update the statefulset of each of the primary -// subcluster's with the new image. It will also force the cluster in read-only -// mode as all of the pods in the primary will be rescheduled with the new -// image. -func (o *OnlineImageChangeReconciler) changeImageInPrimaries(ctx context.Context) (ctrl.Result, error) { - numStsChanged, res, err := o.Manager.updateImageInStatefulSets(ctx, true, false) - if numStsChanged > 0 { - o.Log.Info("changed image in statefulsets", "num", numStsChanged) - o.PFacts.Invalidate() +// isMatchingSubclusterType will return true if the subcluster type matches the +// input string. Always returns false for the transient subcluster. 
+func (o *OnlineImageChangeReconciler) isMatchingSubclusterType(sts *appsv1.StatefulSet, scType string) (bool, error) { + isTransient, err := strconv.ParseBool(sts.Labels[SubclusterTransientLabel]) + if err != nil { + return false, fmt.Errorf("could not parse label %s: %w", SubclusterTransientLabel, err) } - return res, err + return sts.Labels[SubclusterTypeLabel] == scType && !isTransient, nil } -// restartPrimaries will restart all of the pods in the primary subclusters. -func (o *OnlineImageChangeReconciler) restartPrimaries(ctx context.Context) (ctrl.Result, error) { - numPodsDeleted, res, err := o.Manager.deletePodsRunningOldImage(ctx, false) - if res.Requeue || err != nil { - return res, err +// takeSubclusterOffline will bring down a subcluster if it is running the old +// image. It will reroute client traffic to the transient subcluster so that +// access stays online during the image change. +func (o *OnlineImageChangeReconciler) takeSubclusterOffline(ctx context.Context, sts *appsv1.StatefulSet) error { + var err error + scName := sts.Labels[SubclusterNameLabel] + img := sts.Spec.Template.Spec.Containers[ServerContainerIndex].Image + + if img != o.Vdb.Spec.Image { + o.Log.Info("starting client traffic routing of subcluster to transient", "name", scName) + err = o.routeClientTraffic(ctx, func(sc *vapi.Subcluster) bool { return sc.Name == scName }, true) + if err != nil { + return err + } + } + + stsChanged, err := o.Manager.updateImageInStatefulSet(ctx, sts) + if err != nil { + return err + } + if stsChanged { + o.PFacts.Invalidate() + } + + podsDeleted, err := o.Manager.deletePodsRunningOldImage(ctx, scName) + if err != nil { + return err } - if numPodsDeleted > 0 { - o.Log.Info("deleted pods running old image", "num", numPodsDeleted) + if podsDeleted > 0 { o.PFacts.Invalidate() } + return nil +} +// bringSubclusterOnline will bring up a subcluster and reroute traffic back to the subcluster. +func (o *OnlineImageChangeReconciler) bringSubclusterOnline(ctx context.Context, sts *appsv1.StatefulSet) (ctrl.Result, error) { const DoNotRestartReadOnly = false actor := MakeRestartReconciler(o.VRec, o.Log, o.Vdb, o.PRunner, o.PFacts, DoNotRestartReadOnly) o.traceActorReconcile(actor) - return actor.Reconcile(ctx, &ctrl.Request{}) -} + res, err := actor.Reconcile(ctx, &ctrl.Request{}) + if res.Requeue || err != nil { + return res, err + } -// rerouteClientTrafficToPrimaries will update the service objects of the primary -// subclusters so that traffic is not routed to the transient anymore but back -// to the primary subclusters. -func (o *OnlineImageChangeReconciler) rerouteClientTrafficToPrimaries(ctx context.Context) (ctrl.Result, error) { - o.Log.Info("starting client traffic routing to primary") - err := o.routeClientTraffic(ctx, func(sc *vapi.Subcluster) bool { return sc.IsPrimary }) + scName := sts.Labels[SubclusterNameLabel] + o.Log.Info("starting client traffic routing back to subcluster", "name", scName) + err = o.routeClientTraffic(ctx, func(sc *vapi.Subcluster) bool { return sc.Name == scName }, false) return ctrl.Result{}, err } -// drainTransient will wait for all active connections in the transient subcluster -// to leave. This is preparation for eventual removal of the transient subcluster. 
-func (o *OnlineImageChangeReconciler) drainTransient(ctx context.Context) (ctrl.Result, error) { - return ctrl.Result{}, nil -} - // removeTransientSubclusters will drive subcluster removal of the transient subcluster func (o *OnlineImageChangeReconciler) removeTransientSubclusters(ctx context.Context) (ctrl.Result, error) { actor := MakeDBRemoveSubclusterReconciler(o.VRec, o.Log, o.Vdb, o.PRunner, o.PFacts) @@ -258,14 +307,6 @@ func (o *OnlineImageChangeReconciler) deleteTransientSts(ctx context.Context) (c return actor.Reconcile(ctx, &ctrl.Request{}) } -// startRollingUpgradeOfSecondarySubclusters will update the image of each of -// the secondary subclusters. The update policy will be rolling upgrade. This -// gives control of restarting each pod back to k8s. This can be done because -// secondary subclusters don't participate in cluster quorum. -func (o *OnlineImageChangeReconciler) startRollingUpgradeOfSecondarySubclusters(ctx context.Context) (ctrl.Result, error) { - return ctrl.Result{}, nil -} - // cachePrimaryImages will update o.PrimaryImages with the names of all of the primary images func (o *OnlineImageChangeReconciler) cachePrimaryImages(ctx context.Context) error { stss, err := o.Finder.FindStatefulSets(ctx, FindExisting) @@ -333,7 +374,7 @@ func (o *OnlineImageChangeReconciler) addTransientToVdb(ctx context.Context) err for i := range o.Vdb.Spec.Subclusters { sc := &o.Vdb.Spec.Subclusters[i] if sc.IsPrimary { - transient := buildTransientSubcluster(sc, oldImage) + transient := buildTransientSubcluster(o.Vdb, sc, oldImage) _, ok := scMap[transient.Name] if !ok { if err := o.Manager.setImageChangeStatus(ctx, "Creating transient subcluster"); err != nil { @@ -379,17 +420,25 @@ func (o *OnlineImageChangeReconciler) traceActorReconcile(actor ReconcileActor) } // routeClientTraffic will update service objects to route to either the primary -// or standby. The subcluster picked is determined by the scCheckFunc the +// or transient. The subcluster picked is determined by the scSelectorFunc the // caller provides. If it returns true for a given subcluster, traffic will be // routed to that. -func (o *OnlineImageChangeReconciler) routeClientTraffic(ctx context.Context, scCheckFunc func(sc *vapi.Subcluster) bool) error { +func (o *OnlineImageChangeReconciler) routeClientTraffic(ctx context.Context, + scSelectorFunc func(sc *vapi.Subcluster) bool, useTransientSc bool) error { actor := MakeObjReconciler(o.VRec, o.Log, o.Vdb, o.PFacts) objRec := actor.(*ObjReconciler) + // We update the external service object to route traffic to transient or + // primary/secondary. We make a copy of the subcluster, then modify it + // in-place with the IsTransient state to route to the transient subcluster or + // not. for i := range o.Vdb.Spec.Subclusters { - sc := &o.Vdb.Spec.Subclusters[i] - if scCheckFunc(sc) { - if err := objRec.reconcileExtSvc(ctx, sc); err != nil { + sc := o.Vdb.Spec.Subclusters[i] + if scSelectorFunc(&sc) { + // We are modifying a copy of sc, so we flip the IsTransient flag to + // indicate which subcluster we are going to route to. + sc.IsTransient = useTransientSc + if err := objRec.reconcileExtSvc(ctx, &sc); err != nil { return err } } diff --git a/pkg/controllers/podfacts.go b/pkg/controllers/podfacts.go index bd2497afd..9234b45bc 100644 --- a/pkg/controllers/podfacts.go +++ b/pkg/controllers/podfacts.go @@ -504,8 +504,15 @@ func (p *PodFacts) findPodToRunVsql() (*PodFact, bool) { // order to run admintools // Will return false for second parameter if no pod could be found. 
func (p *PodFacts) findPodToRunAdmintools() (*PodFact, bool) { - // We prefer to pick a pod that is up. But failing that, we will pick one - // with vertica installed. + // Our preference for the pod is as follows: + // - up and not read-only + // - up and read-only + // - has vertica installation + for _, v := range p.Detail { + if v.upNode && !v.readOnly { + return v, true + } + } for _, v := range p.Detail { if v.upNode { return v, true