-
Notifications
You must be signed in to change notification settings - Fork 159
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement DualRunning state to enable blue-green deploys (#186)
* Working version 1 * Create setup for blue green deploys * [WIP] Setup status sub-resource for blue green deploys * Updates * fix bug * Fixes * Make running jobs calculation idempotent * Fix bugs * Reset running jobs in recovering phase * Make status index calculation simpler * Add container env and annotations * Update CRD to v1beta2 * Update CRD to v1beta2 * Fix CRD update issues * Fix lint * Merge master and restore v1beta1 to original version * Upgrade integ test to v1beta2 * Backward compatibility changes * Work around status subresource bug * Rename status array to VersionStatuses and add comment on k8s bug * Remove DesiredApplicationCount * Minor updates * Minor updates * Initialize counter * Handle edge case for jobId * Debug * Debug * fixes * Fix edge case * Fix unit tests * Debug logs * Fix overwriting of versionstatuses * Remove debug logs * Implement DualRunning state * Happy path:Add DualRunning and Teardown states * Add unit tests and make Teardown a bool * Minor fixes * Merge master * Revert CRD upgrade * Keep Status.ClusterStatus and Status.JobStatus unchanged for Dual mode * Remove unwarranted changes * Add version to deployment names * Account for SavepointDisabled * Add integration test and fix delete * Fix build * Handle delete when there are two running jobs * Handle delete for blue green deploys correctly * Fix ingress * Fix ingress URL and event name * Allow teardown to be a version instead of bool and force-cancel job in teardown * Redesign teardown feature * Disallow switching between deployment modes * Disallow switching between deployment modes * Disallow deployment mode change * Fixes and address all review comments * Add docs and update state machine diagram * Update links to state machine pngs
- Loading branch information
Showing
27 changed files
with
1,684 additions
and
225 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
%% This file can be compiled into blue_green_state_machine.png by installing mermaidjs (https://mermaidjs.github.io/) and running
%% mmdc -i blue_green_state_machine.mmd -o blue_green_state_machine.png -w 1732 -b transparent

graph LR
  New --> ClusterStarting

  subgraph Running
    Running
    DeployFailed
  end

  subgraph Updating
    Running --> Updating
    Updating --> ClusterStarting
    DeployFailed --> Updating

    ClusterStarting -- savepoint disabled --> SubmittingJob
    ClusterStarting -- savepoint enabled --> Savepointing
    ClusterStarting -- Create fails --> DeployFailed

    Savepointing --> SubmittingJob
    Savepointing -- Savepoint fails --> Recovering

    Recovering --> SubmittingJob
    Recovering -- No externalized checkpoint --> RollingBackJob

    SubmittingJob -- first deploy --> Running
    SubmittingJob -- updating existing application --> DualRunning
    SubmittingJob -- job start fails --> RollingBackJob
    RollingBackJob --> DeployFailed

    DualRunning -- tearDownVersionHash set --> Running
    DualRunning -- tear down fails --> DeployFailed
  end

  linkStyle 4 stroke:#303030
  linkStyle 5 stroke:#303030
  linkStyle 6 stroke:#FF0000
  linkStyle 8 stroke:#FF0000
  linkStyle 10 stroke:#FF0000
  linkStyle 11 stroke:#303030
  linkStyle 12 stroke:#303030
  linkStyle 13 stroke:#FF0000
  linkStyle 14 stroke:#FF0000
  linkStyle 15 stroke:#303030
  linkStyle 16 stroke:#FF0000
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package integ | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/lyft/flinkk8soperator/pkg/apis/app/v1beta1" | ||
"github.com/prometheus/common/log" | ||
. "gopkg.in/check.v1" | ||
v1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
) | ||
|
||
func WaitForUpdate(c *C, s *IntegSuite, name string, updateFn func(app *v1beta1.FlinkApplication), phase v1beta1.FlinkApplicationPhase, failurePhase v1beta1.FlinkApplicationPhase) *v1beta1.FlinkApplication { | ||
|
||
// update with new image. | ||
app, err := s.Util.Update(name, updateFn) | ||
c.Assert(err, IsNil) | ||
|
||
for { | ||
// keep trying until the new job is launched | ||
newApp, err := s.Util.GetFlinkApplication(name) | ||
c.Assert(err, IsNil) | ||
if newApp.Status.VersionStatuses[s.Util.GetCurrentStatusIndex(app)].JobStatus.JobID != "" { | ||
break | ||
} | ||
time.Sleep(100 * time.Millisecond) | ||
} | ||
|
||
c.Assert(s.Util.WaitForPhase(name, phase, failurePhase), IsNil) | ||
c.Assert(s.Util.WaitForAllTasksRunning(name), IsNil) | ||
|
||
newApp, _ := s.Util.GetFlinkApplication(name) | ||
return newApp | ||
} | ||
|
||
func (s *IntegSuite) TestUpdateWithBlueGreenDeploymentMode(c *C) { | ||
|
||
testName := "bluegreenupdate" | ||
const finalizer = "bluegreen.finalizers.test.com" | ||
|
||
// start a simple app | ||
config, err := s.Util.ReadFlinkApplication("test_app.yaml") | ||
c.Assert(err, IsNil, Commentf("Failed to read test app yaml")) | ||
|
||
config.Name = testName + "job" | ||
config.Spec.DeploymentMode = v1beta1.DeploymentModeBlueGreen | ||
config.ObjectMeta.Labels["integTest"] = testName | ||
config.Finalizers = append(config.Finalizers, finalizer) | ||
|
||
c.Assert(s.Util.CreateFlinkApplication(config), IsNil, | ||
Commentf("Failed to create flink application")) | ||
|
||
c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) | ||
c.Assert(s.Util.WaitForAllTasksRunning(config.Name), IsNil) | ||
|
||
pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). | ||
List(v1.ListOptions{LabelSelector: "integTest=" + testName}) | ||
c.Assert(err, IsNil) | ||
c.Assert(len(pods.Items), Equals, 3) | ||
for _, pod := range pods.Items { | ||
c.Assert(pod.Spec.Containers[0].Image, Equals, config.Spec.Image) | ||
} | ||
|
||
// test updating the app with a new image | ||
newApp := WaitForUpdate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { | ||
app.Spec.Image = NewImage | ||
}, v1beta1.FlinkApplicationDualRunning, v1beta1.FlinkApplicationDeployFailed) | ||
|
||
c.Assert(newApp.Spec.Image, Equals, NewImage) | ||
c.Assert(newApp.Status.SavepointPath, NotNil) | ||
|
||
pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). | ||
List(v1.ListOptions{LabelSelector: "integTest=" + testName}) | ||
c.Assert(err, IsNil) | ||
// We have 2 applications running | ||
c.Assert(len(pods.Items), Equals, 6) | ||
c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationDualRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) | ||
c.Assert(s.Util.GetJobID(newApp), NotNil) | ||
c.Assert(newApp.Status.UpdatingVersion, Equals, v1beta1.BlueFlinkApplication) | ||
c.Assert(newApp.Status.DeployVersion, Equals, v1beta1.GreenFlinkApplication) | ||
|
||
// TearDownVersionHash | ||
teardownVersion := newApp.Status.DeployVersion | ||
hashToTeardown := newApp.Status.DeployHash | ||
oldHash := newApp.Status.DeployHash | ||
log.Infof("Tearing down version %s", teardownVersion) | ||
newApp = WaitForUpdate(c, s, config.Name, func(app *v1beta1.FlinkApplication) { | ||
app.Spec.TearDownVersionHash = hashToTeardown | ||
}, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed) | ||
|
||
// wait for the old cluster to be cleaned up | ||
for { | ||
pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). | ||
List(v1.ListOptions{LabelSelector: "flink-app-hash=" + oldHash}) | ||
c.Assert(err, IsNil) | ||
if len(pods.Items) == 0 { | ||
break | ||
} | ||
time.Sleep(100 * time.Millisecond) | ||
} | ||
|
||
c.Assert(s.Util.WaitForPhase(config.Name, v1beta1.FlinkApplicationRunning, v1beta1.FlinkApplicationDeployFailed), IsNil) | ||
c.Assert(newApp.Status.TeardownHash, NotNil) | ||
c.Assert(newApp.Status.DeployVersion, Equals, v1beta1.BlueFlinkApplication) | ||
c.Assert(newApp.Status.VersionStatuses[0].JobStatus.JobID, NotNil) | ||
c.Assert(newApp.Status.VersionStatuses[1].JobStatus, Equals, v1beta1.FlinkJobStatus{}) | ||
|
||
pods, err = s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). | ||
List(v1.ListOptions{LabelSelector: "flink-app-hash=" + oldHash}) | ||
for _, pod := range pods.Items { | ||
log.Infof("Pod name %s", pod.Name) | ||
c.Assert(pod.Labels["flink-application-version"], Not(Equals), teardownVersion) | ||
} | ||
|
||
c.Assert(err, IsNil) | ||
c.Assert(len(pods.Items), Equals, 0) | ||
|
||
// cleanup | ||
c.Assert(s.Util.FlinkApps().Delete(newApp.Name, &v1.DeleteOptions{}), IsNil) | ||
var app *v1beta1.FlinkApplication | ||
for { | ||
app, err = s.Util.GetFlinkApplication(config.Name) | ||
c.Assert(err, IsNil) | ||
if len(app.Finalizers) == 1 && app.Finalizers[0] == finalizer { | ||
break | ||
} | ||
time.Sleep(100 * time.Millisecond) | ||
} | ||
|
||
job := s.Util.GetJobOverview(app) | ||
c.Assert(job["status"], Equals, "CANCELED") | ||
c.Assert(app.Status.SavepointPath, NotNil) | ||
|
||
// delete our finalizer | ||
app.Finalizers = []string{} | ||
_, err = s.Util.FlinkApps().Update(app) | ||
c.Assert(err, IsNil) | ||
|
||
for { | ||
pods, err := s.Util.KubeClient.CoreV1().Pods(s.Util.Namespace.Name). | ||
List(v1.ListOptions{LabelSelector: "integTest=" + testName}) | ||
c.Assert(err, IsNil) | ||
if len(pods.Items) == 0 { | ||
break | ||
} | ||
} | ||
log.Info("All pods torn down") | ||
} |
Oops, something went wrong.