Skip to content

Commit

Permalink
Don't drop traffic when upgrading a deployment fails (#14795)
Browse files Browse the repository at this point in the history
* Surface Replica failures over Progressing failures

When transforming the deployment status to the revision
we want to bubble up the more severe condition to Ready.

Since Replica failures will include a more actionable error
message this condition is preferred

* Stop always marking the revision healthy when the PA is Ready

This isn't accurate when the Revision has failed to rollout
an update to it's deployment

* Various updates to the revision reconciler

1. PA Reachability now depends on the status of the Deployment

If we have available replicas we don't mark the revision as
unreachable. This allows ongoing requests to be handled

2. Always propagate the K8s Deployment Status to the Revision.

We don't need to gate this depending on whether the Revision
required activation. Since the only two conditions we propagate
from the Deployment is Progressing and ReplicaSetFailure=False

3. Mark Revision as Deploying if the PA's service name isn't set

* test deployment failures don't drop traffic on upgrade

* fix boilerplate check
  • Loading branch information
dprotaso authored Feb 5, 2024
1 parent 05d1927 commit 1760f08
Show file tree
Hide file tree
Showing 16 changed files with 341 additions and 154 deletions.
51 changes: 32 additions & 19 deletions pkg/apis/serving/k8s_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,29 +50,42 @@ func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status {
// The absence of this condition means no failure has occurred. If we find it
// below, we'll overwrite this.
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "")

for _, cond := range ds.Conditions {
// TODO(jonjohnsonjr): Should we care about appsv1.DeploymentAvailable here?
switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
conds := []appsv1.DeploymentConditionType{
appsv1.DeploymentProgressing,
appsv1.DeploymentReplicaFailure,
}

for _, wantType := range conds {
for _, cond := range ds.Conditions {
if wantType != cond.Type {
continue
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)

switch cond.Type {
case appsv1.DeploymentProgressing:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message)
}
case appsv1.DeploymentReplicaFailure:
switch cond.Status {
case corev1.ConditionUnknown:
depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionTrue:
depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message)
case corev1.ConditionFalse:
depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
}
}

}
}

return s
}
41 changes: 38 additions & 3 deletions pkg/apis/serving/k8s_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ func TestTransformDeploymentStatus(t *testing.T) {
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionTrue,
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionUnknown,
Reason: "Deploying",
}},
},
}, {
Expand Down Expand Up @@ -147,7 +149,7 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Expand All @@ -158,12 +160,45 @@ func TestTransformDeploymentStatus(t *testing.T) {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something bag happened",
Message: "Something bad happened",
}},
},
}, {
name: "replica failure has priority over progressing",
ds: &appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}},
},
want: &duckv1.Status{
Conditions: []apis.Condition{{
Type: DeploymentConditionProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressingReason",
Message: "Something bad happened",
}, {
Type: DeploymentConditionReplicaSetReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}, {
Type: DeploymentConditionReady,
Status: corev1.ConditionFalse,
Reason: "ReplicaSetReason",
Message: "Something really bad happened",
}},
},
}}
Expand Down
7 changes: 0 additions & 7 deletions pkg/apis/serving/v1/revision_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package v1
import (
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
net "knative.dev/networking/pkg/apis/networking"
"knative.dev/pkg/kmeta"
Expand Down Expand Up @@ -144,9 +143,3 @@ func (rs *RevisionStatus) IsActivationRequired() bool {
c := revisionCondSet.Manage(rs).GetCondition(RevisionConditionActive)
return c != nil && c.Status != corev1.ConditionTrue
}

// IsReplicaSetFailure returns true if the deployment replicaset failed to create
func (rs *RevisionStatus) IsReplicaSetFailure(deploymentStatus *appsv1.DeploymentStatus) bool {
ds := serving.TransformDeploymentStatus(deploymentStatus)
return ds != nil && ds.GetCondition(serving.DeploymentConditionReplicaSetReady).IsFalse()
}
43 changes: 0 additions & 43 deletions pkg/apis/serving/v1/revision_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"testing"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"

Expand Down Expand Up @@ -268,45 +267,3 @@ func TestSetRoutingState(t *testing.T) {
t.Error("Expected default value for unparsable annotationm but got:", got)
}
}

func TestIsReplicaSetFailure(t *testing.T) {
revisionStatus := RevisionStatus{}
cases := []struct {
name string
status appsv1.DeploymentStatus
IsReplicaSetFailure bool
}{{
name: "empty deployment status should not be a failure",
status: appsv1.DeploymentStatus{},
}, {
name: "Ready deployment status should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue,
}},
},
}, {
name: "ReplicasetFailure true should be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue,
}},
},
IsReplicaSetFailure: true,
}, {
name: "ReplicasetFailure false should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionFalse,
}},
},
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got, want := revisionStatus.IsReplicaSetFailure(&tc.status), tc.IsReplicaSetFailure; got != want {
t.Errorf("IsReplicaSetFailure = %v, want: %v", got, want)
}
})
}
}
23 changes: 13 additions & 10 deletions pkg/apis/serving/v1/revision_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS

// PropagateAutoscalerStatus propagates autoscaler's status to the revision's status.
func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) {
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()

// Reflect the PA status in our own.
cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady)
rs.ActualReplicas = nil
Expand All @@ -183,20 +185,29 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
}

if cond == nil {
rs.MarkActiveUnknown("Deploying", "")
rs.MarkActiveUnknown(ReasonDeploying, "")

if !resUnavailable {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}
return
}

// Don't mark the resources available, if deployment status already determined
// it isn't so.
resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse()
if ps.IsScaleTargetInitialized() && !resUnavailable {
// Precondition for PA being initialized is SKS being active and
// that implies that |service.endpoints| > 0.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}

// Mark resource unavailable if we don't have a Service Name and the deployment is ready
// This can happen when we have initial scale set to 0
if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" {
rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
}

switch cond.Status {
case corev1.ConditionUnknown:
rs.MarkActiveUnknown(cond.Reason, cond.Message)
Expand All @@ -222,14 +233,6 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
rs.MarkActiveFalse(cond.Reason, cond.Message)
case corev1.ConditionTrue:
rs.MarkActiveTrue()

// Precondition for PA being active is SKS being active and
// that implies that |service.endpoints| > 0.
//
// Note: This is needed for backwards compatibility as we're adding the new
// ScaleTargetInitialized condition to gate readiness.
rs.MarkResourcesAvailableTrue()
rs.MarkContainerHealthyTrue()
}
}

Expand Down
68 changes: 43 additions & 25 deletions pkg/apis/serving/v1/revision_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package v1

import (
"fmt"
"sort"
"testing"

Expand Down Expand Up @@ -536,6 +537,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becomes ready, making us active.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -551,6 +553,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler flipping back to Unknown causes Active become ongoing immediately.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -566,6 +569,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) {

// PodAutoscaler becoming unready makes Active false, but doesn't affect readiness.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand All @@ -582,32 +586,45 @@ func TestPropagateAutoscalerStatus(t *testing.T) {
apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t)
}

func TestPAResAvailableNoOverride(t *testing.T) {
r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) {
// Cases to test Ready condition
// we fix ScaleTargetInitialized to True
cases := []corev1.ConditionStatus{
corev1.ConditionTrue,
corev1.ConditionFalse,
corev1.ConditionUnknown,
}

// PodAutoscaler achieved initial scale.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: corev1.ConditionUnknown,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})
// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)
cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PA Status propagation overrode the ResourcesAvailable status")
for _, tc := range cases {
t.Run(fmt.Sprintf("ready is %v", tc), func(t *testing.T) {

r := &RevisionStatus{}
r.InitializeConditions()
apistest.CheckConditionOngoing(r, RevisionConditionReady, t)

// Deployment determined that something's wrong, e.g. the only pod
// has crashed.
r.MarkResourcesAvailableFalse("somehow", "somewhere")
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Status: tc,
}, {
Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
Status: corev1.ConditionTrue,
}},
},
})

// Verify we did not override this.
apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t)

cond := r.GetCondition(RevisionConditionResourcesAvailable)
if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant {
t.Error("PodAutoscalerStatus propagation overrode the ResourcesAvailable status")
}
})
}
}

Expand Down Expand Up @@ -671,6 +688,7 @@ func TestPropagateAutoscalerStatusRace(t *testing.T) {

// The PodAutoscaler might have been ready but it's scaled down already.
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
ServiceName: "some-service",
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: autoscalingv1alpha1.PodAutoscalerConditionReady,
Expand Down
Loading

0 comments on commit 1760f08

Please sign in to comment.