Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Short circuiting backoff #814

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ spec:
spec:
description: Specification of machine health check policy
properties:
failedNodeStartupTimeout:
description: Failed Machines that are older than this value and are
without a nodeRef or a providerID will be considered to have passed
the time period allocated for a manual fix and will be remediated.
Expects an unsigned duration string of decimal numbers each with
optional fraction and a unit suffix, eg "300ms", "1.5h" or "2h45m".
Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
maxUnhealthy:
anyOf:
- type: integer
Expand Down
15 changes: 11 additions & 4 deletions pkg/controller/machinehealthcheck/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ func (r *ReconcileMachineHealthCheck) Reconcile(ctx context.Context, request rec
}

// health check all targets and reconcile mhc status
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout.Duration)
currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout.Duration, mhc.Spec.FailedNodeStartupTimeout.Duration)
healthyCount := len(currentHealthy)
mhc.Status.CurrentHealthy = &healthyCount
mhc.Status.ExpectedMachines = &totalTargets
Expand Down Expand Up @@ -459,13 +459,13 @@ func (r *ReconcileMachineHealthCheck) reconcileStatus(baseToPatch client.Patch,

// healthCheckTargets health checks a slice of targets
// and gives a data to measure the average health
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration) ([]target, []target, []time.Duration, []error) {
func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration, failedNodeStartupTimeout time.Duration) ([]target, []target, []time.Duration, []error) {
var errList []error
var needRemediationTargets, currentHealthy []target
var nextCheckTimes []time.Duration
for _, t := range targets {
klog.V(3).Infof("Reconciling %s: health checking", t.string())
needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode)
needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode, failedNodeStartupTimeout)
if err != nil {
klog.Errorf("Reconciling %s: error health checking: %v", t.string(), err)
errList = append(errList, err)
Expand Down Expand Up @@ -758,12 +758,19 @@ func (t *target) nodeName() string {
return ""
}

func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration) (bool, time.Duration, error) {
func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration, failedNodeStartupTimeout time.Duration) (bool, time.Duration, error) {
var nextCheckTimes []time.Duration
now := time.Now()

// machine has failed
if derefStringPointer(t.Machine.Status.Phase) == machinePhaseFailed {
// A machine that fails without getting a NodeRef of ProviderID means that there was a fatal configuration error.
// Typically remediating this machine will not actually resolve the issue.
// Delay remediation for this machine to allow manual intervention to resolve the configuration issue.
if (t.Machine.Status.NodeRef == nil || t.Machine.Spec.ProviderID == nil) && now.Sub(t.Machine.CreationTimestamp.Time) <= failedNodeStartupTimeout {
klog.Warningf("Machine %q is in failed phase, remediation is skipped to allow manual intervention", t.Machine.Name)
return false, time.Duration(0), nil
}
klog.V(3).Infof("%s: unhealthy: machine phase is %q", t.string(), machinePhaseFailed)
return true, time.Duration(0), nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ func TestReconcile(t *testing.T) {
machineHealthCheck := maotesting.NewMachineHealthCheck("machineHealthCheck")
nodeStartupTimeout := 15 * time.Minute
machineHealthCheck.Spec.NodeStartupTimeout = &metav1.Duration{Duration: nodeStartupTimeout}
machineHealthCheck.Spec.FailedNodeStartupTimeout = metav1.Duration{Duration: time.Hour}

machineHealthCheckNegativeMaxUnhealthy := maotesting.NewMachineHealthCheck("machineHealthCheckNegativeMaxUnhealthy")
negativeOne := intstr.FromInt(-1)
Expand All @@ -232,6 +233,19 @@ func TestReconcile(t *testing.T) {
machineHealthCheckPaused.Annotations = make(map[string]string)
machineHealthCheckPaused.Annotations[PausedAnnotation] = "test"

// Failed Machine with no node
failedVar := machinePhaseFailed
failedMachineWithoutNode := maotesting.NewMachine("failedMachineWithoutNode", "")
failedMachineWithoutNode.Status.Phase = &failedVar
failedMachineWithoutNode.CreationTimestamp = metav1.Now()

// Failed Machine with node
failedMachineWithNodeAndProviderId := maotesting.NewMachine("failedMachineWithoutNode", "recentlyUnhealthy")
failedMachineWithNodeAndProviderId.Status.Phase = &failedVar
mockProviderId := "mockProviderId"
failedMachineWithNodeAndProviderId.Spec.ProviderID = &mockProviderId
failedMachineWithNodeAndProviderId.CreationTimestamp = metav1.Now()

// remediationExternal
nodeUnhealthyForTooLong := maotesting.NewNode("nodeUnhealthyForTooLong", false)
nodeUnhealthyForTooLong.Annotations = map[string]string{
Expand Down Expand Up @@ -457,6 +471,44 @@ func TestReconcile(t *testing.T) {
},
},
},
{
name: "failed machine with no node",
machine: failedMachineWithoutNode,
node: nodeAlreadyDeleted,
mhc: machineHealthCheck,
expected: expectedReconcile{
result: reconcile.Result{},
error: false,
},
expectedEvents: []string{},
expectedStatus: &machinev1.MachineHealthCheckStatus{
ExpectedMachines: IntPtr(1),
CurrentHealthy: IntPtr(0),
RemediationsAllowed: 0,
Conditions: machinev1.Conditions{
remediationAllowedCondition,
},
},
},
{
name: "failed machine with node",
machine: failedMachineWithNodeAndProviderId,
node: nodeRecentlyUnhealthy,
mhc: machineHealthCheck,
expected: expectedReconcile{
result: reconcile.Result{},
error: false,
},
expectedEvents: []string{EventMachineDeleted},
expectedStatus: &machinev1.MachineHealthCheckStatus{
ExpectedMachines: IntPtr(1),
CurrentHealthy: IntPtr(0),
RemediationsAllowed: 0,
Conditions: machinev1.Conditions{
remediationAllowedCondition,
},
},
},
{
name: "machine unhealthy with MHC negative maxUnhealthy",
machine: machineUnhealthyForTooLong,
Expand Down Expand Up @@ -1537,6 +1589,7 @@ func TestGetNodeFromMachine(t *testing.T) {

func TestNeedsRemediation(t *testing.T) {
knownDate := metav1.Time{Time: time.Date(1985, 06, 03, 0, 0, 0, 0, time.Local)}
providerID := "mockProviderId"
machineFailed := machinePhaseFailed
testCases := []struct {
testCase string
Expand All @@ -1545,6 +1598,7 @@ func TestNeedsRemediation(t *testing.T) {
expectedNeedsRemediation bool
expectedNextCheck time.Duration
expectedError bool
failedNodeStartupTimeout time.Duration
}{
{
testCase: "healthy: does not met conditions criteria",
Expand Down Expand Up @@ -1830,18 +1884,21 @@ func TestNeedsRemediation(t *testing.T) {
expectedError: false,
},
{
testCase: "unhealthy: machine phase failed",
testCase: "unhealthy: machine phase failed without node",
target: &target{
Machine: machinev1.Machine{
TypeMeta: metav1.TypeMeta{Kind: "Machine"},
ObjectMeta: metav1.ObjectMeta{
Annotations: make(map[string]string),
Name: "machine",
Namespace: namespace,
Labels: map[string]string{"foo": "bar"},
OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}},
Annotations: make(map[string]string),
Name: "machine",
Namespace: namespace,
Labels: map[string]string{"foo": "bar"},
OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}},
CreationTimestamp: metav1.Now(),
},
Spec: machinev1.MachineSpec{
ProviderID: &providerID,
},
Spec: machinev1.MachineSpec{},
Status: machinev1.MachineStatus{
Phase: &machineFailed,
},
Expand Down Expand Up @@ -1878,9 +1935,71 @@ func TestNeedsRemediation(t *testing.T) {
},
},
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: false,
expectedNextCheck: time.Duration(0),
expectedError: false,
failedNodeStartupTimeout: time.Minute,
},
{
testCase: "unhealthy: machine phase failed with node",
target: &target{
Machine: machinev1.Machine{
TypeMeta: metav1.TypeMeta{Kind: "Machine"},
ObjectMeta: metav1.ObjectMeta{
Annotations: make(map[string]string),
Name: "machine",
Namespace: namespace,
Labels: map[string]string{"foo": "bar"},
OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}},
CreationTimestamp: metav1.Now(),
},
Spec: machinev1.MachineSpec{
ProviderID: &providerID,
},
Status: machinev1.MachineStatus{
Phase: &machineFailed,
NodeRef: &corev1.ObjectReference{
Name: "nodeUnhealthy",
Namespace: metav1.NamespaceNone,
},
},
},
Node: maotesting.NewNode("nodeUnhealthy", false),
MHC: machinev1.MachineHealthCheck{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
Namespace: namespace,
},
TypeMeta: metav1.TypeMeta{
Kind: "MachineHealthCheck",
},
Spec: machinev1.MachineHealthCheckSpec{
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"foo": "bar",
},
},
UnhealthyConditions: []machinev1.UnhealthyCondition{
{
Type: "Ready",
Status: "Unknown",
Timeout: metav1.Duration{Duration: 300 * time.Second},
},
{
Type: "Ready",
Status: "False",
Timeout: metav1.Duration{Duration: 300 * time.Second},
},
},
},
Status: machinev1.MachineHealthCheckStatus{},
},
},
timeoutForMachineToHaveNode: defaultNodeStartupTimeout,
expectedNeedsRemediation: true,
expectedNextCheck: time.Duration(0),
expectedError: false,
failedNodeStartupTimeout: time.Minute,
},
{
testCase: "healthy: meet conditions criteria but timeout",
Expand Down Expand Up @@ -1961,7 +2080,7 @@ func TestNeedsRemediation(t *testing.T) {

for _, tc := range testCases {
t.Run(tc.testCase, func(t *testing.T) {
needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode)
needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode, tc.failedNodeStartupTimeout)
if needsRemediation != tc.expectedNeedsRemediation {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, needsRemediation, tc.expectedNeedsRemediation)
}
Expand Down Expand Up @@ -2642,7 +2761,7 @@ func TestHealthCheckTargets(t *testing.T) {
recorder := record.NewFakeRecorder(2)
r := newFakeReconcilerWithCustomRecorder(recorder)
t.Run(tc.testCase, func(t *testing.T) {
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode)
currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode, 0)
if len(currentHealhty) != tc.currentHealthy {
t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, currentHealhty, tc.currentHealthy)
}
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.