Skip to content

Commit

Permalink
Merge pull request #10718 from vincepri/mhc-machineinfraready
Browse files Browse the repository at this point in the history
🌱 MachineHealthCheck should take Machine's InfraReady condition into account
  • Loading branch information
k8s-ci-robot authored Jun 6, 2024
2 parents 2ed53da + b7e557b commit ad80008
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 25 deletions.
12 changes: 10 additions & 2 deletions api/v1beta1/clusterclass_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,16 @@ type MachineHealthCheckClass struct {
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
UnhealthyRange *string `json:"unhealthyRange,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
// to consider a Machine unhealthy if a corresponding Node isn't associated
// through a `Spec.ProviderID` field.
//
// The duration set in this field is compared to the greatest of:
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
// - Machine's infrastructure ready condition timestamp (if and when available)
// - Machine's metadata creation timestamp
//
// Defaults to 10 minutes.
// If you wish to disable this feature, set the value explicitly to 0.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
Expand Down
13 changes: 10 additions & 3 deletions api/v1beta1/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,16 @@ type MachineHealthCheckSpec struct {
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
UnhealthyRange *string `json:"unhealthyRange,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// If not set, this value is defaulted to 10 minutes.
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
// to consider a Machine unhealthy if a corresponding Node isn't associated
// through a `Spec.ProviderID` field.
//
// The duration set in this field is compared to the greatest of:
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
// - Machine's infrastructure ready condition timestamp (if and when available)
// - Machine's metadata creation timestamp
//
// Defaults to 10 minutes.
// If you wish to disable this feature, set the value explicitly to 0.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
Expand Down
6 changes: 3 additions & 3 deletions api/v1beta1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 24 additions & 4 deletions config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 24 additions & 4 deletions config/crd/bases/cluster.x-k8s.io_clusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
return true, time.Duration(0)
}

// the node does not exist
// Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
if t.nodeMissing {
logger.V(3).Info("Target is unhealthy: node is missing")
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
Expand All @@ -122,14 +122,14 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
// Don't penalize any Machine/Node if the control plane has not been initialized
// Exception of this rule are control plane machine itself, so the first control plane machine can be remediated.
if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
logger.V(5).Info("Not evaluating target health because the control plane has not yet been initialized")
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
return false, 0
}

// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
logger.V(5).Info("Not evaluating target health because the cluster infrastructure is not ready")
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
return false, 0
}
Expand All @@ -144,18 +144,27 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi

controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
machineInfraReady := conditions.GetLastTransitionTime(t.Machine, clusterv1.InfrastructureReadyCondition)
machineCreationTime := t.Machine.CreationTimestamp.Time

// Use the latest of the 3 times
// Use the latest of the following timestamps.
comparisonTime := machineCreationTime
logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
logger.V(5).Info("Determining comparison time",
"machineCreationTime", machineCreationTime,
"clusterInfraReadyTime", clusterInfraReady,
"controlPlaneInitializedTime", controlPlaneInitialized,
"machineInfraReadyTime", machineInfraReady,
)
if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
comparisonTime = controlPlaneInitialized.Time
}
if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
comparisonTime = clusterInfraReady.Time
}
logger.V(3).Info("Using comparison time", "time", comparisonTime)
if conditions.IsTrue(t.Machine, clusterv1.InfrastructureReadyCondition) && machineInfraReady != nil && machineInfraReady.Time.After(comparisonTime) {
comparisonTime = machineInfraReady.Time
}
logger.V(5).Info("Using comparison time", "time", comparisonTime)

timeoutDuration := timeoutForMachineToHaveNode.Duration
if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,23 @@ func TestHealthCheckTargets(t *testing.T) {
}

testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)
testMachineWithInfraReady := testMachine.DeepCopy()
testMachineWithInfraReady.CreationTimestamp = metav1.NewTime(time.Now().Add(-100 * time.Second))
testMachineWithInfraReady.SetConditions(clusterv1.Conditions{
{
Type: clusterv1.InfrastructureReadyCondition,
Status: corev1.ConditionTrue,
Severity: clusterv1.ConditionSeverityInfo,
LastTransitionTime: metav1.NewTime(testMachineWithInfraReady.CreationTimestamp.Add(50 * time.Second)),
},
})

nodeNotYetStartedTargetAndInfraReady := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineWithInfraReady,
Node: nil,
}

// Targets for when the node has not yet been seen by the Machine controller
testMachineCreated1200s := testMachine.DeepCopy()
Expand Down Expand Up @@ -416,6 +433,13 @@ func TestHealthCheckTargets(t *testing.T) {
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
},
{
desc: "when the node has not yet started for shorter than the timeout, and infra is ready",
targets: []healthCheckTarget{nodeNotYetStartedTargetAndInfraReady},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 50*time.Second},
},
{
desc: "when the node has not yet started for longer than the timeout",
targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
Expand Down

0 comments on commit ad80008

Please sign in to comment.