Skip to content

Commit

Permalink
Merge pull request #10718 from vincepri/mhc-machineinfraready
Browse files Browse the repository at this point in the history
🌱 MachineHealthCheck should take Machine's InfraReady condition into account
  • Loading branch information
k8s-ci-robot authored Jun 6, 2024
2 parents 2ed53da + b7e557b commit ad80008
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 25 deletions.
12 changes: 10 additions & 2 deletions api/v1beta1/clusterclass_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,16 @@ type MachineHealthCheckClass struct {
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
UnhealthyRange *string `json:"unhealthyRange,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
// to consider a Machine unhealthy if a corresponding Node isn't associated
// through a `Spec.ProviderID` field.
//
// The duration set in this field is compared to the greatest of:
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
// - Machine's infrastructure ready condition timestamp (if and when available)
// - Machine's metadata creation timestamp
//
// Defaults to 10 minutes.
// If you wish to disable this feature, set the value explicitly to 0.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
Expand Down
13 changes: 10 additions & 3 deletions api/v1beta1/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,16 @@ type MachineHealthCheckSpec struct {
// +kubebuilder:validation:Pattern=^\[[0-9]+-[0-9]+\]$
UnhealthyRange *string `json:"unhealthyRange,omitempty"`

// Machines older than this duration without a node will be considered to have
// failed and will be remediated.
// If not set, this value is defaulted to 10 minutes.
// NodeStartupTimeout allows setting the maximum time for MachineHealthCheck
// to consider a Machine unhealthy if a corresponding Node isn't associated
// through a `Spec.ProviderID` field.
//
// The duration set in this field is compared to the greatest of:
// - Cluster's infrastructure and control plane ready condition timestamp (if and when available)
// - Machine's infrastructure ready condition timestamp (if and when available)
// - Machine's metadata creation timestamp
//
// Defaults to 10 minutes.
// If you wish to disable this feature, set the value explicitly to 0.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
Expand Down
6 changes: 3 additions & 3 deletions api/v1beta1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 24 additions & 4 deletions config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 24 additions & 4 deletions config/crd/bases/cluster.x-k8s.io_clusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
return true, time.Duration(0)
}

// the node does not exist
// Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
if t.nodeMissing {
logger.V(3).Info("Target is unhealthy: node is missing")
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "")
Expand All @@ -122,14 +122,14 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
// Don't penalize any Machine/Node if the control plane has not been initialized
// Exception of this rule are control plane machine itself, so the first control plane machine can be remediated.
if !conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) {
logger.V(3).Info("Not evaluating target health because the control plane has not yet been initialized")
logger.V(5).Info("Not evaluating target health because the control plane has not yet been initialized")
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
return false, 0
}

// Don't penalize any Machine/Node if the cluster infrastructure is not ready.
if !conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) {
logger.V(3).Info("Not evaluating target health because the cluster infrastructure is not ready")
logger.V(5).Info("Not evaluating target health because the cluster infrastructure is not ready")
// Return a nextCheck time of 0 because we'll get requeued when the Cluster is updated.
return false, 0
}
Expand All @@ -144,18 +144,27 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi

controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ControlPlaneInitializedCondition)
clusterInfraReady := conditions.GetLastTransitionTime(t.Cluster, clusterv1.InfrastructureReadyCondition)
machineInfraReady := conditions.GetLastTransitionTime(t.Machine, clusterv1.InfrastructureReadyCondition)
machineCreationTime := t.Machine.CreationTimestamp.Time

// Use the latest of the 3 times
// Use the latest of the following timestamps.
comparisonTime := machineCreationTime
logger.V(3).Info("Determining comparison time", "machineCreationTime", machineCreationTime, "clusterInfraReadyTime", clusterInfraReady, "controlPlaneInitializedTime", controlPlaneInitialized)
logger.V(5).Info("Determining comparison time",
"machineCreationTime", machineCreationTime,
"clusterInfraReadyTime", clusterInfraReady,
"controlPlaneInitializedTime", controlPlaneInitialized,
"machineInfraReadyTime", machineInfraReady,
)
if conditions.IsTrue(t.Cluster, clusterv1.ControlPlaneInitializedCondition) && controlPlaneInitialized != nil && controlPlaneInitialized.Time.After(comparisonTime) {
comparisonTime = controlPlaneInitialized.Time
}
if conditions.IsTrue(t.Cluster, clusterv1.InfrastructureReadyCondition) && clusterInfraReady != nil && clusterInfraReady.Time.After(comparisonTime) {
comparisonTime = clusterInfraReady.Time
}
logger.V(3).Info("Using comparison time", "time", comparisonTime)
if conditions.IsTrue(t.Machine, clusterv1.InfrastructureReadyCondition) && machineInfraReady != nil && machineInfraReady.Time.After(comparisonTime) {
comparisonTime = machineInfraReady.Time
}
logger.V(5).Info("Using comparison time", "time", comparisonTime)

timeoutDuration := timeoutForMachineToHaveNode.Duration
if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,23 @@ func TestHealthCheckTargets(t *testing.T) {
}

testMachine := newTestMachine("machine1", namespace, clusterName, "node1", mhcSelector)
testMachineWithInfraReady := testMachine.DeepCopy()
testMachineWithInfraReady.CreationTimestamp = metav1.NewTime(time.Now().Add(-100 * time.Second))
testMachineWithInfraReady.SetConditions(clusterv1.Conditions{
{
Type: clusterv1.InfrastructureReadyCondition,
Status: corev1.ConditionTrue,
Severity: clusterv1.ConditionSeverityInfo,
LastTransitionTime: metav1.NewTime(testMachineWithInfraReady.CreationTimestamp.Add(50 * time.Second)),
},
})

nodeNotYetStartedTargetAndInfraReady := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineWithInfraReady,
Node: nil,
}

// Targets for when the node has not yet been seen by the Machine controller
testMachineCreated1200s := testMachine.DeepCopy()
Expand Down Expand Up @@ -416,6 +433,13 @@ func TestHealthCheckTargets(t *testing.T) {
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 400*time.Second},
},
{
desc: "when the node has not yet started for shorter than the timeout, and infra is ready",
targets: []healthCheckTarget{nodeNotYetStartedTargetAndInfraReady},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{},
expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 50*time.Second},
},
{
desc: "when the node has not yet started for longer than the timeout",
targets: []healthCheckTarget{nodeNotYetStartedTarget1200s},
Expand Down

0 comments on commit ad80008

Please sign in to comment.