From 3e6b1faeed08ec8784193f3066008deec98c0293 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Thu, 9 May 2024 17:00:27 +0200 Subject: [PATCH 1/7] Update README with the new remediationStrategy spec Signed-off-by: Carlo Lobrano --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f8056022..a97810ba 100644 --- a/README.md +++ b/README.md @@ -152,10 +152,18 @@ metadata: name: fenceagentsremediationtemplate-default namespace: default spec: - template: {} + template: + spec: + remediationStrategy: ``` > *Note*: FenceAgentsRemediationTemplate CR must be created in the same namespace that the FAR operator has been installed. + +The `.spec.template.spec.remediation_strategy` field can either be `ResourceDeletion` or `OutOfServiceTaint`: + +- `ResourceDeletion`: This remediation strategy removes the pods on the node, rather than the removal of the node object. This strategy recovers workloads faster. +- `OutOfServiceTaint`: This remediation strategy implicitly causes the removal of the pods and associated volume attachments on the node, rather than the removal of the node object. It achieves this by placing the `OutOfServiceTaint` taint on the node. The `OutOfServiceTaint` strategy also represents a non-graceful node shutdown. A non-graceful node shutdown occurs when a node is shutdown and not detected, instead of triggering an in-operating system shutdown. + Configuring NodeHealthCheck to use the example `fenceagentsremediationtemplate-default` template above. 
From 36333d0a6bb8777a190963115cfe52c8d1632737 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Mon, 13 May 2024 17:01:26 +0200 Subject: [PATCH 2/7] Update alm-example with remediation_strategy field Signed-off-by: Carlo Lobrano --- .../fence-agents-remediation.clusterserviceversion.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index 4b658842..4a23967c 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -11,6 +11,7 @@ metadata: "name": "worker-1" }, "spec": { + "remediation_strategy": "OutOfServiceTaint", "agent": "fence_ipmilan", "nodeparameters": { "--ipport": { From 24810c9a9e9f27c72123959dd6b73490d7d807e6 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Tue, 14 May 2024 18:06:39 +0200 Subject: [PATCH 3/7] Move description of remediationStrategy to FAR CR Signed-off-by: Carlo Lobrano --- README.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a97810ba..a2e91ab2 100644 --- a/README.md +++ b/README.md @@ -152,18 +152,10 @@ metadata: name: fenceagentsremediationtemplate-default namespace: default spec: - template: - spec: - remediationStrategy: + template: {} ``` > *Note*: FenceAgentsRemediationTemplate CR must be created in the same namespace that the FAR operator has been installed. - -The `.spec.template.spec.remediation_strategy` field can either be `ResourceDeletion` or `OutOfServiceTaint`: - -- `ResourceDeletion`: This remediation strategy removes the pods on the node, rather than the removal of the node object. This strategy recovers workloads faster. -- `OutOfServiceTaint`: This remediation strategy implicitly causes the removal of the pods and associated volume attachments on the node, rather than the removal of the node object. 
It achieves this by placing the `OutOfServiceTaint` taint on the node. The `OutOfServiceTaint` strategy also represents a non-graceful node shutdown. A non-graceful node shutdown occurs when a node is shutdown and not detected, instead of triggering an in-operating system shutdown. - Configuring NodeHealthCheck to use the example `fenceagentsremediationtemplate-default` template above. @@ -200,6 +192,9 @@ The CR includes the following parameters: * `retrycount` - number of times to retry the fence agent in case of failure. The default is 5. * `retryinterval` - interval between retries in seconds. The default is "5s". * `timeout` - timeout for the fence agent in seconds. The default is "60s". +* `remediationStrategy` - either `ResourceDeletion` or `OutOfServiceTaint`: + * `ResourceDeletion`: This remediation strategy removes the pods on the node. + * `OutOfServiceTaint`: This remediation strategy implicitly causes the removal of the pods and associated volume attachments on the node. It achieves this by placing the [`OutOfServiceTaint` taint](https://kubernetes.io/docs/reference/labels-annotations-taints/#node-kubernetes-io-out-of-service) on the node. The FenceAgentsRemediation CR is created by the administrator and is used to trigger the fence agent on a specific node. The CR includes an *agent* field for the fence agent name, *sharedparameters* field with all the shared, not specific to a node, parameters, and a *nodeparameters* field to specify the parameters for the fenced node. 
For better understanding please see the below example of FenceAgentsRemediation CR for node `worker-1` (see it also as the [sample FAR](https://github.com/medik8s/fence-agents-remediation/blob/main/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml)): @@ -228,6 +223,7 @@ spec: worker-0: "6233" worker-1: "6234" worker-2: "6235" + remediationStrategy: OutOfServiceTaint ``` ## Tests From 3ff441e196d5d5ab1829adf2e80ee08c33227115 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Tue, 14 May 2024 18:07:41 +0200 Subject: [PATCH 4/7] Update config/sample with remediationStrategy field Signed-off-by: Carlo Lobrano --- .../fence-agents-remediation.clusterserviceversion.yaml | 2 +- ...ence-agents-remediation_v1alpha1_fenceagentsremediation.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index 4a23967c..1772443c 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -11,7 +11,6 @@ metadata: "name": "worker-1" }, "spec": { - "remediation_strategy": "OutOfServiceTaint", "agent": "fence_ipmilan", "nodeparameters": { "--ipport": { @@ -23,6 +22,7 @@ metadata: "worker-2": "6235" } }, + "remediationStrategy": "ResourceDeletion", "retrycount": 5, "retryinterval": "5s", "sharedparameters": { diff --git a/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml b/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml index 84f021fa..41002450 100644 --- a/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml +++ b/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml @@ -21,3 +21,4 @@ spec: worker-0: "6233" worker-1: "6234" worker-2: "6235" + remediationStrategy: ResourceDeletion From 
0a550e0e4cb0c868a6092824315fbbd8b4963803 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Wed, 15 May 2024 09:49:20 +0200 Subject: [PATCH 5/7] Use ResourceDeletion as remediationStrategy in CR example Signed-off-by: Carlo Lobrano --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2e91ab2..f1e21eb8 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ spec: worker-0: "6233" worker-1: "6234" worker-2: "6235" - remediationStrategy: OutOfServiceTaint + remediationStrategy: ResourceDeletion ``` ## Tests From dba7b66200c21371b593a28e6271dc53315737a4 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Wed, 15 May 2024 11:15:16 +0200 Subject: [PATCH 6/7] Add missing spec descriptor for RemediationStrategy field Signed-off-by: Carlo Lobrano --- api/v1alpha1/fenceagentsremediation_types.go | 1 + ...nts-remediation.clusterserviceversion.yaml | 20 +++++++++++++++++++ ...nts-remediation.clusterserviceversion.yaml | 20 +++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 89ef2a00..f0c21605 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -96,6 +96,7 @@ type FenceAgentsRemediationSpec struct { // that enables automatic deletion of pv-attached pods on failed nodes, "out-of-service" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+. 
// +kubebuilder:default:="ResourceDeletion" // +kubebuilder:validation:Enum=ResourceDeletion;OutOfServiceTaint + // +operator-sdk:csv:customresourcedefinitions:type=spec RemediationStrategy RemediationStrategyType `json:"remediationStrategy,omitempty"` } diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index 1772443c..34f82019 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -84,6 +84,16 @@ spec: node that is fenced, since they are node specific displayName: Node Parameters path: nodeparameters + - description: RemediationStrategy is the remediation method for unhealthy nodes. + Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion". + ResourceDeletion will iterate over all pods related to the unhealthy node + and delete them. OutOfServiceTaint will add the out-of-service taint which + is a new well-known taint "node.kubernetes.io/out-of-service" that enables + automatic deletion of pv-attached pods on failed nodes, "out-of-service" + taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version + 4.13+. + displayName: Remediation Strategy + path: remediationStrategy - description: RetryCount is the number of times the fencing agent will be executed displayName: Retry Count path: retrycount @@ -130,6 +140,16 @@ spec: node that is fenced, since they are node specific displayName: Node Parameters path: template.spec.nodeparameters + - description: RemediationStrategy is the remediation method for unhealthy nodes. + Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion". + ResourceDeletion will iterate over all pods related to the unhealthy node + and delete them. 
OutOfServiceTaint will add the out-of-service taint which + is a new well-known taint "node.kubernetes.io/out-of-service" that enables + automatic deletion of pv-attached pods on failed nodes, "out-of-service" + taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version + 4.13+. + displayName: Remediation Strategy + path: template.spec.remediationStrategy - description: RetryCount is the number of times the fencing agent will be executed displayName: Retry Count path: template.spec.retrycount diff --git a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml index b1539a23..d75d98b4 100644 --- a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml @@ -39,6 +39,16 @@ spec: node that is fenced, since they are node specific displayName: Node Parameters path: nodeparameters + - description: RemediationStrategy is the remediation method for unhealthy nodes. + Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion". + ResourceDeletion will iterate over all pods related to the unhealthy node + and delete them. OutOfServiceTaint will add the out-of-service taint which + is a new well-known taint "node.kubernetes.io/out-of-service" that enables + automatic deletion of pv-attached pods on failed nodes, "out-of-service" + taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version + 4.13+. + displayName: Remediation Strategy + path: remediationStrategy - description: RetryCount is the number of times the fencing agent will be executed displayName: Retry Count path: retrycount @@ -85,6 +95,16 @@ spec: node that is fenced, since they are node specific displayName: Node Parameters path: template.spec.nodeparameters + - description: RemediationStrategy is the remediation method for unhealthy nodes. 
+ Currently, it could be either "OutOfServiceTaint" or "ResourceDeletion". + ResourceDeletion will iterate over all pods related to the unhealthy node + and delete them. OutOfServiceTaint will add the out-of-service taint which + is a new well-known taint "node.kubernetes.io/out-of-service" that enables + automatic deletion of pv-attached pods on failed nodes, "out-of-service" + taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version + 4.13+. + displayName: Remediation Strategy + path: template.spec.remediationStrategy - description: RetryCount is the number of times the fencing agent will be executed displayName: Retry Count path: template.spec.retrycount From 4f9eb8d7259b029a14116ad4bc34c16c733228e2 Mon Sep 17 00:00:00 2001 From: Carlo Lobrano Date: Wed, 22 May 2024 08:20:29 +0200 Subject: [PATCH 7/7] Rephrase remediation strategies section Use "delete" in place of "remove" as it is more appropriate, and put Out-of-service taint first in the list. Signed-off-by: Carlo Lobrano --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f1e21eb8..0653a967 100644 --- a/README.md +++ b/README.md @@ -192,9 +192,9 @@ The CR includes the following parameters: * `retrycount` - number of times to retry the fence agent in case of failure. The default is 5. * `retryinterval` - interval between retries in seconds. The default is "5s". * `timeout` - timeout for the fence agent in seconds. The default is "60s". -* `remediationStrategy` - either `ResourceDeletion` or `OutOfServiceTaint`: - * `ResourceDeletion`: This remediation strategy removes the pods on the node. - * `OutOfServiceTaint`: This remediation strategy implicitly causes the removal of the pods and associated volume attachments on the node. It achieves this by placing the [`OutOfServiceTaint` taint](https://kubernetes.io/docs/reference/labels-annotations-taints/#node-kubernetes-io-out-of-service) on the node. 
+* `remediationStrategy` - either `OutOfServiceTaint` or `ResourceDeletion`. The default is `ResourceDeletion`. + * `OutOfServiceTaint`: This remediation strategy implicitly causes the deletion of the pods and the detachment of the associated volumes on the node. It achieves this by placing the [`node.kubernetes.io/out-of-service` taint](https://kubernetes.io/docs/reference/labels-annotations-taints/#node-kubernetes-io-out-of-service) on the node. + * `ResourceDeletion`: This remediation strategy deletes the pods on the node. The FenceAgentsRemediation CR is created by the administrator and is used to trigger the fence agent on a specific node. The CR includes an *agent* field for the fence agent name, *sharedparameters* field with all the shared, not specific to a node, parameters, and a *nodeparameters* field to specify the parameters for the fenced node. For better understanding please see the below example of FenceAgentsRemediation CR for node `worker-1` (see it also as the [sample FAR](https://github.com/medik8s/fence-agents-remediation/blob/main/config/samples/fence-agents-remediation_v1alpha1_fenceagentsremediation.yaml)):