diff --git a/charts/aws-ebs-csi-driver/templates/clusterrole-csi-node.yaml b/charts/aws-ebs-csi-driver/templates/clusterrole-csi-node.yaml index 3ca368efb6..3e5382216c 100644 --- a/charts/aws-ebs-csi-driver/templates/clusterrole-csi-node.yaml +++ b/charts/aws-ebs-csi-driver/templates/clusterrole-csi-node.yaml @@ -8,4 +8,4 @@ metadata: rules: - apiGroups: [""] resources: ["nodes"] - verbs: ["get"] + verbs: ["get", "patch"] diff --git a/charts/aws-ebs-csi-driver/templates/node.yaml b/charts/aws-ebs-csi-driver/templates/node.yaml index 520c09ea17..e5a561ae00 100644 --- a/charts/aws-ebs-csi-driver/templates/node.yaml +++ b/charts/aws-ebs-csi-driver/templates/node.yaml @@ -58,6 +58,9 @@ spec: {{- with .Values.node.volumeAttachLimit }} - --volume-attach-limit={{ . }} {{- end }} + {{- if .Values.node.startUpTaint }} + - --start-up-taint=true + {{- end }} {{- with .Values.node.loggingFormat }} - --logging-format={{ . }} {{- end }} diff --git a/charts/aws-ebs-csi-driver/values.yaml b/charts/aws-ebs-csi-driver/values.yaml index 4877fe4f3b..70d3dc7036 100644 --- a/charts/aws-ebs-csi-driver/values.yaml +++ b/charts/aws-ebs-csi-driver/values.yaml @@ -273,6 +273,7 @@ node: enableWindows: false # The "maximum number of attachable volumes" per node volumeAttachLimit: + startUpTaint: false updateStrategy: type: RollingUpdate rollingUpdate: diff --git a/cmd/main.go b/cmd/main.go index 53bc2c7a5a..dbd874bd01 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -61,6 +61,7 @@ func main() { driver.WithKubernetesClusterID(options.ControllerOptions.KubernetesClusterID), driver.WithAwsSdkDebugLog(options.ControllerOptions.AwsSdkDebugLog), driver.WithWarnOnInvalidTag(options.ControllerOptions.WarnOnInvalidTag), + driver.WithStartupTaint(options.NodeOptions.StartupTaintRemoval), ) if err != nil { klog.ErrorS(err, "failed to create driver") diff --git a/cmd/options/node_options.go b/cmd/options/node_options.go index 1374016f1d..12d51281d7 100644 --- a/cmd/options/node_options.go +++ b/cmd/options/node_options.go @@ -31,8 +31,13 @@ type NodeOptions struct { // itself (dynamically discovering the maximum number of attachable volume per EC2 machine type, see also // https://github.com/kubernetes-sigs/aws-ebs-csi-driver/issues/347). VolumeAttachLimit int64 + + // StartupTaintRemoval configures the node service behavior whether to remove customized preset taint `agent-not-ready` + // on the node. This taint prevent any workload pods from scheduling prior the csi node service is startup. + StartupTaintRemoval bool } func (o *NodeOptions) AddFlags(fs *flag.FlagSet) { fs.Int64Var(&o.VolumeAttachLimit, "volume-attach-limit", -1, "Value for the maximum number of volumes attachable per node. If specified, the limit applies to all nodes. If not specified, the value is approximated from the instance type.") + fs.BoolVar(&o.StartupTaintRemoval, "start-up-taint", false, "To enable the node service remove node-ready taint after startup (default to false).") } diff --git a/deploy/kubernetes/base/clusterrole-csi-node.yaml b/deploy/kubernetes/base/clusterrole-csi-node.yaml index f48b05092b..91c581e387 100644 --- a/deploy/kubernetes/base/clusterrole-csi-node.yaml +++ b/deploy/kubernetes/base/clusterrole-csi-node.yaml @@ -9,4 +9,4 @@ metadata: rules: - apiGroups: [""] resources: ["nodes"] - verbs: ["get"] + verbs: ["get", "patch"] diff --git a/docs/install.md b/docs/install.md index 5d241571d7..516d755322 100644 --- a/docs/install.md +++ b/docs/install.md @@ -41,6 +41,19 @@ kubectl create secret generic aws-secret \ ### Configure driver toleration settings By default, the driver controller tolerates taint `CriticalAddonsOnly` and has `tolerationSeconds` configured as `300`; and the driver node tolerates all taints. If you don't want to deploy the driver node on all nodes, please set Helm `Value.node.tolerateAllTaints` to false before deployment. Add policies to `Value.node.tolerations` to configure customized toleration for nodes. +### Configure node taint and driver start-up taint +In some cases when new node frequesntly join the cluster, workload pods can be scheduled to a new node ahead of the csi-node start-up and ready on that node. This race condition between workload pod and csi-node pod will cause the workload pod fails to mount PVC at first place. + +To help overcome this situation, CSI Driver node can manipulate Kubernetes’s taints on a given node to help preventing pods from starting before CSI Driver's node pod runs on this node. + +To configure start-up taint, the cluster administrator can places a taint with key `node.ebs.csi.aws.com/agent-not-ready` on a given uninitialized node(or node group). This prevents pods that don’t have a matching toleration from either being scheduled or altogether running on the node until the taint is removed. If use Helm to install CSI Driver, set `.Value.node.startUpTaint` to `true`. Once the CSI Driver pod runs on the node, initializes and ready, it will removes the aforementioned taint. After that, workload pods will start being scheduled and running on the node, having their networking managed by Cilium. + +The taint effect can be set to `NoSchedule` or `NoExecute`, + +* If NoSchedule is used, pods won’t be scheduled to a node until CSI Driver remove the taint. However, one practical effect of this is that if some external process (such as a reboot) resets the configuration on the node, pods that were already scheduled will be allowed to start concurrently with csi-node when the node next reboots +* If NoExecute is used (recommended), pods won’t be executed (nor scheduled) on a node until CSI Driver removes the taint. One practical effect of this is that whenever the taint is added back to the node by some external process (such as during an cluster upgrade), pods will be evicted from the node until CSI Driver remove the taint again after startup. + + ### Deploy driver You may deploy the EBS CSI driver via Kustomize, Helm, or as an [Amazon EKS managed add-on](https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html). diff --git a/docs/options.md b/docs/options.md index 8df01fc1d4..de5b3278ff 100644 --- a/docs/options.md +++ b/docs/options.md @@ -1,11 +1,13 @@ # Driver Options + There are a couple of driver options that can be passed as arguments when starting the driver container. | Option argument | value sample | default | Description | |-----------------------------|---------------------------------------------------|-----------------------------------------------------|---------------------| | endpoint | tcp://127.0.0.1:10000/ | unix:///var/lib/csi/sockets/pluginproxy/csi.sock | The socket on which the driver will listen for CSI RPCs| | volume-attach-limit | 1,2,3 ... | -1 | Value for the maximum number of volumes attachable per node. If specified, the limit applies to all nodes. If not specified, the value is approximated from the instance type| +| start-up-taint | true | false | If set to `true`, the driver will enable start-up taint support. Node taint with key `node.ebs.csi.aws.com/agent-not-ready` will be removed during the driver start-up | | extra-tags | key1=value1,key2=value2 | | Tags attached to each dynamically provisioned resource| | k8s-tag-cluster-id | aws-cluster-id-1 | | ID of the Kubernetes cluster used for tagging provisioned EBS volumes| -| aws-sdk-debug-log | true | false | If set to true, the driver will enable the aws sdk debug log level| -| logging-format | json | text | Sets the log format. Permitted formats: text, json| +| aws-sdk-debug-log | true | false | If set to `true`, the driver will enable the aws sdk debug log level| +| logging-format | json | text | Sets the log format. Permitted formats: `text`, `json`| diff --git a/pkg/cloud/patch_node.go b/pkg/cloud/patch_node.go new file mode 100644 index 0000000000..b84525426c --- /dev/null +++ b/pkg/cloud/patch_node.go @@ -0,0 +1,77 @@ +package cloud + +import ( + "context" + "encoding/json" + "fmt" + "os" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sTypes "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" +) + +type JSONPatch struct { + OP string `json:"op,omitempty"` + Path string `json:"path,omitempty"` + Value interface{} `json:"value"` +} + +// RemoveNodeTaint patched the node, removes the taint that match NodeTaintKey +func RemoveNodeTaint(k8sAPIClient KubernetesAPIClient, NodeTaintKey string) error { + nodeName := os.Getenv("CSI_NODE_NAME") + if nodeName == "" { + return fmt.Errorf("CSI_NODE_NAME env var not set") + } + + clientset, err := k8sAPIClient() + if err != nil { + return err + } + + node, err := clientset.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + + var taints []corev1.Taint + hasTaint := false + for _, taint := range node.Spec.Taints { + if taint.Key != NodeTaintKey { + taints = append(taints, taint) + } else { + hasTaint = true + klog.InfoS("Node taint found") + } + } + + if !hasTaint { + return fmt.Errorf("could not find node taint, key: %v, node: %v", NodeTaintKey, nodeName) + } + + createStatusAndNodePatch := []JSONPatch{ + { + OP: "test", + Path: "/spec/taints", + Value: node.Spec.Taints, + }, + { + OP: "replace", + Path: "/spec/taints", + Value: taints, + }, + } + + patch, err := json.Marshal(createStatusAndNodePatch) + if err != nil { + return err + } + + _, err = clientset.CoreV1().Nodes().Patch(context.TODO(), nodeName, k8sTypes.JSONPatchType, patch, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch node when removing taint: error %w", err) + } + klog.InfoS("Successfully removed taint", "key", NodeTaintKey, "node", nodeName) + return nil +} diff --git a/pkg/driver/constants.go b/pkg/driver/constants.go index d7e03d6bb7..7a157d86a6 100644 --- a/pkg/driver/constants.go +++ b/pkg/driver/constants.go @@ -161,6 +161,13 @@ const ( FSTypeNtfs = "ntfs" ) +// constants for node agent not ready taint label +const ( + // AgentNotReadyNodeTaint is a node taint which prevents pods from being + // scheduled. Once csi-node startup it is removed from the node. + AgentNotReadyNodeTaintKey = "node.ebs.csi.aws.com/agent-not-ready" +) + // BlockSizeExcludedFSTypes contains the filesystems that a custom block size is *NOT* supported on var ( BlockSizeExcludedFSTypes = map[string]struct{}{ diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 4558cf30e8..5a308746e5 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -67,6 +67,7 @@ type DriverOptions struct { kubernetesClusterID string awsSdkDebugLog bool warnOnInvalidTag bool + startupTaintRemoval bool } func NewDriver(options ...func(*DriverOptions)) (*Driver, error) { @@ -198,3 +199,9 @@ func WithWarnOnInvalidTag(warnOnInvalidTag bool) func(*DriverOptions) { o.warnOnInvalidTag = warnOnInvalidTag } } + +func WithStartupTaint(startupTaintRemoval bool) func(*DriverOptions) { + return func(o *DriverOptions) { + o.startupTaintRemoval = startupTaintRemoval + } +} diff --git a/pkg/driver/node.go b/pkg/driver/node.go index 68308fc803..555cbacca0 100644 --- a/pkg/driver/node.go +++ b/pkg/driver/node.go @@ -41,7 +41,7 @@ const ( // VolumeOperationAlreadyExists is message fmt returned to CO when there is another in-flight call on the given volumeID VolumeOperationAlreadyExists = "An operation with the given volume=%q is already in progress" - //sbeDeviceVolumeAttachmentLimit refers to the maximum number of volumes that can be attached to an instance on snow. + // sbeDeviceVolumeAttachmentLimit refers to the maximum number of volumes that can be attached to an instance on snow. sbeDeviceVolumeAttachmentLimit = 10 ) @@ -560,6 +560,12 @@ func (d *nodeService) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoReque topology := &csi.Topology{Segments: segments} + if d.driverOptions.startupTaintRemoval { + err := nodeRemoveNotReadyTaint() + if err != nil { + klog.InfoS("Node agent-not-ready taint error", "error", err) + } + } return &csi.NodeGetInfoResponse{ NodeId: d.metadata.GetInstanceID(), MaxVolumesPerNode: d.getVolumesLimit(), @@ -799,3 +805,13 @@ func collectMountOptions(fsType string, mntFlags []string) []string { } return options } + +func nodeRemoveNotReadyTaint() error { + // check and remove the taint + klog.InfoS("NodeRemoveNotReadyTaint: Node taint removal start", "Taint", AgentNotReadyNodeTaintKey) + err := cloud.RemoveNodeTaint(cloud.DefaultKubernetesAPIClient, AgentNotReadyNodeTaintKey) + if err != nil { + return err + } + return nil +}