diff --git a/examples/tfvars/single-node.tfvars b/examples/tfvars/single-node.tfvars index 29319668..ecc88ce0 100644 --- a/examples/tfvars/single-node.tfvars +++ b/examples/tfvars/single-node.tfvars @@ -23,9 +23,8 @@ single_node = { instance_type = "m6i.2xlarge" name = "dev-v2" ami = { - name_prefix = "dev-v2_" - owner = "977170443939" - + name_prefix = "amazon-eks-node-al2023-x86_64-standard-" + owner = "602401143452" } labels = { "dominodatalab.com/node-pool" = "default", @@ -41,5 +40,5 @@ storage = { } eks = { - k8s_version = "1.27" + k8s_version = "1.30" } diff --git a/modules/eks/submodules/k8s/templates/k8s-functions.sh.tftpl b/modules/eks/submodules/k8s/templates/k8s-functions.sh.tftpl index 36c035d5..0b187499 100644 --- a/modules/eks/submodules/k8s/templates/k8s-functions.sh.tftpl +++ b/modules/eks/submodules/k8s/templates/k8s-functions.sh.tftpl @@ -2,6 +2,7 @@ RED="\e[31m" GREEN="\e[32m" +YELLOW="\e[33m" EC="\e[0m" KUBECONFIG="${kubeconfig_path}" @@ -111,7 +112,9 @@ install_calico() { local sleep_duration=10 for i in $(seq 1 $max_retries); do - helm_cmd upgrade "calico-tigera-operator" \ + echo "Attempt $i of $max_retries..." + + if helm_cmd upgrade "calico-tigera-operator" \ tigera-operator \ --repo "https://projectcalico.docs.tigera.io/charts" \ --version "${calico_version}" \ @@ -125,20 +128,23 @@ install_calico() { --wait \ --timeout 10m \ --create-namespace \ - --install + --install; then - if [ $? -eq 0 ]; then + printf "$GREEN Calico installation succeeded. $EC \n" break - fi - if [ $i -lt $max_retries ]; then - echo "Attempt $i failed. Retrying in $${sleep_duration}s..." - sleep $sleep_duration else - printf "$RED Maximum attempts reached. Exiting. $EC \n" - exit 1 - fi + printf "$YELLOW Helm install attempt $i failed. $EC \n" + if [ $i -lt $max_retries ]; then + printf "Retrying in $sleep_duration s..." + sleep $sleep_duration + else + printf "$RED Maximum attempts reached. Exiting. $EC \n" + exit 1 + fi + + fi done } diff --git a/modules/external-deployments/operator_role_policies.tf b/modules/external-deployments/operator_role_policies.tf index afaa06fa..90ccbf7f 100644 --- a/modules/external-deployments/operator_role_policies.tf +++ b/modules/external-deployments/operator_role_policies.tf @@ -126,6 +126,9 @@ data "aws_iam_policy_document" "in_account_policies" { "sagemaker:DescribeModel", "sagemaker:InvokeEndpoint", "sagemaker:InvokeEndpointWithResponseStream", + "sagemaker:ListEndpointConfigs", + "sagemaker:ListEndpoints", + "sagemaker:ListModels", "sagemaker:UpdateEndpoint", "sagemaker:UpdateEndpointWeightsAndCapacities" ] diff --git a/modules/flyte/README.md b/modules/flyte/README.md index f411c348..ed61a2c5 100644 --- a/modules/flyte/README.md +++ b/modules/flyte/README.md @@ -53,7 +53,7 @@ No modules. | [kms\_info](#input\_kms\_info) | key\_id = KMS key id.
key\_arn = KMS key arn.
enabled = KMS key is enabled |
object({
key_id = string
key_arn = string
enabled = bool
})
| n/a | yes | | [platform\_namespace](#input\_platform\_namespace) | Name of Domino platform namespace for this deploy | `string` | n/a | yes | | [region](#input\_region) | AWS region for the deployment | `string` | n/a | yes | -| [serviceaccount\_names](#input\_serviceaccount\_names) | Service account names for Flyte |
object({
datacatalog = optional(string, "datacatalog")
flyteadmin = optional(string, "flyteadmin")
flytepropeller = optional(string, "flytepropeller")
})
| `{}` | no | +| [serviceaccount\_names](#input\_serviceaccount\_names) | Service account names for Flyte |
object({
datacatalog = optional(string, "datacatalog")
flyteadmin = optional(string, "flyteadmin")
flytepropeller = optional(string, "flytepropeller")
importer = optional(string, "domino-data-importer")
})
| `{}` | no | ## Outputs diff --git a/modules/flyte/iam.tf b/modules/flyte/iam.tf index 8f11f26c..ba77c66b 100644 --- a/modules/flyte/iam.tf +++ b/modules/flyte/iam.tf @@ -15,6 +15,7 @@ resource "aws_iam_role" "flyte_controlplane" { "${trimprefix(local.oidc_provider_url, "https://")}:sub" : [ "system:serviceaccount:${var.platform_namespace}:${var.serviceaccount_names.datacatalog}", "system:serviceaccount:${var.platform_namespace}:${var.serviceaccount_names.flytepropeller}", + "system:serviceaccount:${var.platform_namespace}:${var.serviceaccount_names.importer}", ] } } diff --git a/modules/flyte/variables.tf b/modules/flyte/variables.tf index 9536a831..645156c7 100644 --- a/modules/flyte/variables.tf +++ b/modules/flyte/variables.tf @@ -54,6 +54,7 @@ variable "serviceaccount_names" { datacatalog = optional(string, "datacatalog") flyteadmin = optional(string, "flyteadmin") flytepropeller = optional(string, "flytepropeller") + importer = optional(string, "domino-data-importer") }) default = {} diff --git a/modules/infra/submodules/storage/netapp.tf b/modules/infra/submodules/storage/netapp.tf index 1aff2860..adabad66 100644 --- a/modules/infra/submodules/storage/netapp.tf +++ b/modules/infra/submodules/storage/netapp.tf @@ -162,6 +162,8 @@ resource "aws_fsx_ontap_volume" "eks" { ontap_volume_type = "RW" copy_tags_to_backups = true volume_style = "FLEXVOL" + tags = local.backup_tagging + lifecycle { ignore_changes = [name, size_in_megabytes] # This volume is meant to be managed by the trident operator after initial creation. 
} diff --git a/modules/single-node/single-node.tf b/modules/single-node/single-node.tf index 4ac1cc9b..13bae7df 100644 --- a/modules/single-node/single-node.tf +++ b/modules/single-node/single-node.tf @@ -11,7 +11,7 @@ locals { "cluster" = var.eks_info.cluster.specs.name }, data.aws_default_tags.this.tags, local.node_labels) - kubelet_extra_args = "--kubelet-extra-args '--node-labels=${join(",", [for k, v in local.node_labels : format("%s=%s", k, v)])}'" + kubelet_extra_args = "--node-labels=${join(",", [for k, v in local.node_labels : format("%s=%s", k, v)])}" bootstrap_extra_args = join(" ", [local.kubelet_extra_args, var.single_node.bootstrap_extra_args]) } diff --git a/modules/single-node/templates/linux_user_data.tpl b/modules/single-node/templates/linux_user_data.tpl index 065a60d5..3b2a32ab 100644 --- a/modules/single-node/templates/linux_user_data.tpl +++ b/modules/single-node/templates/linux_user_data.tpl @@ -1,10 +1,11 @@ -#!/bin/bash -set -e -${pre_bootstrap_user_data ~} -%{ if length(cluster_service_ipv4_cidr) > 0 ~} -export SERVICE_IPV4_CIDR=${cluster_service_ipv4_cidr} -%{ endif ~} -B64_CLUSTER_CA=${cluster_auth_base64} -API_SERVER_URL=${cluster_endpoint} -/etc/eks/bootstrap.sh ${cluster_name} ${bootstrap_extra_args} --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL -${post_bootstrap_user_data ~} +--- +apiVersion: node.eks.aws/v1alpha1 +kind: NodeConfig +spec: + cluster: + name: ${cluster_name} + apiServerEndpoint: ${cluster_endpoint} + certificateAuthority: ${cluster_auth_base64} + cidr: ${cluster_service_ipv4_cidr} + kubelet: + flags: ["${bootstrap_extra_args}"] \ No newline at end of file diff --git a/tests/deploy/single-node/single-node.tfvars b/tests/deploy/single-node/single-node.tfvars index 395781ce..8a93d04c 100644 --- a/tests/deploy/single-node/single-node.tfvars +++ b/tests/deploy/single-node/single-node.tfvars @@ -2,12 +2,15 @@ single_node = { instance_type = "m6i.2xlarge" name = "dev-v2" ami = { - name_prefix = "dev-v2_" - owner = "977170443939" - + name_prefix = "amazon-eks-node-al2023-x86_64-standard-" + owner = "602401143452" } labels = { "dominodatalab.com/node-pool" = "default", "dominodatalab.com/domino-node" = "true" }, } + +eks = { + k8s_version = "1.30" +}