Skip EFA tests for PT 1.13 (#3908)

* Skip EFA tests for PT 1.13
* formatting
* fix comment
* use pt201 and above fixture
* test 1.13
* revert toml
* revert toml
* revert gdrcopy skip
* revert unused efa import
* revert gdrcopy skip
* test sagemaker only fixture
* fix comments
* revert toml
* split ec2 sm buildspec
* split ec2 sm
* add sm buildspec

---------

Co-authored-by: arjkesh <[email protected]>
aws · May 13, 2024 · 4714fd1 · 4714fd1
1 parent 3a118b1
commit 4714fd1
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 31 deletions.
diff --git a/pytorch/training/buildspec-1-13.yml b/pytorch/training/buildspec-1-13-ec2.yml
@@ -62,36 +62,6 @@ images:
     target: ec2
     context:
       <<: *TRAINING_CONTEXT
-  BuildSageMakerCPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_CPU_TRAINING_PY3 false
-    image_size_baseline: 6200
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py39
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 14000
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py39
-    cuda_version: &CUDA_VERSION cu117
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-                         *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
   # BuildPyTorchExampleGPUTrainPy3DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false

diff --git a/pytorch/training/buildspec-1-13-sm.yml b/pytorch/training/buildspec-1-13-sm.yml
@@ -0,0 +1,64 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+framework: &FRAMEWORK pytorch
+version: &VERSION 1.13.1
+short_version: &SHORT_VERSION "1.13"
+arch_type: x86
+autopatch_build: "True"
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  training_context: &TRAINING_CONTEXT
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+
+images:
+  BuildSageMakerCPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_CPU_TRAINING_PY3 false
+    image_size_baseline: 6200
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py39
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildSageMakerGPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 14000
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py39
+    cuda_version: &CUDA_VERSION cu117
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+                         *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py
@@ -47,6 +47,8 @@
 )
 
 
+# NOTE: Skip EFA tests for PT 1.13, where EFA is not currently supported.
+@pytest.mark.usefixtures("pt201_and_above_only")
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.integration("efa")

diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py
@@ -54,6 +54,7 @@ def test_gdrcopy(
         pytest.skip(
             f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
         )
+    # NOTE: Skip GDRCopy tests for the PT 1.13 EC2 image until GDRCopy is installed in it.
     _, framework_version = test_utils.get_framework_and_version_from_tag(pytorch_training)
     framework_version = Version(framework_version)
     if test_utils.is_ec2_image(pytorch_training) and framework_version == Version("1.13.1"):

diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py
@@ -57,7 +57,7 @@ def can_run_pytorchddp(ecr_image):
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
-    with timeout(minutes=25):
+    with timeout(minutes=40):
         validate_or_skip_pytorchddp(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {