From 4714fd1c8a203cd75d1b50fb60d16a71376780de Mon Sep 17 00:00:00 2001
From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com>
Date: Mon, 13 May 2024 15:10:57 -0700
Subject: [PATCH] Skip EFA tests for PT 1.13 (#3908)

* Skip EFA tests for PT 1.13

* formatting

* fix comment

* use pt201 and above fixture

* test 1.13

* revert toml

* revert toml

* revert gdrcopy skip

* revert unused efa import

* revert gdrcopy skip

* test sagemaker only fixture

* fix comments

* revert toml

* split ec2 sm buildspec

* split ec2 sm

* add sm buildspec

---------

Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com>
---
 ...ldspec-1-13.yml => buildspec-1-13-ec2.yml} | 30 ---------
 pytorch/training/buildspec-1-13-sm.yml        | 64 +++++++++++++++++++
 test/dlc_tests/ec2/test_efa.py                |  2 +
 test/dlc_tests/ec2/test_gdrcopy.py            |  1 +
 .../integration/sagemaker/test_pytorchddp.py  |  2 +-
 5 files changed, 68 insertions(+), 31 deletions(-)
 rename pytorch/training/{buildspec-1-13.yml => buildspec-1-13-ec2.yml} (70%)
 create mode 100644 pytorch/training/buildspec-1-13-sm.yml

diff --git a/pytorch/training/buildspec-1-13.yml b/pytorch/training/buildspec-1-13-ec2.yml
similarity index 70%
rename from pytorch/training/buildspec-1-13.yml
rename to pytorch/training/buildspec-1-13-ec2.yml
index 0c18bc19c22c..609656fd240f 100644
--- a/pytorch/training/buildspec-1-13.yml
+++ b/pytorch/training/buildspec-1-13-ec2.yml
@@ -62,36 +62,6 @@ images:
     target: ec2
     context:
       <<: *TRAINING_CONTEXT
-  BuildSageMakerCPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_CPU_TRAINING_PY3 false
-    image_size_baseline: 6200
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py39
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 14000
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py39
-    cuda_version: &CUDA_VERSION cu117
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-                         *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
   # BuildPyTorchExampleGPUTrainPy3DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false
diff --git a/pytorch/training/buildspec-1-13-sm.yml b/pytorch/training/buildspec-1-13-sm.yml
new file mode 100644
index 000000000000..cffcf8673606
--- /dev/null
+++ b/pytorch/training/buildspec-1-13-sm.yml
@@ -0,0 +1,64 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+framework: &FRAMEWORK pytorch
+version: &VERSION 1.13.1
+short_version: &SHORT_VERSION "1.13"
+arch_type: x86
+autopatch_build: "True"
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  training_context: &TRAINING_CONTEXT
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+
+images:
+  BuildSageMakerCPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_CPU_TRAINING_PY3 false
+    image_size_baseline: 6200
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py39
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildSageMakerGPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 14000
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py39
+    cuda_version: &CUDA_VERSION cu117
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+                         *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py
index 28e5d97dbacb..06ab3280ca71 100644
--- a/test/dlc_tests/ec2/test_efa.py
+++ b/test/dlc_tests/ec2/test_efa.py
@@ -47,6 +47,8 @@
 )
 
 
+# NOTE: Skip EFA tests for PT 1.13 since EFA is not currently supported on that version.
+@pytest.mark.usefixtures("pt201_and_above_only")
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.integration("efa")
diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py
index a1160c7fde3f..8a7e3d174930 100644
--- a/test/dlc_tests/ec2/test_gdrcopy.py
+++ b/test/dlc_tests/ec2/test_gdrcopy.py
@@ -54,6 +54,7 @@ def test_gdrcopy(
         pytest.skip(
             f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
         )
+    # NOTE: Skip PT 1.13 EC2 GDRCopy tests until GDRCopy is installed.
     _, framework_version = test_utils.get_framework_and_version_from_tag(pytorch_training)
     framework_version = Version(framework_version)
     if test_utils.is_ec2_image(pytorch_training) and framework_version == Version("1.13.1"):
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py
index e5b80b33587a..9f69a3cc24b7 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py
@@ -57,7 +57,7 @@ def can_run_pytorchddp(ecr_image):
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
-    with timeout(minutes=25):
+    with timeout(minutes=40):
         validate_or_skip_pytorchddp(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {