From 4714fd1c8a203cd75d1b50fb60d16a71376780de Mon Sep 17 00:00:00 2001 From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> Date: Mon, 13 May 2024 15:10:57 -0700 Subject: [PATCH] Skip EFA tests for PT 1.13 (#3908) * Skip EFA tests for PT 1.13 * formatting * fix comment * use pt201 and above fixture * test 1.13 * revert toml * revert toml * revert gdrcopy skip * revert unused efa import * revert gdrcopy skip * test sagemaker only fixture * fix comments * revert toml * split ec2 sm buildspec * split ec2 sm * add sm buildspec --------- Co-authored-by: arjkesh <33526713+arjkesh@users.noreply.github.com> --- ...ldspec-1-13.yml => buildspec-1-13-ec2.yml} | 30 --------- pytorch/training/buildspec-1-13-sm.yml | 64 +++++++++++++++++++ test/dlc_tests/ec2/test_efa.py | 2 + test/dlc_tests/ec2/test_gdrcopy.py | 1 + .../integration/sagemaker/test_pytorchddp.py | 2 +- 5 files changed, 68 insertions(+), 31 deletions(-) rename pytorch/training/{buildspec-1-13.yml => buildspec-1-13-ec2.yml} (70%) create mode 100644 pytorch/training/buildspec-1-13-sm.yml diff --git a/pytorch/training/buildspec-1-13.yml b/pytorch/training/buildspec-1-13-ec2.yml similarity index 70% rename from pytorch/training/buildspec-1-13.yml rename to pytorch/training/buildspec-1-13-ec2.yml index 0c18bc19c22c..609656fd240f 100644 --- a/pytorch/training/buildspec-1-13.yml +++ b/pytorch/training/buildspec-1-13-ec2.yml @@ -62,36 +62,6 @@ images: target: ec2 context: <<: *TRAINING_CONTEXT - BuildSageMakerCPUPTTrainPy3DockerImage: - <<: *TRAINING_REPOSITORY - build: &PYTORCH_CPU_TRAINING_PY3 false - image_size_baseline: 6200 - device_type: &DEVICE_TYPE cpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py39 - os_version: &OS_VERSION ubuntu20.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] - latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] - 
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] - target: sagemaker - context: - <<: *TRAINING_CONTEXT - BuildSageMakerGPUPTTrainPy3DockerImage: - <<: *TRAINING_REPOSITORY - build: &PYTORCH_GPU_TRAINING_PY3 false - image_size_baseline: 14000 - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py39 - cuda_version: &CUDA_VERSION cu117 - os_version: &OS_VERSION ubuntu20.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., - *DEVICE_TYPE ] - target: sagemaker - context: - <<: *TRAINING_CONTEXT # BuildPyTorchExampleGPUTrainPy3DockerImage: # <<: *TRAINING_REPOSITORY # build: &PYTORCH_GPU_TRAINING_PY3 false diff --git a/pytorch/training/buildspec-1-13-sm.yml b/pytorch/training/buildspec-1-13-sm.yml new file mode 100644 index 000000000000..cffcf8673606 --- /dev/null +++ b/pytorch/training/buildspec-1-13-sm.yml @@ -0,0 +1,64 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 1.13.1 +short_version: &SHORT_VERSION "1.13" +arch_type: x86 +autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY 
!join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 6200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py39 + os_version: &OS_VERSION ubuntu20.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 14000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py39 + cuda_version: &CUDA_VERSION cu117 + os_version: &OS_VERSION ubuntu20.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + 
context: + <<: *TRAINING_CONTEXT diff --git a/test/dlc_tests/ec2/test_efa.py b/test/dlc_tests/ec2/test_efa.py index 28e5d97dbacb..06ab3280ca71 100644 --- a/test/dlc_tests/ec2/test_efa.py +++ b/test/dlc_tests/ec2/test_efa.py @@ -47,6 +47,8 @@ ) +# NOTE: Skip EFA for PT1.13 since it is not currently supported +@pytest.mark.usefixtures("pt201_and_above_only") @pytest.mark.processor("gpu") @pytest.mark.model("N/A") @pytest.mark.integration("efa") diff --git a/test/dlc_tests/ec2/test_gdrcopy.py b/test/dlc_tests/ec2/test_gdrcopy.py index a1160c7fde3f..8a7e3d174930 100644 --- a/test/dlc_tests/ec2/test_gdrcopy.py +++ b/test/dlc_tests/ec2/test_gdrcopy.py @@ -54,6 +54,7 @@ def test_gdrcopy( pytest.skip( f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" ) + # NOTE: Skip PT 1.13 EC2 GDRCopy tests until it is installed. _, framework_version = test_utils.get_framework_and_version_from_tag(pytorch_training) framework_version = Version(framework_version) if test_utils.is_ec2_image(pytorch_training) and framework_version == Version("1.13.1"): diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py index e5b80b33587a..9f69a3cc24b7 100644 --- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py +++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py @@ -57,7 +57,7 @@ def can_run_pytorchddp(ecr_image): def test_pytorchddp_throughput_gpu( framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir ): - with timeout(minutes=25): + with timeout(minutes=40): validate_or_skip_pytorchddp(ecr_image) distribution = {"pytorchddp": {"enabled": True}} estimator_parameter = {