Skip to content

Commit

Permalink
Skip EFA tests for PT 1.13 (#3908)
Browse files Browse the repository at this point in the history
* Skip EFA tests for PT 1.13

* formatting

* fix comment

* use pt201 and above fixture

* test 1.13

* revert toml

* revert toml

* revert gdrcopy skip

* revert unused efa import

* revert gdrcopy skip

* test sagemaker only fixture

* fix comments

* revert toml

* split ec2 sm buildspec

* split ec2 sm

* add sm buildspec

---------

Co-authored-by: arjkesh <[email protected]>
  • Loading branch information
sirutBuasai and arjkesh authored May 13, 2024
1 parent 3a118b1 commit 4714fd1
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,36 +62,6 @@ images:
target: ec2
context:
<<: *TRAINING_CONTEXT
# SageMaker CPU training image entry: device_type cpu, Python tag py39, target sagemaker.
BuildSageMakerCPUPTTrainPy3DockerImage:
# Merge in the mapping anchored as TRAINING_REPOSITORY (anchor is defined outside this hunk).
<<: *TRAINING_REPOSITORY
# Anchored flag, currently false.
# NOTE(review): presumably toggles whether this image is built — confirm with the consumer.
build: &PYTORCH_CPU_TRAINING_PY3 false
# NOTE(review): units of the size baseline are not shown here — confirm.
image_size_baseline: 6200
device_type: &DEVICE_TYPE cpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py39
os_version: &OS_VERSION ubuntu20.04
# tag and latest_release_tag are built from identical !join item lists.
# NOTE(review): !join is a custom tag; presumably it concatenates its items — confirm.
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# Dockerfile path assembled from the short version, python_version and device_type aliases.
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
context:
# Merge in the mapping anchored as TRAINING_CONTEXT (anchor is defined outside this hunk).
<<: *TRAINING_CONTEXT
# SageMaker GPU training image entry: device_type gpu, Python tag py39, CUDA tag cu117, target sagemaker.
BuildSageMakerGPUPTTrainPy3DockerImage:
# Merge in the mapping anchored as TRAINING_REPOSITORY (anchor is defined outside this hunk).
<<: *TRAINING_REPOSITORY
# Anchored flag, currently false.
# NOTE(review): presumably toggles whether this image is built — confirm with the consumer.
build: &PYTORCH_GPU_TRAINING_PY3 false
# NOTE(review): units of the size baseline are not shown here — confirm.
image_size_baseline: 14000
# The anchors below reuse names already declared by the CPU entry; aliases that
# follow resolve to these most recent definitions.
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py39
cuda_version: &CUDA_VERSION cu117
os_version: &OS_VERSION ubuntu20.04
# Same item list as the CPU tag, with the CUDA version inserted before the OS version.
# NOTE(review): !join is a custom tag; presumably it concatenates its items — confirm.
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# Dockerfile path includes a CUDA-version path segment; the flow sequence continues on the next line.
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: sagemaker
context:
# Merge in the mapping anchored as TRAINING_CONTEXT (anchor is defined outside this hunk).
<<: *TRAINING_CONTEXT
# BuildPyTorchExampleGPUTrainPy3DockerImage:
# <<: *TRAINING_REPOSITORY
# build: &PYTORCH_GPU_TRAINING_PY3 false
Expand Down
64 changes: 64 additions & 0 deletions pytorch/training/buildspec-1-13-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Buildspec for the PyTorch 1.13 SageMaker training images (CPU and GPU).
#
# NOTE(review): `!join` is a custom tag resolved by the consuming loader; it
# presumably concatenates the listed items into one string — confirm.
# NOTE(review): values of the form <set-$NAME-in-environment> look like
# placeholders that are overridden from the environment at load time — confirm.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 1.13.1
# Quoted so the short version stays the string "1.13" rather than a float.
short_version: &SHORT_VERSION "1.13"
arch_type: x86
# Kept as the quoted string "True" (not a YAML boolean), exactly as authored.
autopatch_build: "True"

# Repository coordinates shared by every image entry through the
# TRAINING_REPOSITORY anchor.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Source/target file pairs shared by every image entry through the
# TRAINING_CONTEXT anchor.
context:
  training_context: &TRAINING_CONTEXT
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

images:
  # SageMaker CPU training image.
  BuildSageMakerCPUPTTrainPy3DockerImage:
    # Shallow-merge the shared repository settings; keys written explicitly
    # below take precedence over merged ones.
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 6200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py39
    os_version: &OS_VERSION ubuntu20.04
    # tag and latest_release_tag are built from identical item lists.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
  # SageMaker GPU training image.
  BuildSageMakerGPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 14000
    # These anchors reuse names declared by the CPU entry above; aliases that
    # follow resolve to the most recent definition, i.e. the GPU values.
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py39
    cuda_version: &CUDA_VERSION cu117
    os_version: &OS_VERSION ubuntu20.04
    # Same layout as the CPU tag, with the CUDA version inserted before the OS version.
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # GPU Dockerfiles live under an extra CUDA-version path segment.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
      *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
2 changes: 2 additions & 0 deletions test/dlc_tests/ec2/test_efa.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
)


# NOTE: EFA is not currently supported on PT 1.13, so these tests run only on PT 2.0.1 and above.
@pytest.mark.usefixtures("pt201_and_above_only")
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.integration("efa")
Expand Down
1 change: 1 addition & 0 deletions test/dlc_tests/ec2/test_gdrcopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def test_gdrcopy(
pytest.skip(
f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
)
# NOTE: Skip GDRCopy tests on PT 1.13 EC2 images until GDRCopy is installed in them.
_, framework_version = test_utils.get_framework_and_version_from_tag(pytorch_training)
framework_version = Version(framework_version)
if test_utils.is_ec2_image(pytorch_training) and framework_version == Version("1.13.1"):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def can_run_pytorchddp(ecr_image):
def test_pytorchddp_throughput_gpu(
framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
):
with timeout(minutes=25):
with timeout(minutes=40):
validate_or_skip_pytorchddp(ecr_image)
distribution = {"pytorchddp": {"enabled": True}}
estimator_parameter = {
Expand Down

0 comments on commit 4714fd1

Please sign in to comment.