Skip to content

Commit

Permalink
fix: DET-9483 successfully run e2e_slurm_preemption tests as part of …
Browse files Browse the repository at this point in the history
…nightly workflow (#903)

[e2e_tests changes only]
  • Loading branch information
jagadeesh545 authored and rb-determined-ai committed Nov 2, 2023
1 parent 4f6277d commit 6b34e0d
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ markers =
nightly_quarantine: nightly tests (quarantine)
distributed_quarantine: distributed training tests (quarantine)
det_deploy_local_quarantine: test det deploy local (quarantine)
e2e_slurm_preemption_quarantine: hpc integration test to ensure preemption is working (quarantine)


junit_logging = all
Expand Down
4 changes: 3 additions & 1 deletion e2e_tests/tests/cluster/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def test_cifar10_pytorch_distributed() -> None:
# NB: The clusters casablanca-login and znode have one node (8 GPUs) that is used in two partitions:
# 1. defq_GPU_cancellable - low-priority partition whose jobs are requeued if necessary
# 2. defq_GPU_hipri - high-priority partition for non-cancellable jobs
@pytest.mark.e2e_slurm_preemption_quarantine
@pytest.mark.e2e_slurm_preemption
def test_slurm_preemption() -> None:
# Launch the cifar10_pytorch_cancellable experiment requesting 8 GPUs on defq_GPU_cancellable
# partition
Expand All @@ -183,6 +183,8 @@ def test_slurm_preemption() -> None:
conf.cv_examples_path("cifar10_pytorch"),
None,
)
# Wait for the first cancellable experiment to enter RUNNING state.
exp.wait_for_experiment_state(cancelable_exp_id, experimentv1State.RUNNING)
# Wait for the first cancellable experiment to complete at least one checkpoint.
exp.wait_for_at_least_one_checkpoint(cancelable_exp_id, 300)
# Launch the cifar10_pytorch_high_priority experiment requesting 8 GPUs on defq_GPU_hipri
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
"nightly_quarantine",
"distributed_quarantine",
"det_deploy_local_quarantine",
"e2e_slurm_preemption_quarantine",
}


Expand Down

0 comments on commit 6b34e0d

Please sign in to comment.