chore: remove e2e_slurm_preemption test series

The only test in this series was an e2e test sure that we caught slurm's SIGTERM, reported that to the master via an API call, and that training could would then see should_preempt() return True. This didn't need to be an e2e test; it can be a unit test to make sure our SIGTERM handler is working, plus rerouting the API logic to pass through the our normal preemption handling (#10071). There is already plenty of coverage ensuring should_preempt() works in python code. So by adding that unit test here and following the refactor of #10071, it is safe to remove the e2e_slurm_preemption series entirely. This is part of a larger effort to get rid of our znode tests, which are notoriously unreliable.
determined-ai · Oct 22, 2024 · 4523f18 · 4523f18
1 parent 1f2bea0
commit 4523f18
Show file tree

Hide file tree

Showing 9 changed files with 41 additions and 168 deletions.
diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
@@ -5351,14 +5351,6 @@ workflows:
             - package-and-push-system-local-ee
           extra-pytest-flags: "--no-compare-stats"
           collect-det-job-logs: false
-      - test-e2e-slurm:
-          name: test-e2e-slurm-preemption
-          context:
-            - dev-ci-cluster-default-user-credentials
-          mark: "e2e_slurm_preemption"
-          requires:
-            - package-and-push-system-local-ee
-          extra-pytest-flags: "--no-compare-stats"
       - test-e2e-slurm:
           name: test-e2e-slurm-znode
           context:

diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
@@ -21,7 +21,6 @@ markers =
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
     e2e_slurm_restart: slurm integration tests that require restarting the master
-    e2e_slurm_preemption: hpc integration test to ensure preemption is working
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
     test_oauth: end to end test for oauth client, add, remove in EE.
     test_model_registry_rbac: end to end test for RBAC model registry.

diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py
@@ -184,62 +184,6 @@ def test_mnist_pytorch_distributed() -> None:
     exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)
 
 
-# Test to ensure that determined is able to handle preemption gracefully when using dispatcher RM.
-# Preemption:
-# When users launch a set of experiments requesting different levels of priorities
-# and resources, and when there are a limited set of resources, high priority experiments can
-# cancel or requeue low priority experiments.
-# Preemption is dependent upon the underlying HPC system and the WLM (SLURM/PBS) setup.
-# Nodes in an HPC systems are typically divided into multiple partitions (logical grouping of
-# nodes into possibly overlapping sets) used for different purposes. Using WLMs sysadmins
-# typically assign varying levels of priority for each partition. Also, users can request the
-# WLM to provide specific partition and priority level for their jobs.
-# In the following test case we test an example preemption scenario. We launch the two experiments
-# iris_tf_keras_cancellable and iris_tf_keras_high_priority in order. Ensure that the
-# iris_tf_keras_cancellable experiment is requeued, iris_tf_keras_high_priority experiment
-# runs to completion. After that, iris_tf_keras_cancellable experiment is resumed and it runs
-# to completion.
-# NB: The clusters casablanca-login and znode have one node (8-GPUs) being used in two partitions:
-#   1. defq_GPU_cancellable - partition for low priority and jobs are requeued if necessary
-#   2. defq_GPU_hipri - partition for high priority non-cancellable jobs
-@pytest.mark.e2e_slurm_preemption
-@api_utils.skipif_not_slurm()
-def test_slurm_preemption() -> None:
-    sess = api_utils.user_session()
-    # Launch the iris_tf_keras_cancellable experiment requesting 8 GPUs on defq_GPU_cancellable
-    # partition
-    cancelable_exp_id = exp.create_experiment(
-        sess,
-        conf.cv_examples_path("iris_tf_keras/iris_tf_keras_cancelable.yaml"),
-        conf.cv_examples_path("iris_tf_keras"),
-        None,
-    )
-    # Wait for the first cancellable experiment to enter RUNNING state.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
-    # Wait for the first cancellable experiment to complete at least one checkpoint.
-    exp.wait_for_at_least_one_checkpoint(sess, cancelable_exp_id, 300)
-    # Launch the iris_tf_keras_high_priority experiment requesting 8 GPUs on defq_GPU_hipri
-    # partition
-    high_priority_exp_id = exp.create_experiment(
-        sess,
-        conf.cv_examples_path("iris_tf_keras/iris_tf_keras_high_priority.yaml"),
-        conf.cv_examples_path("iris_tf_keras"),
-        None,
-    )
-    # In this scenario, iris_tf_keras_high_priority experiment will cause the
-    # iris_tf_keras_cancelable experiment to get requeued. The experiment
-    # iris_tf_keras_high_priority will execute to completion.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.QUEUED)
-    exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.RUNNING)
-    exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.COMPLETED)
-    # Now, the experiment iris_tf_keras_cancelable will resume as soon as the requested
-    # resources are available.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
-    # Finally, the experiment iris_tf_keras_cancelable will complete if there are no other
-    # interruptions.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.COMPLETED)
-
-
 @pytest.mark.e2e_slurm
 @pytest.mark.e2e_pbs
 @api_utils.skipif_not_hpc()

diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py
@@ -35,7 +35,6 @@
     "e2e_saml",
     "e2e_slurm",
     "e2e_slurm_restart",
-    "e2e_slurm_preemption",
     "e2e_slurm_internet_connected_cluster",
     "det_deploy_local",
     "test_oauth",

diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml
diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml
diff --git a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_cancelable.yaml b/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_cancelable.yaml
diff --git a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_high_priority.yaml b/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_high_priority.yaml
diff --git a/harness/tests/launch/test_launch.py b/harness/tests/launch/test_launch.py
@@ -1,4 +1,7 @@
+import os
 import runpy
+import signal
+import time
 from typing import Any, Dict, List
 from unittest import mock
 
@@ -55,3 +58,41 @@ def test_launch_script(mock_validate_config: mock.MagicMock) -> None:
             with pytest.raises(SystemExit) as e:
                 runpy.run_module("determined.exec.launch", run_name="__main__", alter_sys=True)
             assert e.value.code == 1, e
+
+
+@mock.patch("subprocess.Popen")
+@mock.patch("determined.common.api.authentication.login_from_task")
+def test_launch_catches_slurm_preemption(
+    mock_login_from_task: mock.MagicMock,
+    mock_popen: mock.MagicMock,
+) -> None:
+    """
+    Determined's slurm preemption logic involves catching the SIGTERM sent by slurm, reporting it
+    to the master, then the python code will detect that through the normal should_preempt() logic.
+
+    Here, we need to test that the launch script actually catches SIGTERM and makes the right API
+    call to the master.
+    """
+
+    # Send a SIGTERM to this process during the Popen.wait().
+    def wait_side_effect() -> int:
+        os.kill(os.getpid(), signal.SIGTERM)
+        # Make any syscall at all to ensure the signal handler has a good chance to fire.
+        time.sleep(0.000001)
+        # SIGTERM handler should have fired, make sure the right API call was made.
+        mock_login_from_task.return_value.post.assert_called_once_with(
+            "/api/v1/allocations/allocationId/signals/pending_preemption"
+        )
+        # Return a unique value to make sure our mocks are wired up right.
+        return 789
+
+    mock_popen.return_value.wait.side_effect = wait_side_effect
+
+    old_handler = signal.getsignal(signal.SIGTERM)
+    try:
+        with test_util.set_env_vars({"DET_RESOURCES_TYPE": "slurm-job"}):
+            with test_util.set_mock_cluster_info(["0.0.0.1"], 0, 1):
+                config = {"entrypoint": "yo"}
+                assert launch.launch(det.ExperimentConfig(config)) == 789
+    finally:
+        signal.signal(signal.SIGTERM, old_handler)