ci: FE-133 Configure non agent slurm/pbs tests to skip without explicitly listing test names in circleci. (#977)

* [ALLGCP] fix: trying to figure out home issue. updated docs

* [ALLGCP] fix: removed temp changes

* [ALLGCP] fix: trying to figure out home issue. updated docs

* [ALLGCP] fix: removed temp changes

* [ALLGCP] fix: merge with pending_reason test

* [ALLGCP] fix: merge error

* [ALLGCP] ci: added the agent skip to pending reason test and removed flag from pytest

* [ALLGCP] added skip flag back into znode agent test

* [ALLGCP] ci: moved skip method to experiment.py. Added more detailed error message

* [ALLGCP] chore: changed method name to be consistent with rest

[e2e_tests changes only]
CharlesTran1 authored and rb-determined-ai committed Nov 2, 2023
1 parent 833c05c commit 8dcd856
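
The pattern this commit introduces, as a rough sketch: instead of excluding the agent-incompatible slurm/pbs tests by name in the CircleCI config, each test calls a new skip helper at runtime and decides for itself whether to run. The test below is hypothetical and only illustrates the shape of the change; the test name and import path are assumptions based on the imports shown in the diff that follows.

import pytest

from tests.cluster import test_slurm


@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
def test_hpc_launcher_specific_behavior() -> None:
    # Self-skips on agent-based clusters, so CircleCI no longer needs to
    # list this test by name to keep it out of agent runs.
    test_slurm.skip_if_not_hpc_scheduler()
    # ... HPC-launcher-specific assertions would go here ...
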
Showing 2 changed files with 37 additions and 0 deletions.
31 changes: 31 additions & 0 deletions e2e_tests/tests/cluster/test_slurm.py
@@ -4,11 +4,33 @@
import pytest
import torch

from determined.common.api import bindings
from determined.common.api.bindings import experimentv1State, trialv1State
from tests import api_utils
from tests import config as conf
from tests import experiment as exp


# Queries the Determined master for resource pool information to determine whether the agent is used.
# Currently we assume that all resource pools use the same scheduler type,
# which is why only the first resource pool's type is checked.
def skip_if_not_hpc_scheduler() -> None:
    sess = api_utils.determined_test_session()
    resourcePool = bindings.get_GetResourcePools(sess).resourcePools
    if resourcePool:
        schedulerType = resourcePool[0].schedulerType
    else:
        pytest.fail("ERROR: Resource Pool returned no value. Make sure the resource pool is set.")
    if (
        schedulerType != bindings.v1SchedulerType.SLURM
        and schedulerType != bindings.v1SchedulerType.PBS
    ):
        errorMessage = "Agent is not compatible with the test. Scheduler type: " + str(
            schedulerType
        )
        pytest.skip(errorMessage)
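
When debugging why a test skipped or ran, it can help to see what scheduler type each resource pool actually reports. Below is a minimal sketch using the same bindings calls as the helper above; the snippet is illustrative only, is not part of this commit, and assumes the resource pool objects expose a name field.

from determined.common.api import bindings

from tests import api_utils


def print_scheduler_types() -> None:
    # Print each resource pool's scheduler type so you can tell whether
    # skip_if_not_hpc_scheduler() would skip tests on this cluster.
    sess = api_utils.determined_test_session()
    pools = bindings.get_GetResourcePools(sess).resourcePools or []
    for pool in pools:
        print(pool.name, pool.schedulerType)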


def run_failure_test_multiple(config_file: str, model_def_file: str, errors: List[str]) -> int:
    # Creates an experiment meant to fail and checks an array of error messages.
    # If one of the errors is present, then the assertion passes.
@@ -49,6 +71,11 @@ def test_unsupported_option() -> None:
    # run_failure_test expects the experiment to fail and will assert the log with the string.
    # Queries the logs for the error call.
    # Waits for the experiment to reach an ERROR_STATE. Errors if it does not.

    # This test is skipped when the determined agent is used.
    # The determined agent does not fail properly and ignores the bad option.
    skip_if_not_hpc_scheduler()

    exp.run_failure_test(
        conf.fixtures_path("failures/unsupported-slurm-option.yaml"),
        conf.fixtures_path("failures/"),
@@ -113,6 +140,10 @@ def test_bad_slurm_option() -> None:
    # Creates an experiment that uses an invalid slurm option.
    # Only casablanca displays the SBATCH options; horizon does not upon failure.
    # The line "SBATCH options:" is not present in horizon's output.

    # This test is skipped when the determined agent is used.
    # The determined agent does not fail properly and ignores the bad option.
    skip_if_not_hpc_scheduler()
    bad_option_helper("failures/bad-slurm-option.yaml", "failures/", "sbatch: unrecognized option")


6 changes: 6 additions & 0 deletions e2e_tests/tests/experiment/test_pending_hpc.py
@@ -8,6 +8,8 @@
from tests import config as conf
from tests import experiment as exp

from ..cluster import test_slurm


@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@@ -17,6 +19,10 @@ def test_hpc_job_pending_reason() -> None:
    # TotalAvailableGpuSlots:0 TotalNodes:1 TotalGpuSlots:0
    # TotalAvailableCPUSlots:8 TotalCPUSlots:8 Accelerator:}]

    # Currently, this test fails while using the determined agent.
    # The output is PBS or SLURM launcher specific.
    test_slurm.skip_if_not_hpc_scheduler()

    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
