From 188208370c3b1fe14302500913625f7d65a80258 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Tue, 10 Jan 2023 00:43:45 +0100
Subject: [PATCH] [CI] Deflake test_reserved_cpu_warnings (again) (#31535)

Attempt 2 at deflaking test_reserved_cpu_warnings. This time, we mock
ray.available_resources() (used in the warning logic) to always return
the cluster resources, i.e. the expected state. This removes the
stochasticity of ray.available_resources(), which may reflect leftover
tasks/actors that have not been cleaned up in time for the next fit()
call. The previous attempt used gc.collect(), but that did not solve
the problem entirely.

This is a blanket mock of ray.available_resources. At the moment, this
function is only used in Tune for the warning logic. A safer approach
would be to mock it just in the
TunerInternal._maybe_warn_resource_contention method, but that would
introduce a lot more patching code here.

Signed-off-by: Antoni Baum
---
 python/ray/train/tests/test_base_trainer.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/ray/train/tests/test_base_trainer.py b/python/ray/train/tests/test_base_trainer.py
index 4d2b29199a31..f864f1a47924 100644
--- a/python/ray/train/tests/test_base_trainer.py
+++ b/python/ray/train/tests/test_base_trainer.py
@@ -1,4 +1,3 @@
-import gc
 import io
 import logging
 import os
@@ -236,10 +235,12 @@ def train_loop(self):
     tune.run(trainer.as_trainable(), num_samples=4)
 
 
+@patch("ray.available_resources", ray.cluster_resources)
 def test_reserved_cpu_warnings(ray_start_4_cpus, mock_tuner_internal_logger):
-    # We use gc.collect in this test to ensure that all
-    # Ray actors & tasks are terminated in between .fit() calls,
-    # as the warning condition checks for available Ray resources.
+    # ray.available_resources() is used in the warning logic.
+    # We mock it as it can be stochastic due to garbage collection etc.
+    # The aim of this test is not to check if ray.available_resources()
+    # works correctly, but to test the warning logic.
     def train_loop(config):
         pass
 
@@ -251,7 +252,6 @@ def train_loop(config):
         datasets={"train": ray.data.range(10)},
     )
     trainer.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # No datasets, no fraction.
@@ -260,7 +260,6 @@ def train_loop(config):
         scaling_config=ScalingConfig(num_workers=1),
    )
     trainer.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # Should warn.
@@ -270,7 +269,6 @@ def train_loop(config):
         datasets={"train": ray.data.range(10)},
     )
     trainer.fit()
-    gc.collect()
     assert (
         len(mock_tuner_internal_logger.warnings) == 1
     ), mock_tuner_internal_logger.warnings
@@ -285,7 +283,6 @@ def train_loop(config):
     )
     tuner = tune.Tuner(trainer, tune_config=tune.TuneConfig(num_samples=3))
     tuner.fit()
-    gc.collect()
     assert (
         len(mock_tuner_internal_logger.warnings) == 1
     ), mock_tuner_internal_logger.warnings
@@ -300,7 +297,6 @@ def train_loop(config):
     )
     tuner = tune.Tuner(trainer, tune_config=tune.TuneConfig(num_samples=3))
     tuner.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # Don't warn if Trainer is not used
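
Note for reviewers: below is a minimal, self-contained sketch of the mocking
technique the patch relies on. The maybe_warn_resource_contention function is
a hypothetical stand-in for TunerInternal._maybe_warn_resource_contention, not
Tune's actual warning logic; only the patch target string and the two ray
calls mirror the real change above.

    from unittest.mock import patch

    import ray

    ray.init(num_cpus=4)

    def maybe_warn_resource_contention() -> bool:
        # Hypothetical stand-in for the real warning check: report
        # contention when fewer CPUs are available than the cluster has.
        available = ray.available_resources().get("CPU", 0)
        total = ray.cluster_resources().get("CPU", 0)
        return available < total

    # Unpatched, leftover tasks/actors can make this check flaky.
    # Patched, ray.available_resources() *is* ray.cluster_resources(), so
    # available == total by construction and the check is deterministic.
    with patch("ray.available_resources", ray.cluster_resources):
        assert maybe_warn_resource_contention() is False

    ray.shutdown()

Because patch() is given a replacement object (rather than a MagicMock), every
call to ray.available_resources() inside the decorated test transparently
returns the full cluster resources, which is exactly the "expected state" the
commit message describes.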