From 188208370c3b1fe14302500913625f7d65a80258 Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Tue, 10 Jan 2023 00:43:45 +0100
Subject: [PATCH] [CI] Deflake test_reserved_cpu_warnings (again) (#31535)

Attempt 2 at deflaking test_reserved_cpu_warnings. This time, we mock
ray.available_resources() (used in the warning logic) to always return
the cluster resources, i.e. the expected state. This removes the
stochasticity of ray.available_resources(), which may reflect leftover
tasks/actors that have not been cleaned up in time for the next fit()
call. The previous attempt used gc.collect(), but that did not solve
the problem entirely.

This is a blanket mock of ray.available_resources. At the moment, this
function is only used in Tune for the warning logic. A safer approach
would be to mock it just in the
TunerInternal._maybe_warn_resource_contention method, but that would
introduce a lot more patching code here.

Signed-off-by: Antoni Baum
---
 python/ray/train/tests/test_base_trainer.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/ray/train/tests/test_base_trainer.py b/python/ray/train/tests/test_base_trainer.py
index 4d2b29199a31..f864f1a47924 100644
--- a/python/ray/train/tests/test_base_trainer.py
+++ b/python/ray/train/tests/test_base_trainer.py
@@ -1,4 +1,3 @@
-import gc
 import io
 import logging
 import os
@@ -236,10 +235,12 @@ def train_loop(self):
     tune.run(trainer.as_trainable(), num_samples=4)
 
 
+@patch("ray.available_resources", ray.cluster_resources)
 def test_reserved_cpu_warnings(ray_start_4_cpus, mock_tuner_internal_logger):
-    # We use gc.collect in this test to ensure that all
-    # Ray actors & tasks are terminated in between .fit() calls,
-    # as the warning condition checks for available Ray resources.
+    # ray.available_resources() is used in the warning logic.
+    # We mock it as it can be stochastic due to garbage collection etc.
+    # The aim of this test is not to check if ray.available_resources()
+    # works correctly, but to test the warning logic.
     def train_loop(config):
         pass
 
@@ -251,7 +252,6 @@ def train_loop(config):
         datasets={"train": ray.data.range(10)},
     )
     trainer.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # No datasets, no fraction.
@@ -260,7 +260,6 @@ def train_loop(config):
         scaling_config=ScalingConfig(num_workers=1),
    )
     trainer.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # Should warn.
@@ -270,7 +269,6 @@ def train_loop(config):
         datasets={"train": ray.data.range(10)},
     )
     trainer.fit()
-    gc.collect()
     assert (
         len(mock_tuner_internal_logger.warnings) == 1
     ), mock_tuner_internal_logger.warnings
@@ -285,7 +283,6 @@ def train_loop(config):
     )
     tuner = tune.Tuner(trainer, tune_config=tune.TuneConfig(num_samples=3))
     tuner.fit()
-    gc.collect()
     assert (
         len(mock_tuner_internal_logger.warnings) == 1
     ), mock_tuner_internal_logger.warnings
@@ -300,7 +297,6 @@ def train_loop(config):
     )
     tuner = tune.Tuner(trainer, tune_config=tune.TuneConfig(num_samples=3))
     tuner.fit()
-    gc.collect()
     assert not mock_tuner_internal_logger.warnings
 
     # Don't warn if Trainer is not used
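
Note for reviewers: below is a minimal, self-contained sketch of the mocking
technique the patch relies on. The maybe_warn_resource_contention function is
a hypothetical stand-in for TunerInternal._maybe_warn_resource_contention, not
Tune's actual warning logic; only the patch target string and the two ray
calls mirror the real change above.

    from unittest.mock import patch

    import ray

    ray.init(num_cpus=4)

    def maybe_warn_resource_contention() -> bool:
        # Hypothetical stand-in for the real warning check: report
        # contention when fewer CPUs are available than the cluster has.
        available = ray.available_resources().get("CPU", 0)
        total = ray.cluster_resources().get("CPU", 0)
        return available < total

    # Unpatched, leftover tasks/actors can make this check flaky.
    # Patched, ray.available_resources() *is* ray.cluster_resources(), so
    # available == total by construction and the check is deterministic.
    with patch("ray.available_resources", ray.cluster_resources):
        assert maybe_warn_resource_contention() is False

    ray.shutdown()

Because patch() is given a replacement object (rather than a MagicMock), every
call to ray.available_resources() inside the decorated test transparently
returns the full cluster resources, which is exactly the "expected state" the
commit message describes.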