[Core] Deflake test_advanced_9 (ray-project#34410)
Looks like the gcs server process doesn't go back to its original num_fds after the workers exit; it settles lower.

Output from my machine:

```
>> 222 # before starting worker procs
(A pid=28851) HELLO
['WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD', 'WORLD']
>> 250 # with worker procs
>> 217
>> 216
>> 213
>> 212
>> 207
>> 206 # after worker procs die
>> 206
>> 208 # not sure why it goes up again
>> 208 # remains at 208, so the old check times out
```
This PR deflakes the test, but I don't know enough about the gcs server to say whether this is a good fix or not.
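For context, the test's `get_gcs_num_of_connections` helper reports how many file descriptors the GCS server process holds. A minimal sketch of that kind of measurement on Linux (using a hypothetical `count_open_fds` helper, not the test's actual code) just lists `/proc/<pid>/fd`:

```python
import os

def count_open_fds(pid: int) -> int:
    """Return the number of file descriptors `pid` currently holds (Linux only)."""
    return len(os.listdir(f"/proc/{pid}/fd"))

# Example: measure this process's own descriptor count.
print(count_open_fds(os.getpid()))
```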

Signed-off-by: elliottower <[email protected]>
cadedaniel authored and elliottower committed Apr 22, 2023
1 parent 0aa2ee8 commit ca64a29
1 changed file with 11 additions and 11 deletions: python/ray/tests/test_advanced_9.py
```diff
@@ -280,7 +280,7 @@ def get_gcs_num_of_connections():
 
     time.sleep(10)
 
-    curr_fds = get_gcs_num_of_connections()
+    fds_without_workers = get_gcs_num_of_connections()
 
     @ray.remote
     class A:
@@ -289,27 +289,27 @@ def ready(self):
             return "WORLD"
 
     num_of_actors = 10
-    a = [A.remote() for _ in range(num_of_actors)]
-    print(ray.get([t.ready.remote() for t in a]))
+    actors = [A.remote() for _ in range(num_of_actors)]
+    print(ray.get([t.ready.remote() for t in actors]))
 
-    # Kill the actor
-    del a
+    # Kill the actors
+    del actors
 
-    # TODO(clarng): remove this once prestart works with actors.
-    # ray_start_cluster defaults to one cpu, which prestarts one worker.
-    FD_PER_WORKER = 2
     # Make sure the # of fds opened by the GCS dropped.
-    wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER == curr_fds)
+    # This assumes worker processes are not created after the actor worker
+    # processes die.
+    wait_for_condition(lambda: get_gcs_num_of_connections() <= fds_without_workers)
+    num_fds_after_workers_die = get_gcs_num_of_connections()
 
     n = cluster.add_node(wait=True)
 
     # Make sure the # of fds opened by the GCS increased.
-    wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER > curr_fds)
+    wait_for_condition(lambda: get_gcs_num_of_connections() > num_fds_after_workers_die)
 
     cluster.remove_node(n)
 
     # Make sure the # of fds opened by the GCS dropped.
-    wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER == curr_fds)
+    wait_for_condition(lambda: get_gcs_num_of_connections() <= fds_without_workers)
 
 
 @pytest.mark.parametrize(
```
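Ray's `wait_for_condition` test utility polls a predicate until a timeout. The sketch below is a minimal stand-in for it (not Ray's implementation) showing why the fix swaps the exact check (`count + FD_PER_WORKER == baseline`) for an upper bound: when the fd count settles below its starting value, equality never holds and the old check times out, while `<=` passes as soon as the workers are gone.

```python
import time

def wait_for_condition(condition, timeout=10, retry_interval=0.1):
    """Poll `condition` until it returns True; raise TimeoutError after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition():
            return
        time.sleep(retry_interval)
    raise TimeoutError("condition was not met before the timeout")

# Simulated fd counts like the ones in the commit message: the count never
# returns to the 222 baseline, so an exact-equality check would poll forever,
# but the upper-bound check succeeds once the worker procs die.
samples = iter([250, 217, 212, 206])
baseline = 222
wait_for_condition(lambda: next(samples, 206) <= baseline, timeout=5)
print("deflaked check passed")
```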
