From 9316ce7475f50cf4e695730155420c38602c99ba Mon Sep 17 00:00:00 2001 From: Yi Cheng <74173148+iycheng@users.noreply.github.com> Date: Wed, 10 May 2023 22:33:18 -0700 Subject: [PATCH] [core] Deflakey test advanced 9 (#35247) Previously, a bug where pubsub caused an FD leak was fixed in #33311. However, that fix has a race condition, which was triggered later by subsequent code changes. The test is flaky because there is a race between the raylet sending the node failure and the core worker itself exiting. When the disconnect is sent to the raylet, the raylet starts to report the worker failure, but the worker still continues to run. GCS uses the worker failure to close the connection, but if the worker is still alive, it might send another request to GCS, which leads to the FD leak. Compared with #34883, this is a short-term fix; the goal is to make the behavior the same as in 2.3. --- src/ray/core_worker/core_worker.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index ff03b5b85508..5f1b1c3ea9d9 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -790,8 +790,12 @@ void CoreWorker::Exit( detail = std::move(detail), creation_task_exception_pb_bytes]() { rpc::DrainServerCallExecutor(); - Disconnect(exit_type, detail, creation_task_exception_pb_bytes); KillChildProcs(); + // Disconnect should be put close to Shutdown + // https://github.com/ray-project/ray/pull/34883 + // TODO (iycheng) Improve the Process.h and make it able to monitor + // process liveness + Disconnect(exit_type, detail, creation_task_exception_pb_bytes); Shutdown(); }, "CoreWorker.Shutdown"); @@ -835,9 +839,13 @@ void CoreWorker::ForceExit(const rpc::WorkerExitType exit_type, const std::string &detail) { RAY_LOG(WARNING) << "Force exit the process. 
" << " Details: " << detail; - Disconnect(exit_type, detail); KillChildProcs(); + // Disconnect should be put close to Exit + // https://github.com/ray-project/ray/pull/34883 + // TODO (iycheng) Improve the Process.h and make it able to monitor + // process liveness + Disconnect(exit_type, detail); // NOTE(hchen): Use `QuickExit()` to force-exit this process without doing cleanup. // `exit()` will destruct static objects in an incorrect order, which will lead to