-
Notifications
You must be signed in to change notification settings - Fork 11.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Make a dedicated thread for the RPC server #111210
Conversation
Summary: Make a separate thread to run the server when we launch. This is required by CUDA, which you can force with `export CUDA_LAUNCH_BLOCKING=1`. I figured I might as well be consistent and do it for the AMD implementation as well even though I believe it's not necessary.
@llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) Changes — Summary: Full diff: https://github.com/llvm/llvm-project/pull/111210.diff 2 Files Affected:
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index ca13414519d4ca..cb81a866622f93 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -28,9 +28,11 @@
#include "hsa/hsa_ext_amd.h"
#endif
+#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <thread>
#include <tuple>
#include <utility>
@@ -289,18 +291,26 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
__atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+ std::atomic<bool> finished = false;
+ std::thread server(
+ [](std::atomic<bool> *finished, rpc_device_t device) {
+ while (!*finished) {
+ if (rpc_status_t err = rpc_handle_server(device))
+ handle_error(err);
+ }
+ },
+ &finished, device);
+
// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
- while (hsa_signal_wait_scacquire(
- packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
- /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
+ while (hsa_signal_wait_scacquire(packet->completion_signal,
+ HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+ HSA_WAIT_STATE_BLOCKED) != 0)
+ ;
- // Handle the server one more time in case the kernel exited with a pending
- // send still in flight.
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
+ finished = true;
+ if (server.joinable())
+ server.join();
// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
diff --git a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
index 1b210b9e7a896f..58e5e5f04d0a70 100644
--- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -20,10 +20,12 @@
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
+#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <thread>
#include <vector>
using namespace llvm;
@@ -224,6 +226,16 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
if (print_resource_usage)
print_kernel_resources(binary, kernel_name);
+ std::atomic<bool> finished = false;
+ std::thread server(
+ [](std::atomic<bool> *finished, rpc_device_t device) {
+ while (!*finished) {
+ if (rpc_status_t err = rpc_handle_server(device))
+ handle_error(err);
+ }
+ },
+ &finished, rpc_device);
+
// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
@@ -231,17 +243,13 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
- if (rpc_status_t err = rpc_handle_server(rpc_device))
- handle_error(err);
-
- // Handle the server one more time in case the kernel exited with a pending
- // send still in flight.
- if (rpc_status_t err = rpc_handle_server(rpc_device))
+ if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
+ finished = true;
+ if (server.joinable())
+ server.join();
+
return CUDA_SUCCESS;
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes look reasonable to me
Summary:
Make a separate thread to run the server when we launch. This is
required by CUDA, which you can force with
export CUDA_LAUNCH_BLOCKING=1
. I figured I might as well be consistent and do it for the AMD implementation as well even though I believe it's not
necessary.