-
Notifications
You must be signed in to change notification settings - Fork 11.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Make a dedicated thread for the RPC server #111210
Conversation
Summary: Make a separate thread to run the server when we launch. This is required by CUDA, which you can force with `export CUDA_LAUNCH_BLOCKING=1`. I figured I might as well be consistent and do it for the AMD implementation as well even though I believe it's not necessary.
@llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) Changes — Summary: Full diff: https://github.com/llvm/llvm-project/pull/111210.diff 2 Files Affected:
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index ca13414519d4ca..cb81a866622f93 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -28,9 +28,11 @@
#include "hsa/hsa_ext_amd.h"
#endif
+#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <thread>
#include <tuple>
#include <utility>
@@ -289,18 +291,26 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
__atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+ std::atomic<bool> finished = false;
+ std::thread server(
+ [](std::atomic<bool> *finished, rpc_device_t device) {
+ while (!*finished) {
+ if (rpc_status_t err = rpc_handle_server(device))
+ handle_error(err);
+ }
+ },
+ &finished, device);
+
// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
- while (hsa_signal_wait_scacquire(
- packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
- /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
+ while (hsa_signal_wait_scacquire(packet->completion_signal,
+ HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+ HSA_WAIT_STATE_BLOCKED) != 0)
+ ;
- // Handle the server one more time in case the kernel exited with a pending
- // send still in flight.
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
+ finished = true;
+ if (server.joinable())
+ server.join();
// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
diff --git a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
index 1b210b9e7a896f..58e5e5f04d0a70 100644
--- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -20,10 +20,12 @@
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
+#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <thread>
#include <vector>
using namespace llvm;
@@ -224,6 +226,16 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
if (print_resource_usage)
print_kernel_resources(binary, kernel_name);
+ std::atomic<bool> finished = false;
+ std::thread server(
+ [](std::atomic<bool> *finished, rpc_device_t device) {
+ while (!*finished) {
+ if (rpc_status_t err = rpc_handle_server(device))
+ handle_error(err);
+ }
+ },
+ &finished, rpc_device);
+
// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
@@ -231,17 +243,13 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
- if (rpc_status_t err = rpc_handle_server(rpc_device))
- handle_error(err);
-
- // Handle the server one more time in case the kernel exited with a pending
- // send still in flight.
- if (rpc_status_t err = rpc_handle_server(rpc_device))
+ if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
+ finished = true;
+ if (server.joinable())
+ server.join();
+
return CUDA_SUCCESS;
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changes look reasonable to me
Summary:
Make a separate thread to run the server when we launch. This is
required by CUDA, which you can force with
export CUDA_LAUNCH_BLOCKING=1
. I figured I might as well be consistent and do it for the AMD implementation as well even though I believe it's not
necessary.