[libc] Make a dedicated thread for the RPC server (#111210)
Summary:
Run the RPC server on a separate thread for the duration of the kernel launch.
This is required for CUDA when kernel launches are made blocking, which you can
force with `export CUDA_LAUNCH_BLOCKING=1`: the launch call then does not return
until the kernel finishes, so the launching thread can no longer service the RPC
server itself. I figured I might as well be consistent and do it for the AMD
implementation as well, even though I believe it's not necessary there.

jhuber6 authored Oct 7, 2024
1 parent 1062007 commit ee57a68
Showing 2 changed files with 36 additions and 18 deletions.
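
A minimal, self-contained sketch of the pattern both loaders adopt in this commit: the RPC server is polled from a dedicated host thread while the launching thread blocks waiting for the kernel, and an atomic flag is flipped and the thread joined once the wait returns. The names handle_one_rpc() and wait_for_kernel() are hypothetical stand-ins for rpc_handle_server() and the HSA/CUDA wait calls seen in the diffs below.

#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

// Hypothetical stand-in for rpc_handle_server(): service at most one pending
// request from the device and return immediately.
static void handle_one_rpc() { /* service pending device requests */ }

// Hypothetical stand-in for the blocking wait on kernel completion
// (hsa_signal_wait_scacquire with an infinite timeout, or cuStreamSynchronize).
static void wait_for_kernel() {
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
}

int main() {
  std::atomic<bool> finished = false;

  // The server thread polls for RPC traffic the whole time the kernel runs,
  // so the launching thread is free to block in the wait call.
  std::thread server([&finished] {
    while (!finished)
      handle_one_rpc();
  });

  wait_for_kernel(); // launching thread blocks until the kernel completes

  // Tell the server thread to stop, then wait for it to exit.
  finished = true;
  if (server.joinable())
    server.join();

  std::puts("kernel finished, server thread joined");
  return 0;
}

Setting the flag only after the wait call returns mirrors the loaders below, where the kernel has already finished before the server thread is asked to stop.
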
libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp (19 additions, 9 deletions)
@@ -28,9 +28,11 @@
 #include "hsa/hsa_ext_amd.h"
 #endif
 
+#include <atomic>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <thread>
 #include <tuple>
 #include <utility>
 
@@ -289,18 +291,26 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
+  std::atomic<bool> finished = false;
+  std::thread server(
+      [](std::atomic<bool> *finished, rpc_device_t device) {
+        while (!*finished) {
+          if (rpc_status_t err = rpc_handle_server(device))
+            handle_error(err);
+        }
+      },
+      &finished, device);
+
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
-  while (hsa_signal_wait_scacquire(
-             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
-             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
-    if (rpc_status_t err = rpc_handle_server(device))
-      handle_error(err);
+  while (hsa_signal_wait_scacquire(packet->completion_signal,
+                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
+                                   HSA_WAIT_STATE_BLOCKED) != 0)
+    ;
 
-  // Handle the server one more time in case the kernel exited with a pending
-  // send still in flight.
-  if (rpc_status_t err = rpc_handle_server(device))
-    handle_error(err);
+  finished = true;
+  if (server.joinable())
+    server.join();
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
libc/utils/gpu/loader/nvptx/nvptx-loader.cpp (17 additions, 9 deletions)
@@ -20,10 +20,12 @@
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <vector>

using namespace llvm;
@@ -224,24 +226,30 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   if (print_resource_usage)
     print_kernel_resources(binary, kernel_name);
 
+  std::atomic<bool> finished = false;
+  std::thread server(
+      [](std::atomic<bool> *finished, rpc_device_t device) {
+        while (!*finished) {
+          if (rpc_status_t err = rpc_handle_server(device))
+            handle_error(err);
+        }
+      },
+      &finished, rpc_device);
+
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
           function, params.num_blocks_x, params.num_blocks_y,
           params.num_blocks_z, params.num_threads_x, params.num_threads_y,
           params.num_threads_z, 0, stream, nullptr, args_config))
     handle_error(err);
 
-  // Wait until the kernel has completed execution on the device. Periodically
-  // check the RPC client for work to be performed on the server.
-  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    if (rpc_status_t err = rpc_handle_server(rpc_device))
-      handle_error(err);
-
-  // Handle the server one more time in case the kernel exited with a pending
-  // send still in flight.
-  if (rpc_status_t err = rpc_handle_server(rpc_device))
+  if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
+  finished = true;
+  if (server.joinable())
+    server.join();
+
   return CUDA_SUCCESS;
 }
 
