diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8c338534dc5cc..7ac0e5f6ef15d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -340,6 +340,36 @@ jobs: cd build ctest -L main --verbose + ubuntu-latest-cmake-rpc: + runs-on: ubuntu-latest + + continue-on-error: true + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake -DLLAMA_RPC=ON .. + cmake --build . --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose + ubuntu-22-cmake-vulkan: runs-on: ubuntu-22.04 @@ -663,6 +693,8 @@ jobs: strategy: matrix: include: + - build: 'rpc' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON' - build: 'noavx' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx2' diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c3b5c8e423cc..feb6f39d09662 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) +option(LLAMA_RPC "llama: use RPC" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) @@ -494,6 +495,17 @@ if (LLAMA_MPI) endif() endif() +if (LLAMA_RPC) + add_compile_definitions(GGML_USE_RPC) + + if (WIN32) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32) + endif() + + set(GGML_HEADERS_RPC ggml-rpc.h) + set(GGML_SOURCES_RPC ggml-rpc.cpp) +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -1176,6 +1188,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} + ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} diff --git a/common/common.cpp b/common/common.cpp index ba1ecf0e59c8b..96130ad543553 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa #endif // GGML_USE_CUDA_SYCL_VULKAN return true; } + if (arg == "--rpc") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rpc_servers = argv[i]; + return true; + } if (arg == "--no-mmap") { params.use_mmap = false; return true; @@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); } + printf(" --rpc SERVERS comma separated list of RPC servers\n"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n"); @@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.rpc_servers = params.rpc_servers.c_str(); mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; diff --git a/common/common.h b/common/common.h index d80344f2a61bb..566490e2f881a 100644 --- a/common/common.h +++ b/common/common.h @@ -82,6 +82,7 @@ struct gpt_params { float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + std::string rpc_servers = ""; // comma separated list of RPC servers ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f421769cc2f0a..b40ee4ccb2ec1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,4 +49,7 @@ else() add_subdirectory(server) endif() add_subdirectory(export-lora) + if (LLAMA_RPC) + add_subdirectory(rpc) + endif() endif() diff --git a/examples/rpc/CMakeLists.txt b/examples/rpc/CMakeLists.txt new file mode 100644 index 0000000000000..ae48fb98d0913 --- /dev/null +++ b/examples/rpc/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(rpc-server rpc-server.cpp) +target_link_libraries(rpc-server PRIVATE ggml llama) diff --git a/examples/rpc/README.md b/examples/rpc/README.md new file mode 100644 index 0000000000000..325d0abc4b929 --- /dev/null +++ b/examples/rpc/README.md @@ -0,0 +1,74 @@ +## Overview + +The `rpc-server` allows running `ggml` backend on a remote host. +The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. +This can be used for distributed LLM inference with `llama.cpp` in the following way: + +```mermaid +flowchart TD + rpcb---|TCP|srva + rpcb---|TCP|srvb + rpcb-.-|TCP|srvn + subgraph hostn[Host N] + srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] + end + subgraph hostb[Host B] + srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] + end + subgraph hosta[Host A] + srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] + end + subgraph host[Main Host] + ggml[llama.cpp]---rpcb[RPC backend] + end + style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 +``` + +Each host can run a different backend, e.g. one with CUDA and another with Metal. +You can also run multiple `rpc-server` instances on the same host, each with a different backend. + +## Usage + +On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. +For example, to build the CUDA backend with RPC support: + +```bash +mkdir build-rpc-cuda +cd build-rpc-cuda +cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON +cmake --build . --config Release +``` + +Then, start the `rpc-server` with the backend: + +```bash +$ bin/rpc-server 0.0.0.0 50052 +create_backend: using CUDA backend +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes +Starting RPC server on 0.0.0.0:50052 +``` + +When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.: +```bash +$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052 +``` +This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. + + +On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: + +```bash +mkdir build-rpc +cd build-rpc +cmake .. -DLLAMA_RPC=ON +cmake --build . --config Release +``` + +Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: + +```bash +$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 +``` diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp new file mode 100644 index 0000000000000..496af84962de3 --- /dev/null +++ b/examples/rpc/rpc-server.cpp @@ -0,0 +1,70 @@ +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "ggml-rpc.h" +#include +#include + +static ggml_backend_t create_backend() { + ggml_backend_t backend = NULL; +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#elif GGML_USE_METAL + fprintf(stderr, "%s: using Metal backend\n", __func__); + backend = ggml_backend_metal_init(); + if (!backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } +#endif + + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + fprintf(stderr, "%s: using CPU backend\n", __func__); + backend = ggml_backend_cpu_init(); + } + return backend; +} + +static void get_backend_memory(size_t * free_mem, size_t * total_mem) { +#ifdef GGML_USE_CUDA + ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); +#else + // TODO: implement for other backends + *free_mem = 1; + *total_mem = 1; +#endif +} + +int main(int argc, char * argv[]) { + if (argc < 3) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + const char * host = argv[1]; + int port = std::stoi(argv[2]); + if (port <= 0 || port > 65535) { + fprintf(stderr, "Invalid port number: %d\n", port); + return 1; + } + ggml_backend_t backend = create_backend(); + if (!backend) { + fprintf(stderr, "Failed to create backend\n"); + return 1; + } + printf("Starting RPC server on %s:%d\n", host, port); + size_t free_mem, total_mem; + get_backend_memory(&free_mem, &total_mem); + std::string endpoint = std::string(host) + ":" + std::to_string(port); + start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); + ggml_backend_free(backend); + return 0; +} diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp new file mode 100644 index 0000000000000..efeacb297675b --- /dev/null +++ b/ggml-rpc.cpp @@ -0,0 +1,1023 @@ +#include "ggml-rpc.h" +#include "ggml.h" +#include "ggml-backend-impl.h" + +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +# include +# include +# include +# include +# include +#endif +#include + +#define UNUSED GGML_UNUSED + +#define GGML_DEBUG 1 +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#ifdef _WIN32 +typedef SOCKET sockfd_t; +using ssize_t = __int64; +#else +typedef int sockfd_t; +#endif + +// cross-platform socket +struct socket_t { + sockfd_t fd; + socket_t(sockfd_t fd) : fd(fd) {} + ~socket_t() { +#ifdef _WIN32 + closesocket(this->fd); +#else + close(this->fd); +#endif + } +}; + +// ggml_tensor is serialized into rpc_tensor +struct rpc_tensor { + uint64_t id; + uint32_t type; + uint64_t buffer; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t nb[GGML_MAX_DIMS]; + uint32_t op; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + int32_t flags; + uint64_t src[GGML_MAX_SRC]; + uint64_t view_src; + uint64_t view_offs; + uint64_t data; + char name[GGML_MAX_NAME]; +}; + +// RPC commands +enum rpc_cmd { + ALLOC_BUFFER = 0, + GET_ALIGNMENT, + GET_MAX_SIZE, + BUFFER_GET_BASE, + FREE_BUFFER, + BUFFER_CLEAR, + SET_TENSOR, + GET_TENSOR, + COPY_TENSOR, + GRAPH_COMPUTE, + GET_DEVICE_MEMORY, +}; + +// RPC data structures + +static ggml_guid_t ggml_backend_rpc_guid() { + static ggml_guid guid = {0x99, 0x68, 0x5b, 0x6c, 0xd2, 0x83, 0x3d, 0x24, 0x25, 0x36, 0x72, 0xe1, 0x5b, 0x0e, 0x14, 0x03}; + return &guid; +} + +struct ggml_backend_rpc_buffer_type_context { + std::shared_ptr sock; + std::string name; + size_t alignment; + size_t max_size; +}; + +struct ggml_backend_rpc_context { + std::string endpoint; + std::string name; + std::shared_ptr sock; + ggml_backend_buffer_type_t buft; +}; + +struct ggml_backend_rpc_buffer_context { + std::shared_ptr sock; + std::unordered_map base_cache; + uint64_t remote_ptr; + std::string name; +}; + +// RPC helper functions + +static std::shared_ptr make_socket(sockfd_t fd) { +#ifdef _WIN32 + if (fd == INVALID_SOCKET) { + return nullptr; + } +#else + if (fd < 0) { + return nullptr; + } +#endif + return std::make_shared(fd); +} + +static bool set_no_delay(sockfd_t sockfd) { + int flag = 1; + // set TCP_NODELAY to disable Nagle's algorithm + int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)); + return ret >= 0; +} + +static std::shared_ptr socket_connect(const char * host, int port) { + struct sockaddr_in addr; + auto sockfd = socket(AF_INET, SOCK_STREAM, 0); + auto sock_ptr = make_socket(sockfd); + if (sock_ptr == nullptr) { + return nullptr; + } + if (!set_no_delay(sockfd)) { + fprintf(stderr, "Failed to set TCP_NODELAY\n"); + return nullptr; + } + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + struct hostent * server = gethostbyname(host); + if (server == NULL) { + fprintf(stderr, "Cannot resolve host '%s'\n", host); + return nullptr; + } + memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length); + if (connect(sock_ptr->fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + return nullptr; + } + return sock_ptr; +} + +static std::shared_ptr socket_accept(sockfd_t srv_sockfd) { + auto client_socket_fd = accept(srv_sockfd, NULL, NULL); + auto client_socket = make_socket(client_socket_fd); + if (client_socket == nullptr) { + return nullptr; + } + if (!set_no_delay(client_socket_fd)) { + fprintf(stderr, "Failed to set TCP_NODELAY\n"); + return nullptr; + } + return client_socket; +} + +static std::shared_ptr create_server_socket(const char * host, int port) { + auto sockfd = socket(AF_INET, SOCK_STREAM, 0); + auto sock = make_socket(sockfd); + if (sock == nullptr) { + return nullptr; + } + + struct sockaddr_in serv_addr; + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = inet_addr(host); + serv_addr.sin_port = htons(port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { + return nullptr; + } + if (listen(sockfd, 1) < 0) { + return nullptr; + } + return sock; +} + +static bool send_data(sockfd_t sockfd, const void * data, size_t size) { + size_t bytes_sent = 0; + while (bytes_sent < size) { + ssize_t n = send(sockfd, (const char *)data + bytes_sent, size - bytes_sent, 0); + if (n < 0) { + return false; + } + bytes_sent += n; + } + return true; +} + +static bool recv_data(sockfd_t sockfd, void * data, size_t size) { + size_t bytes_recv = 0; + while (bytes_recv < size) { + ssize_t n = recv(sockfd, (char *)data + bytes_recv, size - bytes_recv, 0); + if (n <= 0) { + return false; + } + bytes_recv += n; + } + return true; +} + +static bool parse_endpoint(const char * endpoint, std::string & host, int & port) { + std::string str(endpoint); + size_t pos = str.find(':'); + if (pos == std::string::npos) { + return false; + } + host = str.substr(0, pos); + port = std::stoi(str.substr(pos + 1)); + return true; +} + +// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | +// RPC response: | response_size (8 bytes) | response_data (response_size bytes) | +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const std::vector & input, std::vector & output) { + uint8_t cmd_byte = cmd; + if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) { + return false; + } + uint64_t input_size = input.size(); + if (!send_data(sock->fd, &input_size, sizeof(input_size))) { + return false; + } + if (!send_data(sock->fd, input.data(), input.size())) { + return false; + } + uint64_t output_size; + if (!recv_data(sock->fd, &output_size, sizeof(output_size))) { + return false; + } + if (output_size == 0) { + output.clear(); + return true; + } + output.resize(output_size); + if (!recv_data(sock->fd, output.data(), output_size)) { + return false; + } + return true; +} + +// RPC client-side implementation + +GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + // input serialization format: | remote_ptr (8 bytes) | + std::vector input(sizeof(uint64_t), 0); + uint64_t remote_ptr = ctx->remote_ptr; + memcpy(input.data(), &remote_ptr, sizeof(remote_ptr)); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, FREE_BUFFER, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.empty()); + delete ctx; +} + +GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) { + return ctx->base_cache[buffer]; + } + // input serialization format: | remote_ptr (8 bytes) | + std::vector input(sizeof(uint64_t), 0); + uint64_t remote_ptr = ctx->remote_ptr; + memcpy(input.data(), &remote_ptr, sizeof(remote_ptr)); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, BUFFER_GET_BASE, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == sizeof(uint64_t)); + // output serialization format: | base_ptr (8 bytes) | + uint64_t base_ptr; + memcpy(&base_ptr, output.data(), sizeof(base_ptr)); + void * base = reinterpret_cast(base_ptr); + ctx->base_cache[buffer] = base; + return base; +} + +static rpc_tensor serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + result.buffer = ctx->remote_ptr; + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + UNUSED(buffer); + if (ggml_is_quantized(tensor->type)) { + // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized + GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor"); + } +} + +GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) | + size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size; + std::vector input(input_size, 0); + rpc_tensor rpc_tensor = serialize_tensor(tensor); + memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); + memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); + memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, SET_TENSOR, input, output); + GGML_ASSERT(status); +} + +GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + // input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) | + int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t); + std::vector input(input_size, 0); + rpc_tensor rpc_tensor = serialize_tensor(tensor); + memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); + memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); + memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &size, sizeof(size)); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, GET_TENSOR, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == size); + // output serialization format: | data (size bytes) | + memcpy(data, output.data(), size); +} + +GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + // check if src and dst are on the same server + ggml_backend_buffer_t src_buffer = src->buffer; + ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context; + ggml_backend_buffer_t dst_buffer = dst->buffer; + ggml_backend_rpc_buffer_context * dst_ctx = (ggml_backend_rpc_buffer_context *)dst_buffer->context; + if (src_ctx->sock != dst_ctx->sock) { + return false; + } + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + // input serialization format: | rpc_tensor src | rpc_tensor dst | + int input_size = 2*sizeof(rpc_tensor); + std::vector input(input_size, 0); + rpc_tensor rpc_src = serialize_tensor(src); + rpc_tensor rpc_dst = serialize_tensor(dst); + memcpy(input.data(), &rpc_src, sizeof(rpc_src)); + memcpy(input.data() + sizeof(rpc_src), &rpc_dst, sizeof(rpc_dst)); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, COPY_TENSOR, input, output); + GGML_ASSERT(status); + // output serialization format: | result (1 byte) | + GGML_ASSERT(output.size() == 1); + return output[0]; +} + +GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; + // serialization format: | bufptr (8 bytes) | value (1 byte) | + int input_size = sizeof(uint64_t) + sizeof(uint8_t); + std::vector input(input_size, 0); + memcpy(input.data(), &ctx->remote_ptr, sizeof(ctx->remote_ptr)); + memcpy(input.data() + sizeof(ctx->remote_ptr), &value, sizeof(value)); + std::vector output; + bool status = send_rpc_cmd(ctx->sock, BUFFER_CLEAR, input, output); + GGML_ASSERT(status); +} + +static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { + /* .get_name = */ ggml_backend_rpc_buffer_get_name, + /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer, + /* .get_base = */ ggml_backend_rpc_buffer_get_base, + /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor, + /* .clear = */ ggml_backend_rpc_buffer_clear, + /* .reset = */ NULL, +}; + +GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + return buft_ctx->name.c_str(); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + // input serialization format: | size (8 bytes) | + int input_size = sizeof(uint64_t); + std::vector input(input_size, 0); + memcpy(input.data(), &size, sizeof(size)); + std::vector output; + bool status = send_rpc_cmd(buft_ctx->sock, ALLOC_BUFFER, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == 2*sizeof(uint64_t)); + // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) | + uint64_t remote_ptr; + memcpy(&remote_ptr, output.data(), sizeof(remote_ptr)); + size_t remote_size; + memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size)); + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, + ggml_backend_rpc_buffer_interface, + new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"}, + remote_size); + + return buffer; +} + +static size_t get_alignment(const std::shared_ptr & sock) { + // input serialization format: | 0 bytes | + std::vector input; + std::vector output; + bool status = send_rpc_cmd(sock, GET_ALIGNMENT, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == sizeof(uint64_t)); + // output serialization format: | alignment (8 bytes) | + uint64_t alignment; + memcpy(&alignment, output.data(), sizeof(alignment)); + return alignment; +} + +GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + return buft_ctx->alignment; +} + +static size_t get_max_size(const std::shared_ptr & sock) { + // input serialization format: | 0 bytes | + std::vector input; + std::vector output; + bool status = send_rpc_cmd(sock, GET_MAX_SIZE, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == sizeof(uint64_t)); + // output serialization format: | max_size (8 bytes) | + uint64_t max_size; + memcpy(&max_size, output.data(), sizeof(max_size)); + return max_size; +} + +GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + return buft_ctx->max_size; +} + +GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + UNUSED(buft); + return ggml_nbytes(tensor); +} + +GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + if (!ggml_backend_is_rpc(backend)) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + return buft_ctx->sock == rpc_ctx->sock; +} + +static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { + /* .get_name = */ ggml_backend_rpc_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_rpc_get_max_size, + /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) { + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + + return rpc_ctx->name.c_str(); +} + +GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) { + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context; + delete buft_ctx; + delete rpc_ctx->buft; + delete rpc_ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context; + return ctx->buft; +} + +GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { + UNUSED(backend); + // this is no-op because we don't have any async operations +} + +static void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +static void serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + out_nodes[i] = reinterpret_cast(cgraph->nodes[i]); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} + +GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + std::vector input; + serialize_graph(cgraph, input); + std::vector output; + bool status = send_rpc_cmd(rpc_ctx->sock, GRAPH_COMPUTE, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == 1); + return (enum ggml_status)output[0]; +} + +GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + UNUSED(backend); + UNUSED(op); + GGML_ASSERT(false && "not implemented"); + return false; +} + +static ggml_backend_i ggml_backend_rpc_interface = { + /* .get_name = */ ggml_backend_rpc_name, + /* .free = */ ggml_backend_rpc_free, + /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ ggml_backend_rpc_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_rpc_graph_compute, + /* .supports_op = */ ggml_backend_rpc_supports_op, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static std::unordered_map instances; + +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { + ggml_backend_t backend = ggml_backend_rpc_init(endpoint); + return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr; +} + +GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { + std::string endpoint_str(endpoint); + if (instances.find(endpoint_str) != instances.end()) { + return instances[endpoint_str]; + } +#ifdef _WIN32 + { + WSADATA wsaData; + int res = WSAStartup(MAKEWORD(2, 2), &wsaData); + if (res != 0) { + return nullptr; + } + } +#endif + GGML_PRINT_DEBUG("Connecting to %s\n", endpoint); + std::string host; + int port; + if (!parse_endpoint(endpoint, host, port)) { + return nullptr; + } + auto sock = socket_connect(host.c_str(), port); + if (sock == nullptr) { + return nullptr; + } + size_t alignment = get_alignment(sock); + size_t max_size = get_max_size(sock); + ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context { + /* .sock = */ sock, + /* .name = */ "RPC" + std::to_string(sock->fd), + /* .alignment = */ alignment, + /* .max_size = */ max_size + }; + + ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type { + /* .iface = */ ggml_backend_rpc_buffer_type_interface, + /* .context = */ buft_ctx + }; + + ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { + /* .endpoint = */ endpoint, + /* .name = */ "RPC" + std::to_string(sock->fd), + /* .sock = */ sock, + /* .buft = */ buft + }; + + instances[endpoint] = new ggml_backend { + /* .guid = */ ggml_backend_rpc_guid(), + /* .interface = */ ggml_backend_rpc_interface, + /* .context = */ ctx + }; + + return instances[endpoint]; +} + +GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid()); +} + +static void get_device_memory(const std::shared_ptr & sock, size_t * free, size_t * total) { + // input serialization format: | 0 bytes | + std::vector input; + std::vector output; + bool status = send_rpc_cmd(sock, GET_DEVICE_MEMORY, input, output); + GGML_ASSERT(status); + GGML_ASSERT(output.size() == 2*sizeof(uint64_t)); + // output serialization format: | free (8 bytes) | total (8 bytes) | + uint64_t free_mem; + memcpy(&free_mem, output.data(), sizeof(free_mem)); + uint64_t total_mem; + memcpy(&total_mem, output.data() + sizeof(uint64_t), sizeof(total_mem)); + *free = free_mem; + *total = total_mem; +} + +GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { + ggml_backend_t backend = ggml_backend_rpc_init(endpoint); + if (backend == nullptr) { + *free = 0; + *total = 0; + return; + } + ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context; + get_device_memory(ctx->sock, free, total); +} + +// RPC server-side implementation + +static void rpc_alloc_buffer(ggml_backend_t backend, const std::vector & input, std::vector & output) { + // input serialization format: | size (8 bytes) | + uint64_t size; + memcpy(&size, input.data(), sizeof(size)); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); + uint64_t remote_ptr = reinterpret_cast(buffer); + uint64_t remote_size = buffer->size; + GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size); + // output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) | + output.resize(2*sizeof(uint64_t), 0); + memcpy(output.data(), &remote_ptr, sizeof(remote_ptr)); + memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size)); +} + +static void rpc_get_alignment(ggml_backend_t backend, std::vector & output) { + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + size_t alignment = ggml_backend_buft_get_alignment(buft); + GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment); + // output serialization format: | alignment (8 bytes) | + output.resize(sizeof(uint64_t), 0); + memcpy(output.data(), &alignment, sizeof(alignment)); +} + +static void rpc_get_max_size(ggml_backend_t backend, std::vector & output) { + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + size_t max_size = ggml_backend_buft_get_max_size(buft); + GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size); + // output serialization format: | max_size (8 bytes) | + output.resize(sizeof(uint64_t), 0); + memcpy(output.data(), &max_size, sizeof(max_size)); +} + +static void rpc_buffer_get_base(const std::vector & input, std::vector & output) { + // input serialization format: | remote_ptr (8 bytes) | + uint64_t remote_ptr; + memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr); + ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); + void * base = ggml_backend_buffer_get_base(buffer); + // output serialization format: | base_ptr (8 bytes) | + uint64_t base_ptr = reinterpret_cast(base); + output.resize(sizeof(uint64_t), 0); + memcpy(output.data(), &base_ptr, sizeof(base_ptr)); +} + +static void rpc_free_buffer(const std::vector & input) { + // input serialization format: | remote_ptr (8 bytes) | + uint64_t remote_ptr; + memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr); + ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); + ggml_backend_buffer_free(buffer); +} + +static void rpc_buffer_clear(const std::vector & input) { + // input serialization format: | remote_ptr (8 bytes) | value (1 byte) | + uint64_t remote_ptr; + memcpy(&remote_ptr, input.data(), sizeof(remote_ptr)); + uint8_t value; + memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value)); + GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value); + ggml_backend_buffer_t buffer = reinterpret_cast(remote_ptr); + ggml_backend_buffer_clear(buffer, value); +} + +static void rpc_set_tensor(const std::vector & input) { + // serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) | + const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); + uint64_t offset; + memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); + size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); + + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); + GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset); + ggml_backend_tensor_set(tensor, data, offset, size); + ggml_free(ctx); +} + +static void rpc_get_tensor(const std::vector & input, std::vector & output) { + // serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) | + const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); + uint64_t offset; + memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); + uint64_t size; + memcpy(&size, input.data() + sizeof(rpc_tensor) + sizeof(offset), sizeof(size)); + + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); + GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); + // output serialization format: | data (size bytes) | + output.resize(size, 0); + ggml_backend_tensor_get(tensor, output.data(), offset, size); + ggml_free(ctx); +} + +static void rpc_copy_tensor(const std::vector & input, std::vector & output) { + // serialization format: | rpc_tensor src | rpc_tensor dst | + const rpc_tensor * rpc_src = (const rpc_tensor *)input.data(); + const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src)); + + struct ggml_init_params params { + /*.mem_size =*/ 2*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + ggml_tensor * src = deserialize_tensor(ctx, rpc_src); + ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst); + GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer); + bool result = ggml_backend_buffer_copy_tensor(src, dst); + // output serialization format: | result (1 byte) | + output.resize(1, 0); + output[0] = result; + ggml_free(ctx); +} + +static struct ggml_tensor * create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +static void rpc_graph_compute(ggml_backend_t backend, const std::vector & input, std::vector & output) { + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_nodes; + memcpy(&n_nodes, input.data(), sizeof(n_nodes)); + const uint64_t * nodes = (const uint64_t *)(input.data() + sizeof(n_nodes)); + uint32_t n_tensors; + memcpy(&n_tensors, input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t), sizeof(n_tensors)); + const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors)); + GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors); + + static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map); + } + ggml_status status = ggml_backend_graph_compute(backend, graph); + // output serialization format: | status (1 byte) | + output.resize(1, 0); + output[0] = status; + ggml_free(ctx); +} + +static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) { + while (true) { + uint8_t cmd; + if (!recv_data(sockfd, &cmd, 1)) { + break; + } + std::vector input; + std::vector output; + uint64_t input_size; + if (!recv_data(sockfd, &input_size, sizeof(input_size))) { + break; + } + input.resize(input_size); + if (!recv_data(sockfd, input.data(), input_size)) { + break; + } + switch (cmd) { + case ALLOC_BUFFER: { + rpc_alloc_buffer(backend, input, output); + break; + } + case GET_ALIGNMENT: { + rpc_get_alignment(backend, output); + break; + } + case GET_MAX_SIZE: { + rpc_get_max_size(backend, output); + break; + } + case BUFFER_GET_BASE: { + rpc_buffer_get_base(input, output); + break; + } + case FREE_BUFFER: { + rpc_free_buffer(input); + break; + } + case BUFFER_CLEAR: { + rpc_buffer_clear(input); + break; + } + case SET_TENSOR: { + rpc_set_tensor(input); + break; + } + case GET_TENSOR: { + rpc_get_tensor(input, output); + break; + } + case COPY_TENSOR: { + rpc_copy_tensor(input, output); + break; + } + case GRAPH_COMPUTE: { + rpc_graph_compute(backend, input, output); + break; + } + case GET_DEVICE_MEMORY: { + // output serialization format: | free (8 bytes) | total (8 bytes) | + output.resize(2*sizeof(uint64_t), 0); + memcpy(output.data(), &free_mem, sizeof(free_mem)); + memcpy(output.data() + sizeof(uint64_t), &total_mem, sizeof(total_mem)); + break; + } + default: { + fprintf(stderr, "Unknown command: %d\n", cmd); + return; + } + } + uint64_t output_size = output.size(); + if (!send_data(sockfd, &output_size, sizeof(output_size))) { + break; + } + if (!send_data(sockfd, output.data(), output_size)) { + break; + } + } +} + +void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { + std::string host; + int port; + if (!parse_endpoint(endpoint, host, port)) { + return; + } +#ifdef _WIN32 + { + WSADATA wsaData; + int res = WSAStartup(MAKEWORD(2, 2), &wsaData); + if (res != 0) { + fprintf(stderr, "WSAStartup failed: %d\n", res); + return; + } + } +#endif + auto server_socket = create_server_socket(host.c_str(), port); + if (server_socket == nullptr) { + fprintf(stderr, "Failed to create server socket\n"); + return; + } + while (true) { + auto client_socket = socket_accept(server_socket->fd); + if (client_socket == nullptr) { + fprintf(stderr, "Failed to accept client connection\n"); + return; + } + printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem); + rpc_serve_client(backend, client_socket->fd, free_mem, total_mem); + printf("Client connection closed\n"); + } +#ifdef _WIN32 + WSACleanup(); +#endif +} diff --git a/ggml-rpc.h b/ggml-rpc.h new file mode 100644 index 0000000000000..aa144832a6e1e --- /dev/null +++ b/ggml-rpc.h @@ -0,0 +1,24 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_RPC_MAX_SERVERS 16 + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); + +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); + +GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); + +GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); + +#ifdef __cplusplus +} +#endif diff --git a/llama.cpp b/llama.cpp index ad35e4a2e5306..7d26966e49110 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7,6 +7,10 @@ #include "ggml-alloc.h" #include "ggml-backend.h" +#ifdef GGML_USE_RPC +# include "ggml-rpc.h" +#endif + #ifdef GGML_USE_CUDA # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -1685,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer GGML_UNUSED(host_buffer); } -static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { - ggml_backend_buffer_type_t buft = nullptr; - -#ifdef GGML_USE_METAL - buft = ggml_backend_metal_buffer_type(); -#elif defined(GGML_USE_CUDA) - buft = ggml_backend_cuda_buffer_type(gpu); -#elif defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(gpu); -#elif defined(GGML_USE_SYCL) - buft = ggml_backend_sycl_buffer_type(gpu); -#elif defined(GGML_USE_CLBLAST) - buft = ggml_backend_opencl_buffer_type(); -#elif defined(GGML_USE_KOMPUTE) - buft = ggml_backend_kompute_buffer_type(gpu); - if (buft == nullptr) { - LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); - } -#endif - - if (buft == nullptr) { - buft = llama_default_buffer_type_cpu(true); - } - return buft; - - GGML_UNUSED(gpu); -} - -static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { - ggml_backend_buffer_type_t buft = nullptr; - -#ifdef GGML_USE_CUDA - if (ggml_backend_cuda_get_device_count() > 1) { - buft = ggml_backend_cuda_split_buffer_type(tensor_split); - } -#endif - -#ifdef GGML_USE_SYCL - if (ggml_backend_sycl_get_device_count() > 1) { - buft = ggml_backend_sycl_split_buffer_type(tensor_split); - } -#endif - - if (buft == nullptr) { - buft = llama_default_buffer_type_offload(fallback_gpu); - } - return buft; - - GGML_UNUSED(tensor_split); -} - -static size_t llama_get_device_count() { -#if defined(GGML_USE_CUDA) - return ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - return ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - return ggml_backend_vk_get_device_count(); -#else - return 1; -#endif -} - -static size_t llama_get_device_memory(int device) { -#if defined(GGML_USE_CUDA) - size_t total; - size_t free; - ggml_backend_cuda_get_device_memory(device, &free, &total); - return free; -#elif defined(GGML_USE_SYCL) - size_t total; - size_t free; - ggml_backend_sycl_get_device_memory(device, &free, &total); - return free; -#elif defined(GGML_USE_VULKAN) - size_t total; - size_t free; - ggml_backend_vk_get_device_memory(device, &free, &total); - return free; -#else - return 1; - GGML_UNUSED(device); -#endif -} - // // globals // @@ -2210,6 +2129,8 @@ struct llama_model { int main_gpu; int n_gpu_layers; + std::vector rpc_servers; + // gguf metadata std::unordered_map gguf_kv; @@ -2353,6 +2274,104 @@ struct llama_context { #endif }; +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_RPC + std::string endpoint = model.rpc_servers[gpu]; + buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); +#elif defined(GGML_USE_METAL) + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_CUDA) + buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_VULKAN) + buft = ggml_backend_vk_buffer_type(gpu); +#elif defined(GGML_USE_SYCL) + buft = ggml_backend_sycl_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); +#elif defined(GGML_USE_KOMPUTE) + buft = ggml_backend_kompute_buffer_type(gpu); + if (buft == nullptr) { + LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + GGML_UNUSED(model); + GGML_UNUSED(gpu); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_CUDA + if (ggml_backend_cuda_get_device_count() > 1) { + buft = ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + +#ifdef GGML_USE_SYCL + if (ggml_backend_sycl_get_device_count() > 1) { + buft = ggml_backend_sycl_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(model, fallback_gpu); + } + return buft; + + GGML_UNUSED(tensor_split); +} + +static size_t llama_get_device_count(const llama_model & model) { +#if defined(GGML_USE_RPC) + return model.rpc_servers.size(); +#elif defined(GGML_USE_CUDA) + return ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_SYCL) + return ggml_backend_sycl_get_device_count(); +#elif defined(GGML_USE_VULKAN) + return ggml_backend_vk_get_device_count(); +#else + return 1; +#endif + GGML_UNUSED(model); +} + +static size_t llama_get_device_memory(const llama_model & model, int device) { +#if defined(GGML_USE_RPC) + size_t total; + size_t free; + std::string endpoint = model.rpc_servers[device]; + ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); + return free; +#elif defined(GGML_USE_CUDA) + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(device, &free, &total); + return free; +#elif defined(GGML_USE_SYCL) + size_t total; + size_t free; + ggml_backend_sycl_get_device_memory(device, &free, &total); + return free; +#elif defined(GGML_USE_VULKAN) + size_t total; + size_t free; + ggml_backend_vk_get_device_memory(device, &free, &total); + return free; +#else + return 1; +#endif + GGML_UNUSED(model); + GGML_UNUSED(device); +} + // // kv cache helpers // @@ -4791,13 +4810,13 @@ static bool llm_load_tensors( if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points - int device_count = llama_get_device_count(); + int device_count = llama_get_device_count(model); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); std::vector splits(device_count); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { - splits[i] = llama_get_device_memory(i); + splits[i] = llama_get_device_memory(model, i); } } else { std::copy(tensor_split, tensor_split + device_count, splits.begin()); @@ -4817,35 +4836,35 @@ static bool llm_load_tensors( int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin(); - model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu); } // assign the output layer if (n_gpu_layers > n_layer) { int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); - model.buft_output = llama_default_buffer_type_offload(layer_gpu); + model.buft_output = llama_default_buffer_type_offload(model, layer_gpu); } else { model.buft_output = llama_default_buffer_type_cpu(true); } } else { ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_MODE_ROW) { - split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split); } else { // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported - split_buft = llama_default_buffer_type_offload(main_gpu); + split_buft = llama_default_buffer_type_offload(model, main_gpu); } // assign the repeating layers for (int64_t i = i_gpu_start; i < n_layer; ++i) { model.buft_layer[i] = { split_buft, - llama_default_buffer_type_offload(main_gpu) + llama_default_buffer_type_offload(model, main_gpu) }; } // assign the output layer if (n_gpu_layers > n_layer) { model.buft_output = { split_buft, - llama_default_buffer_type_offload(main_gpu) + llama_default_buffer_type_offload(model, main_gpu) }; } else { model.buft_output = llama_default_buffer_type_cpu(true); @@ -15390,6 +15409,7 @@ struct llama_model_params llama_model_default_params() { /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, + /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, @@ -15460,7 +15480,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { } size_t llama_max_devices(void) { -#if defined(GGML_USE_METAL) +#if defined(GGML_USE_RPC) + return GGML_RPC_MAX_SERVERS; +#elif defined(GGML_USE_METAL) return 1; #elif defined(GGML_USE_CUDA) return GGML_CUDA_MAX_DEVICES; @@ -15483,7 +15505,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -15546,7 +15568,17 @@ struct llama_model * llama_load_model_from_file( return true; }; } - + if (params.rpc_servers != nullptr) { + // split the servers set them into model->rpc_servers + std::string servers(params.rpc_servers); + size_t pos = 0; + while ((pos = servers.find(",")) != std::string::npos) { + std::string server = servers.substr(0, pos); + model->rpc_servers.push_back(server); + servers.erase(0, pos + 1); + } + model->rpc_servers.push_back(servers); + } int status = llama_model_load(path_model, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { @@ -15693,7 +15725,17 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends -#ifdef GGML_USE_METAL +#if defined(GGML_USE_RPC) + for (auto & server : model->rpc_servers) { + ggml_backend_t backend = ggml_backend_rpc_init(server.c_str()); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } +#elif defined(GGML_USE_METAL) if (model->n_gpu_layers > 0) { ctx->backend_metal = ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { @@ -15850,7 +15892,7 @@ struct llama_context * llama_new_context_with_model( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = - llama_get_device_count() > 1 && + llama_get_device_count(*model) > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER && params.offload_kqv; diff --git a/llama.h b/llama.h index 0b2e708d06dea..612e32c4ea058 100644 --- a/llama.h +++ b/llama.h @@ -242,6 +242,9 @@ extern "C" { // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; + // comma separated list of RPC servers to use for offloading + const char * rpc_servers; + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. // If it returns false, model loading is immediately aborted.