Skip to content

Commit

Permalink
ggml : add RPC backend (#6829)
Browse files Browse the repository at this point in the history
* ggml : add RPC backend

The RPC backend proxies all operations to a remote server which runs a
regular backend (CPU, CUDA, Metal, etc).

* set TCP_NODELAY

* add CI workflows

* Address review comments

* fix warning

* implement llama_max_devices() for RPC

* Address review comments

* Address review comments

* wrap sockfd into a struct

* implement get_alignment and get_max_size

* add get_device_memory

* fix warning

* win32 support

* add README

* readme : trim trailing whitespace

* Address review comments

* win32 fix

* Address review comments

* fix compile warnings on macos
  • Loading branch information
rgerganov authored May 14, 2024
1 parent 5416002 commit 5e31828
Show file tree
Hide file tree
Showing 12 changed files with 1,395 additions and 98 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,36 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest

continue-on-error: true

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4

- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ubuntu-22-cmake-vulkan:
runs-on: ubuntu-22.04

Expand Down Expand Up @@ -663,6 +693,8 @@ jobs:
strategy:
matrix:
include:
- build: 'rpc'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2'
Expand Down
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
Expand Down Expand Up @@ -494,6 +495,17 @@ if (LLAMA_MPI)
endif()
endif()

if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)

if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
endif()

set(GGML_HEADERS_RPC ggml-rpc.h)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
Expand Down Expand Up @@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
Expand Down
10 changes: 10 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.rpc_servers = argv[i];
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
Expand Down Expand Up @@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
}
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
Expand Down Expand Up @@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
Expand Down
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,7 @@ else()
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()
2 changes: 2 additions & 0 deletions examples/rpc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)
74 changes: 74 additions & 0 deletions examples/rpc/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
## Overview

The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:

```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```

Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.

## Usage

On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:

```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```

Then, start the `rpc-server` with the backend:

```bash
$ bin/rpc-server 0.0.0.0 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```

When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.


On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:

```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```

Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```
70 changes: 70 additions & 0 deletions examples/rpc/rpc-server.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include "ggml-rpc.h"
#include <string>
#include <stdio.h>

static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif

// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}

static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
// TODO: implement for other backends
*free_mem = 1;
*total_mem = 1;
#endif
}

int main(int argc, char * argv[]) {
if (argc < 3) {
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
return 1;
}
const char * host = argv[1];
int port = std::stoi(argv[2]);
if (port <= 0 || port > 65535) {
fprintf(stderr, "Invalid port number: %d\n", port);
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
printf("Starting RPC server on %s:%d\n", host, port);
size_t free_mem, total_mem;
get_backend_memory(&free_mem, &total_mem);
std::string endpoint = std::string(host) + ":" + std::to_string(port);
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}
Loading

0 comments on commit 5e31828

Please sign in to comment.