-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The purpose of the RPC backend is to proxy all operations to another host where they are implemented with one of the existing backends (e.g. CUDA, Metal, etc.).
- Loading branch information
Showing
9 changed files
with
1,052 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# RPC example: a client that proxies ggml ops to a remote host, and the server it talks to.
add_executable(client client.cpp)
add_executable(server server.cpp)

target_link_libraries(client PRIVATE ggml)
target_link_libraries(server PRIVATE ggml)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
#include "ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#include "ggml-rpc.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>
|
||
// This is a simple model with two tensors a and b.
// It owns the RPC backend handle, the backend buffer holding the tensor
// data, and the ggml context holding the tensor metadata.
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // RPC backend; all operations are proxied to a remote host
    ggml_backend_t backend = NULL;

    // the backend buffer that stores the tensor data of a and b
    ggml_backend_buffer_t buffer;

    // the context that defines the tensor information (dimensions, size, memory address)
    struct ggml_context * ctx;
};
|
||
// Create the model tensors in a metadata-only context, allocate their storage
// in the backend, and upload the data from host memory.
//   model          - output; ctx, a, b and buffer are filled in (backend must be set by the caller)
//   a, b           - row-major host data for the two matrices
//   rows_*, cols_* - matrix dimensions
// Exits the process on allocation failure.
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B)
{
    int num_tensors = 2;

    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data lives in the backend buffer, not in this context
    };

    // create a context that only holds tensor metadata
    model.ctx = ggml_init(params);
    if (!model.ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        exit(1);
    }

    // create tensors (note: ggml dimensions are ordered [cols, rows])
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    // create a backend buffer (backend memory) and alloc the tensors from the context
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
    if (!model.buffer) {
        fprintf(stderr, "%s: ggml_backend_alloc_ctx_tensors() failed\n", __func__);
        exit(1);
    }

    // load data from cpu memory to backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}
|
||
// build the compute graph to perform a matrix multiplication | ||
struct ggml_cgraph * build_graph(const simple_model& model) { | ||
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); | ||
static std::vector<uint8_t> buf(buf_size); | ||
|
||
struct ggml_init_params params0 = { | ||
/*.mem_size =*/ buf_size, | ||
/*.mem_buffer =*/ buf.data(), | ||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() | ||
}; | ||
|
||
// create a temporally context to build the graph | ||
struct ggml_context * ctx0 = ggml_init(params0); | ||
|
||
struct ggml_cgraph * gf = ggml_new_graph(ctx0); | ||
|
||
// result = a*b^T | ||
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b); | ||
|
||
// build operations nodes | ||
ggml_build_forward_expand(gf, result); | ||
|
||
// delete the temporally context used to build the graph | ||
ggml_free(ctx0); | ||
return gf; | ||
} | ||
|
||
// compute with backend | ||
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) { | ||
// reset the allocator to free all the memory allocated during the previous inference | ||
|
||
struct ggml_cgraph * gf = build_graph(model); | ||
|
||
// allocate tensors | ||
ggml_gallocr_alloc_graph(allocr, gf); | ||
|
||
ggml_status status = ggml_backend_graph_compute(model.backend, gf); | ||
if (status != GGML_STATUS_SUCCESS) { | ||
fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__); | ||
exit(1); | ||
} | ||
|
||
// in this case, the output tensor is the last one in the graph | ||
return gf->nodes[gf->n_nodes - 1]; | ||
} | ||
|
||
int main(int argc, char * argv[]) | ||
{ | ||
if (argc < 2) { | ||
fprintf(stderr, "Usage: %s <server_addr>\n", argv[0]); | ||
return 1; | ||
} | ||
ggml_time_init(); | ||
|
||
// initialize data of matrices to perform matrix multiplication | ||
const int rows_A = 4, cols_A = 2; | ||
|
||
float matrix_A[rows_A * cols_A] = { | ||
2, 8, | ||
5, 1, | ||
4, 2, | ||
8, 6 | ||
}; | ||
|
||
const int rows_B = 3, cols_B = 2; | ||
/* Transpose([ | ||
10, 9, 5, | ||
5, 9, 4 | ||
]) 2 rows, 3 cols */ | ||
float matrix_B[rows_B * cols_B] = { | ||
10, 5, | ||
9, 9, | ||
5, 4 | ||
}; | ||
|
||
simple_model model; | ||
model.backend = ggml_backend_rpc_init(argv[1]); | ||
if (!model.backend) { | ||
fprintf(stderr, "%s: ggml_backend_rpc_init() failed\n", __func__); | ||
exit(1); | ||
} | ||
load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B); | ||
|
||
// calculate the temporaly memory required to compute | ||
ggml_gallocr_t allocr = NULL; | ||
|
||
{ | ||
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); | ||
|
||
// create the worst case graph for memory usage estimation | ||
struct ggml_cgraph * gf = build_graph(model); | ||
ggml_gallocr_reserve(allocr, gf); | ||
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); | ||
|
||
fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); | ||
} | ||
|
||
// perform computation | ||
struct ggml_tensor * result = compute(model, allocr); | ||
|
||
// create a array to print result | ||
std::vector<float> out_data(ggml_nelements(result)); | ||
|
||
// bring the data from the backend memory | ||
ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result)); | ||
|
||
// expected result: | ||
// [ 60.00 110.00 54.00 29.00 | ||
// 55.00 90.00 126.00 28.00 | ||
// 50.00 54.00 42.00 64.00 ] | ||
|
||
printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]); | ||
for (int j = 0; j < result->ne[1] /* rows */; j++) { | ||
if (j > 0) { | ||
printf("\n"); | ||
} | ||
|
||
for (int i = 0; i < result->ne[0] /* cols */; i++) { | ||
printf(" %.2f", out_data[i * result->ne[1] + j]); | ||
} | ||
} | ||
printf(" ]\n"); | ||
|
||
// release backend memory used for computation | ||
ggml_gallocr_free(allocr); | ||
|
||
// free memory | ||
ggml_free(model.ctx); | ||
|
||
// release backend memory and free backend | ||
ggml_backend_buffer_free(model.buffer); | ||
ggml_backend_free(model.backend); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#include <cstdio>
#include <iostream>
#include <memory>
#include <string>

#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>

#include "ggml-rpc.h"
|
||
int main(int argc, char * argv[]) | ||
{ | ||
if (argc < 2) { | ||
fprintf(stderr, "Usage: %s <port>\n", argv[0]); | ||
return 1; | ||
} | ||
int port = std::stoi(argv[1]); | ||
std::string server_address = "0.0.0.0:" + std::to_string(port); | ||
BackendImpl service; | ||
|
||
grpc::EnableDefaultHealthCheckService(true); | ||
grpc::reflection::InitProtoReflectionServerBuilderPlugin(); | ||
grpc::ServerBuilder builder; | ||
// Listen on the given address without any authentication mechanism. | ||
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); | ||
builder.RegisterService(&service); | ||
std::unique_ptr<grpc::Server> server(builder.BuildAndStart()); | ||
std::cout << "RPC backend listening on " << server_address << std::endl; | ||
|
||
// Wait for the server to shutdown. Note that some other thread must be | ||
// responsible for shutting down the server for this call to ever return. | ||
server->Wait(); | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.