-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The purpose of the RPC backend is to proxy all operations to another host where they are implemented with one of the existing backends (e.g. CUDA, Metal, etc.).
- Loading branch information
Showing
9 changed files
with
1,052 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# RPC example: a client that proxies ggml ops to a remote host, and the server it talks to.
add_executable(client client.cpp)
add_executable(server server.cpp)

target_link_libraries(client PRIVATE ggml)
target_link_libraries(server PRIVATE ggml)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
#include "ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#include "ggml-rpc.h"

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>
|
||
// This is a simple model with two tensors a and b.
// It owns the RPC backend handle, the backend buffer holding the tensor
// data, and the ggml context holding the tensor metadata.
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // RPC backend; all operations are proxied to a remote host
    ggml_backend_t backend = NULL;

    // the backend buffer that stores the tensor data of a and b
    ggml_backend_buffer_t buffer;

    // the context that defines the tensor information (dimensions, size, memory address)
    struct ggml_context * ctx;
};
|
||
// Create the model tensors in a metadata-only context, allocate their storage
// in the backend, and upload the data from host memory.
//   model          - output; ctx, a, b and buffer are filled in (backend must be set by the caller)
//   a, b           - row-major host data for the two matrices
//   rows_*, cols_* - matrix dimensions
// Exits the process on allocation failure.
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B)
{
    int num_tensors = 2;

    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data lives in the backend buffer, not in this context
    };

    // create a context that only holds tensor metadata
    model.ctx = ggml_init(params);
    if (!model.ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        exit(1);
    }

    // create tensors (note: ggml dimensions are ordered [cols, rows])
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    // create a backend buffer (backend memory) and alloc the tensors from the context
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
    if (!model.buffer) {
        fprintf(stderr, "%s: ggml_backend_alloc_ctx_tensors() failed\n", __func__);
        exit(1);
    }

    // load data from cpu memory to backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}
|
||
// build the compute graph to perform a matrix multiplication | ||
struct ggml_cgraph * build_graph(const simple_model& model) { | ||
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); | ||
static std::vector<uint8_t> buf(buf_size); | ||
|
||
struct ggml_init_params params0 = { | ||
/*.mem_size =*/ buf_size, | ||
/*.mem_buffer =*/ buf.data(), | ||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() | ||
}; | ||
|
||
// create a temporally context to build the graph | ||
struct ggml_context * ctx0 = ggml_init(params0); | ||
|
||
struct ggml_cgraph * gf = ggml_new_graph(ctx0); | ||
|
||
// result = a*b^T | ||
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b); | ||
|
||
// build operations nodes | ||
ggml_build_forward_expand(gf, result); | ||
|
||
// delete the temporally context used to build the graph | ||
ggml_free(ctx0); | ||
return gf; | ||
} | ||
|
||
// compute with backend | ||
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) { | ||
// reset the allocator to free all the memory allocated during the previous inference | ||
|
||
struct ggml_cgraph * gf = build_graph(model); | ||
|
||
// allocate tensors | ||
ggml_gallocr_alloc_graph(allocr, gf); | ||
|
||
ggml_status status = ggml_backend_graph_compute(model.backend, gf); | ||
if (status != GGML_STATUS_SUCCESS) { | ||
fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__); | ||
exit(1); | ||
} | ||
|
||
// in this case, the output tensor is the last one in the graph | ||
return gf->nodes[gf->n_nodes - 1]; | ||
} | ||
|
||
int main(int argc, char * argv[]) | ||
{ | ||
if (argc < 2) { | ||
fprintf(stderr, "Usage: %s <server_addr>\n", argv[0]); | ||
return 1; | ||
} | ||
ggml_time_init(); | ||
|
||
// initialize data of matrices to perform matrix multiplication | ||
const int rows_A = 4, cols_A = 2; | ||
|
||
float matrix_A[rows_A * cols_A] = { | ||
2, 8, | ||
5, 1, | ||
4, 2, | ||
8, 6 | ||
}; | ||
|
||
const int rows_B = 3, cols_B = 2; | ||
/* Transpose([ | ||
10, 9, 5, | ||
5, 9, 4 | ||
]) 2 rows, 3 cols */ | ||
float matrix_B[rows_B * cols_B] = { | ||
10, 5, | ||
9, 9, | ||
5, 4 | ||
}; | ||
|
||
simple_model model; | ||
model.backend = ggml_backend_rpc_init(argv[1]); | ||
if (!model.backend) { | ||
fprintf(stderr, "%s: ggml_backend_rpc_init() failed\n", __func__); | ||
exit(1); | ||
} | ||
load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B); | ||
|
||
// calculate the temporaly memory required to compute | ||
ggml_gallocr_t allocr = NULL; | ||
|
||
{ | ||
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); | ||
|
||
// create the worst case graph for memory usage estimation | ||
struct ggml_cgraph * gf = build_graph(model); | ||
ggml_gallocr_reserve(allocr, gf); | ||
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); | ||
|
||
fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); | ||
} | ||
|
||
// perform computation | ||
struct ggml_tensor * result = compute(model, allocr); | ||
|
||
// create a array to print result | ||
std::vector<float> out_data(ggml_nelements(result)); | ||
|
||
// bring the data from the backend memory | ||
ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result)); | ||
|
||
// expected result: | ||
// [ 60.00 110.00 54.00 29.00 | ||
// 55.00 90.00 126.00 28.00 | ||
// 50.00 54.00 42.00 64.00 ] | ||
|
||
printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]); | ||
for (int j = 0; j < result->ne[1] /* rows */; j++) { | ||
if (j > 0) { | ||
printf("\n"); | ||
} | ||
|
||
for (int i = 0; i < result->ne[0] /* cols */; i++) { | ||
printf(" %.2f", out_data[i * result->ne[1] + j]); | ||
} | ||
} | ||
printf(" ]\n"); | ||
|
||
// release backend memory used for computation | ||
ggml_gallocr_free(allocr); | ||
|
||
// free memory | ||
ggml_free(model.ctx); | ||
|
||
// release backend memory and free backend | ||
ggml_backend_buffer_free(model.buffer); | ||
ggml_backend_free(model.backend); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#include <cstdio>
#include <iostream>
#include <memory>
#include <string>

#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>

#include "ggml-rpc.h"
|
||
int main(int argc, char * argv[]) | ||
{ | ||
if (argc < 2) { | ||
fprintf(stderr, "Usage: %s <port>\n", argv[0]); | ||
return 1; | ||
} | ||
int port = std::stoi(argv[1]); | ||
std::string server_address = "0.0.0.0:" + std::to_string(port); | ||
BackendImpl service; | ||
|
||
grpc::EnableDefaultHealthCheckService(true); | ||
grpc::reflection::InitProtoReflectionServerBuilderPlugin(); | ||
grpc::ServerBuilder builder; | ||
// Listen on the given address without any authentication mechanism. | ||
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); | ||
builder.RegisterService(&service); | ||
std::unique_ptr<grpc::Server> server(builder.BuildAndStart()); | ||
std::cout << "RPC backend listening on " << server_address << std::endl; | ||
|
||
// Wait for the server to shutdown. Note that some other thread must be | ||
// responsible for shutting down the server for this call to ever return. | ||
server->Wait(); | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.