Skip to content

Commit

Permalink
add LLAMA_SCHED_MAX_COPIES to configure the number of input copies fo…
Browse files Browse the repository at this point in the history
…r pipeline parallelism

default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage
  • Loading branch information
slaren committed Mar 12, 2024
1 parent 00a415d commit 89bfa1f
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL"
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
Expand Down Expand Up @@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)

add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})

# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
MK_CPPFLAGS += -D_BSD_SOURCE
endif

ifdef LLAMA_SCHED_MAX_COPIES
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
endif

ifdef LLAMA_DEBUG
MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g
Expand Down
71 changes: 43 additions & 28 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)

// backend registry

#define GGML_MAX_BACKENDS_REG 16
#define GGML_REG_MAX_BACKENDS 16

struct ggml_backend_reg {
char name[128];
Expand All @@ -396,7 +396,7 @@ struct ggml_backend_reg {
void * user_data;
};

static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
static size_t ggml_backend_registry_count = 0;

GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
Expand Down Expand Up @@ -441,7 +441,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
}

GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);

size_t id = ggml_backend_registry_count;

Expand Down Expand Up @@ -993,16 +993,27 @@ static bool ggml_is_view_op(enum ggml_op op) {

// scheduler

#define GGML_MAX_BACKENDS 16
#define GGML_MAX_SPLITS 256
#define GGML_MAX_SPLIT_INPUTS 16
#define GGML_MAX_COPIES 2
#ifndef GGML_SCHED_MAX_BACKENDS
#define GGML_SCHED_MAX_BACKENDS 16
#endif

#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 256
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
#endif

#ifndef GGML_SCHED_MAX_COPIES
#define GGML_SCHED_MAX_COPIES 4
#endif

struct ggml_backend_sched_split {
int backend_id;
int i_start;
int i_end;
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
int n_inputs;
// graph view of this split
struct ggml_cgraph graph;
Expand All @@ -1014,15 +1025,15 @@ struct ggml_backend_sched {

int n_backends;

ggml_backend_t backends[GGML_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
ggml_gallocr_t galloc;

// hash keys of the nodes in the graph
struct ggml_hash_set hash_set;
// hash values
int * tensor_backend_id;
struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];

int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size]
Expand All @@ -1031,14 +1042,14 @@ struct ggml_backend_sched {
struct ggml_cgraph * graph;

// graph splits
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
int n_splits;

// pipeline parallelism support
int n_copies;
int cur_copy;
ggml_backend_event_t events[GGML_MAX_BACKENDS][GGML_MAX_COPIES];
struct ggml_tensor * graph_inputs[GGML_MAX_SPLIT_INPUTS];
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
int n_graph_inputs;

struct ggml_context * ctx;
Expand All @@ -1047,12 +1058,12 @@ struct ggml_backend_sched {
void * callback_eval_user_data;

// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
#else
#else
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
#endif
char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
};

#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
Expand Down Expand Up @@ -1089,7 +1100,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
}

#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
Expand Down Expand Up @@ -1395,7 +1406,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (tensor_backend_id != cur_backend_id) {
sched->splits[cur_split].i_end = i;
cur_split++;
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
sched->splits[cur_split].backend_id = tensor_backend_id;
sched->splits[cur_split].i_start = i;
sched->splits[cur_split].n_inputs = 0;
Expand Down Expand Up @@ -1433,7 +1444,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
GGML_ASSERT(n_graph_inputs < GGML_MAX_SPLIT_INPUTS);
GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
sched->graph_inputs[n_graph_inputs] = src;
}
}
Expand All @@ -1455,7 +1466,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
Expand Down Expand Up @@ -1507,7 +1518,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

// create copies of the graph for each split
// TODO: avoid this copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
Expand Down Expand Up @@ -1683,23 +1694,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);

sched->n_backends = n_backends;

sched->n_copies = parallel ? GGML_MAX_COPIES : 1;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

GGML_ASSERT(sched->n_copies <= GGML_MAX_COPIES);
GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);

for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
Expand Down Expand Up @@ -1764,7 +1775,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);

ggml_backend_sched_split_graph(sched, graph);

Expand Down Expand Up @@ -1812,6 +1823,10 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
return sched->n_splits;
}

int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
return sched->n_copies;
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
Expand Down
1 change: 1 addition & 0 deletions ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ extern "C" {

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

Expand Down
5 changes: 3 additions & 2 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13005,10 +13005,11 @@ struct llama_context * llama_new_context_with_model(
// currently this is only implemented in the CUDA backend
pipeline_parallel = false;
#endif
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);

if (pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
}
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);

// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
Expand Down

0 comments on commit 89bfa1f

Please sign in to comment.