From dff8cbeb398e290c4ee6bc151dc1145f470fd4ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 10:51:58 +0200 Subject: [PATCH 01/47] convert : support Mixtral as LLAMA arch --- convert.py | 13 ++++++++++++- gguf-py/gguf/constants.py | 14 +++++++++++++- gguf-py/gguf/tensor_mapping.py | 35 ++++++++++++++++++++++++++-------- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/convert.py b/convert.py index 6e95d6cb37e79..368b3d7f709e9 100755 --- a/convert.py +++ b/convert.py @@ -266,12 +266,23 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: # LLaMA v1 n_ctx = 2048 + # print model keys + for k in model.keys(): + print(k) + + # check if MoE + if "layers.0.feed_forward.experts.0.w1.weight" in model: + n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] + n_ctx = 32768 + else: + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + return Params( n_vocab = model["tok_embeddings.weight"].shape[0], n_embd = config["dim"], n_layer = config["n_layers"], n_ctx = n_ctx, - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_ff = n_ff, n_head = (n_head := config["n_heads"]), n_head_kv = config.get("n_kv_heads", n_head), f_norm_eps = config["norm_eps"], diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 685c88f1a3397..59c2d24e21261 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -111,10 +111,14 @@ class MODEL_TENSOR(IntEnum): ATTN_NORM = auto() ATTN_NORM_2 = auto() ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_NORM = auto() FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() - FFN_NORM = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() ATTN_Q_NORM = auto() ATTN_K_NORM = auto() @@ -154,10 +158,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -172,10 +180,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, ], MODEL_ARCH.GPTNEOX: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index cc6236014eb72..18f75cf69eeda 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -149,6 +149,10 @@ class TensorNameMap: "model.layers.{bid}.ln2", # yi ), + MODEL_TENSOR.FFN_GATE_INP: ( + "layers.{bid}.feed_forward.gate", # mixtral + ), + # Feed-forward up MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox @@ -164,11 +168,19 @@ class TensorNameMap: "transformer.h.{bid}.mlp.w1", # qwen ), + MODEL_TENSOR.FFN_UP_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral + ), + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact - 
"layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + ), + + MODEL_TENSOR.FFN_GATE_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral ), # Feed-forward down @@ -185,6 +197,10 @@ class TensorNameMap: "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon ), + MODEL_TENSOR.FFN_DOWN_EXP: ( + "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral + ), + MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", ), @@ -213,11 +229,14 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): for tensor, keys in self.block_mappings_cfg.items(): if tensor not in MODEL_TENSORS[arch]: continue - tensor_name = TENSOR_NAMES[tensor].format(bid = bid) - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - key = key.format(bid = bid) - self.mapping[key] = (tensor, tensor_name) + # TODO: make this configurable + n_experts = 8 + for xid in range(n_experts): + tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) + self.mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + key = key.format(bid = bid, xid = xid) + self.mapping[key] = (tensor, tensor_name) def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) From d38e41ee69c47482e5e625af12bf5bf3a59f5212 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 10:59:37 +0200 Subject: [PATCH 02/47] convert : fix n_ff typo --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 368b3d7f709e9..7cc59a8437e6c 100755 --- a/convert.py +++ b/convert.py @@ -275,7 +275,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] n_ctx = 32768 else: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] return Params( n_vocab = model["tok_embeddings.weight"].shape[0], From a3eefe95a8fec1a400697e9c1aa5b34a9cb2c194 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 11:14:03 +0200 Subject: [PATCH 03/47] llama : model loading --- llama.cpp | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index b12bbd1b05442..741b0f0bdf8e7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -338,10 +338,14 @@ enum llm_tensor { LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_GATE_EXP, + LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, }; @@ -360,10 +364,14 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, 
}, { @@ -585,6 +593,10 @@ struct LLM_TN { std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; } + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix; + } }; // @@ -1268,6 +1280,12 @@ struct llama_layer { struct ggml_tensor * ffn_down; // w2 struct ggml_tensor * ffn_up; // w3 + // ff MoE + struct ggml_tensor * ffn_gate_inp; + struct ggml_tensor * ffn_gate_exp[8]; + struct ggml_tensor * ffn_down_exp[8]; + struct ggml_tensor * ffn_up_exp[8]; + // ff bias struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 @@ -3025,9 +3043,20 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + + if (layer.ffn_gate_inp == nullptr) { + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + } else { + // MoE branch + for (int x = 0; x < 8; ++x) { + layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); + layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + } + } if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -3037,8 +3066,18 @@ static void llm_load_tensors( (layer.bk ? ggml_nbytes(layer.bk) : 0) + (layer.bv ? ggml_nbytes(layer.bv) : 0) + (layer.bo ? 
ggml_nbytes(layer.bo) : 0) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + ggml_nbytes(layer.ffn_norm); + + if (layer.ffn_gate_inp == nullptr) { + vram_weights += + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); + } else { + vram_weights += ggml_nbytes(layer.ffn_gate_inp); + for (int x = 0; x < 8; ++x) { + vram_weights += + ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]); + } + } } } } break; From 861cd678992a937353723fa47b21d5bf613813f5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 11:19:46 +0200 Subject: [PATCH 04/47] ggml : sync latest ggml_mul_mat_id --- ggml-cuda.cu | 65 ++++++++++++++++++++++++-------------- ggml-metal.m | 21 ++++++++---- ggml.c | 61 +++++++++++++++++++++-------------- tests/test-backend-ops.cpp | 34 ++++++++++---------- 4 files changed, 110 insertions(+), 71 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 85f7a293783be..c1c7c30e56230 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,13 +1,15 @@ #include +#include +#include +#include #include #include -#include #include #include #include #include -#include -#include +#include + #if defined(GGML_USE_HIPBLAS) #include @@ -8234,36 +8236,51 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { } #endif -static void ggml_cuda_mul_mat_id(const ggml_tensor * _src0, const ggml_tensor * _src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #if 0 -//#ifdef CUDA_USE_TENSOR_CORES -// const bool use_tensor_cores = true; -//#else -// const bool use_tensor_cores = false; -//#endif - ggml_cuda_mul_mat_id_cublas(dst); - // TODO: mmq/mmv support -#else - const struct ggml_tensor * ids = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const int id = dst->op_params[0]; +#endif - int32_t * ids_dev = (int32_t *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + const struct ggml_tensor * ids = src0; + const int32_t id = dst->op_params[0]; + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - int32_t a_id; - CUDA_CHECK(cudaMemcpyAsync(&a_id, ids_dev + id, sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + std::vector ids_host(ggml_nbytes(ids)); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]); - const struct ggml_tensor * src0 = dst->src[a_id + 2]; + const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; - ggml_cuda_mul_mat(src0, src1, dst); -#endif + ggml_tensor_extra_gpu src1_row_extra; + ggml_tensor_extra_gpu dst_row_extra; + + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + src1_row.ne[1] = 1; + dst_row.ne[1] = 1; + + src1_row.extra = &src1_row_extra; + dst_row.extra = &dst_row_extra; - (void) _src0; - (void) _src1; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + 
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; + dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } } static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml-metal.m b/ggml-metal.m index f9bd69dc84bbe..595bb6c0f5c43 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -177,6 +177,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); } else { char* buffer2 = malloc(len+1); + va_end(args); + va_start(args, format); vsnprintf(buffer2, len+1, format, args); buffer2[len] = 0; ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data); @@ -1193,7 +1195,9 @@ void ggml_metal_graph_compute( const float scale = ((float *) dst->op_params)[0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + if (id_src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; @@ -1511,9 +1515,7 @@ void ggml_metal_graph_compute( case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; @@ -1523,7 +1525,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; [encoder setBytes:&idx length:sizeof(idx) atIndex:15]; @@ -1538,7 +1540,14 @@ void ggml_metal_graph_compute( } [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + + for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) { + [encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } } } break; case GGML_OP_GET_ROWS: diff --git a/ggml.c b/ggml.c index ca56f063c3a87..40c389661d93d 100644 --- a/ggml.c +++ b/ggml.c @@ -4083,7 
+4083,9 @@ struct ggml_tensor * ggml_mul_mat_id( int64_t n_as = ids->ne[0]; GGML_ASSERT(ids->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_vector(ids)); + GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); + GGML_ASSERT(ids->ne[1] == b->ne[1]); + GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2); GGML_ASSERT(id >= 0 && id < n_as); @@ -9519,11 +9521,16 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif +// off1 = offset in i11 and i1 +// cne1 = ne11 and ne1 +// in a normal matrix multiplication, off1 = 0 and cne1 = ne1 +// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1 static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst, + int64_t off1, int64_t cne1) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9591,10 +9598,9 @@ static void ggml_compute_forward_mul_mat( const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { float * const wdata = params->wdata; @@ -9611,10 +9617,10 @@ static void ggml_compute_forward_mul_mat( } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); + cne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); } } @@ -9630,6 +9636,7 @@ static void ggml_compute_forward_mul_mat( const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_TYPE_F32); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -9652,7 +9659,7 @@ static void ggml_compute_forward_mul_mat( const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = ne11*ne12*ne13; // src1 rows + const int64_t nr1 = cne1*ne12*ne13; // src1 rows //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); @@ -9694,9 +9701,9 @@ static void ggml_compute_forward_mul_mat( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*ne11)); - const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; - const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + const int64_t i13 = (ir1/(ne12*cne1)); + const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; + const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; // broadcast src0 into src1 const int64_t i03 = i13/r3; @@ -9736,20 +9743,26 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const struct ggml_tensor * ids = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + // 
during GGML_TASK_INIT the entire src1 is converted to vec_dot_type + ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]); + return; + } + const struct ggml_tensor * ids = src0; const int id = ggml_get_op_params_i32(dst, 0); - const int a_id = ((int32_t *)ids->data)[id]; - - GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); - const struct ggml_tensor * src0 = dst->src[a_id + 2]; - - ggml_compute_forward_mul_mat(params, src0, src1, dst); + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); + } } // ggml_compute_forward_out_prod @@ -14037,11 +14050,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]); } break; case GGML_OP_MUL_MAT_ID: { - ggml_compute_forward_mul_mat_id(params, tensor); + ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e0155ac1c8913..5b1b8cb7c7d48 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -770,11 +770,9 @@ struct test_mul_mat_id : public test_case { const int64_t m; const int64_t n; const int64_t k; - const std::array bs; // dims 3 and 4 - const std::array nr; // repeat in dims 3 and 4 std::string vars() override { - return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr); + return VARS_TO_STR7(type_a, type_b, n_mats, id, m, n, k); } double max_nmse_err() override { @@ -782,7 +780,7 @@ struct test_mul_mat_id : public test_case { } size_t op_size(ggml_tensor * t) override { - size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1]; + size_t a = ggml_nbytes(t->src[2]) * n; size_t b = ggml_nbytes(t->src[1]) * m; size_t c = ggml_nbytes(t); return a + b + c; @@ -792,35 +790,37 @@ struct test_mul_mat_id : public test_case { test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int n_mats = 2, int id = 0, - int64_t m = 32, int64_t n = 32, int64_t k = 32, - std::array bs = {10, 10}, - std::array nr = {2, 2}) + int64_t m = 32, int64_t n = 32, int64_t k = 32) : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), - m(m), n(n), k(k), bs(bs), nr(nr) {} + m(m), n(n), k(k) {} ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) std::vector mats; for (int i = 0; i < n_mats; i++) { - ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); + ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m); mats.push_back(a); } - ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats); - ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); + ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b); return out; } void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = 
ggml_get_next_tensor(ctx, t)) { if (t->type == GGML_TYPE_I32) { // ids - std::vector data(n_mats); - for (int i = 0; i < n_mats; i++) { - data[i] = i; + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); } - std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()())); - ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int)); } else { init_tensor_uniform(t); } @@ -1215,7 +1215,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (int n_mats : {1, 2, 4}) { for (int id = 0; id < n_mats; id++) { - test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256)); } } } From aedfad120aee72b8a294ab46e53563e6e2edfb33 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 11:47:40 +0200 Subject: [PATCH 05/47] llama : update graph to support MoE --- llama.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 741b0f0bdf8e7..e4d1a530a84f4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4223,7 +4223,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - { + if (model.layers[il].ffn_gate_inp == nullptr) { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -4235,6 +4235,51 @@ struct llm_build_context { model.layers[il].ffn_down, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + const int n_experts_per_tok = 2; // TODO: param + + ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1] + weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1] + + // compute expert outputs + ggml_tensor * moe_out; + + for (int i = 0; i < n_experts_per_tok; ++i) { + ggml_tensor * cur_expert; + + // TODO: fix + ggml_tensor ** ffn_up_exp = (ggml_tensor **) model.layers[il].ffn_up_exp; + ggml_tensor ** ffn_gate_exp = (ggml_tensor **) model.layers[il].ffn_gate_exp; + ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; + + cur_expert = ggml_mul(ctx0, + ggml_mul_mat_id(ctx0, ffn_up_exp, selected_experts, i, cur), + ggml_silu(ctx0, + ggml_mul_mat_id(ctx0, ffn_gate_exp, selected_experts, i, cur))); // [n_tokens, n_embd] + + cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cur_expert = ggml_mul(ctx0, cur, + ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + cur = 
moe_out; } cur = ggml_add(ctx0, cur, ffn_inp); From af1a096bf87e7b17f9ea53fae8ad45a5e3caf09f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 12:07:39 +0200 Subject: [PATCH 06/47] llama : fix cur -> cur_expert --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e4d1a530a84f4..3320c781f0dbc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4269,7 +4269,7 @@ struct llm_build_context { ggml_mul_mat_id(ctx0, ffn_gate_exp, selected_experts, i, cur))); // [n_tokens, n_embd] cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cur_expert = ggml_mul(ctx0, cur, + cur_expert = ggml_mul(ctx0, cur_expert, ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); if (i == 0) { From 7ea36953ba278484c0aa5f5e6df210ce6a24aad0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 12:45:15 +0200 Subject: [PATCH 07/47] llama : first working version --- ggml.c | 9 +++++++-- ggml.h | 2 +- llama.cpp | 8 ++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 40c389661d93d..322d0c850af51 100644 --- a/ggml.c +++ b/ggml.c @@ -4105,7 +4105,9 @@ struct ggml_tensor * ggml_mul_mat_id( result->src[0] = ids; result->src[1] = b; - for (int64_t i = 0; i < n_as; i++) { + // TODO: n_as is the selected experts, but it should be the total number of experts + //for (int64_t i = 0; i < n_as; i++) { + for (int64_t i = 0; i < 8; i++) { struct ggml_tensor * a = as[i]; GGML_ASSERT(ggml_are_same_shape(as[0], a)); GGML_ASSERT(ggml_can_mul_mat(a, b)); @@ -9758,7 +9760,10 @@ static void ggml_compute_forward_mul_mat_id( for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + + // TODO: this assert seems wrong? 
+ //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id); + //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); const struct ggml_tensor * src0_row = dst->src[row_id + 2]; ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); diff --git a/ggml.h b/ggml.h index a8f10cbd5c1d8..e0cb3b99bf853 100644 --- a/ggml.h +++ b/ggml.h @@ -217,7 +217,7 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 1024 #define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_SRC 6 +#define GGML_MAX_SRC 10 #define GGML_MAX_NAME 64 #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 diff --git a/llama.cpp b/llama.cpp index 3320c781f0dbc..6333af4aa2b37 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4242,14 +4242,18 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - const int n_experts_per_tok = 2; // TODO: param + // TODO: param + const int n_experts = 8; + const int n_experts_per_tok = 2; ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] // select experts ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] - ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1] + //ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1] + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1] // compute expert outputs From 8b185b703020e09ab9cac8c56832d93aa240e4d9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 13:01:42 +0200 Subject: [PATCH 08/47] llama : fix expert weighting in the FFN --- llama.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6333af4aa2b37..3c4da6a1c3f60 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4250,11 +4250,13 @@ struct llm_build_context { ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] - //ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [n_tokens, num_experts_per_tok, 1] - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); - weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok, 1] + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + ggml_tensor * weights = + ggml_reshape_2d(ctx0, + ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts), + n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] + weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok] // compute expert outputs ggml_tensor * moe_out; From 7372b6227183e37f7aa8737188f71a728c716244 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 13:18:58 +0200 Subject: [PATCH 09/47] ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only) --- ggml.c | 17 +++++++++-------- ggml.h | 1 + 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index 
322d0c850af51..1c81e79126a62 100644 --- a/ggml.c +++ b/ggml.c @@ -4735,7 +4735,8 @@ struct ggml_tensor * ggml_get_rows( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(ggml_is_matrix(b) && b->type == GGML_TYPE_I32); bool is_node = false; @@ -4745,7 +4746,7 @@ struct ggml_tensor * ggml_get_rows( // TODO: implement non F32 return //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1]); result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -10348,8 +10349,8 @@ static void ggml_compute_forward_get_rows_q( const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); + assert( dst->ne[0] == nc); + assert(ggml_nrows(dst) == nr); assert(src0->nb[0] == ggml_type_size(type)); for (int i = 0; i < nr; ++i) { @@ -10375,8 +10376,8 @@ static void ggml_compute_forward_get_rows_f16( const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); + assert( dst->ne[0] == nc); + assert(ggml_nrows(dst) == nr); assert(src0->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < nr; ++i) { @@ -10403,8 +10404,8 @@ static void ggml_compute_forward_get_rows_f32( const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); + assert( dst->ne[0] == nc); + assert(ggml_nrows(dst) == nr); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < nr; ++i) { diff --git a/ggml.h b/ggml.h index e0cb3b99bf853..b154b6dae5184 100644 --- a/ggml.h +++ b/ggml.h @@ -1263,6 +1263,7 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // supports 3D: a->ne[2] == b->ne[1] GGML_API struct ggml_tensor * ggml_get_rows( struct ggml_context * ctx, struct ggml_tensor * a, From ee8fb399aa0cecaf6c0e6ce56d89184fd166191f Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 12:42:25 +0100 Subject: [PATCH 10/47] ggml : add n_as argument to ggml_mul_mat_id --- ggml-cuda.cu | 4 +++- ggml-metal.m | 2 +- ggml.c | 14 ++++++-------- ggml.h | 1 + llama.cpp | 6 +++--- tests/test-backend-ops.cpp | 4 +++- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c1c7c30e56230..04a5d2078941b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8244,6 +8244,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const struct ggml_tensor * ids = src0; const int32_t id = dst->op_params[0]; + const int32_t n_as = dst->op_params[1]; + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; std::vector ids_host(ggml_nbytes(ids)); @@ -8272,7 +8274,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + GGML_ASSERT(row_id >= 0 && row_id < n_as); const struct ggml_tensor * src0_row = dst->src[row_id + 2]; diff --git a/ggml-metal.m b/ggml-metal.m index 595bb6c0f5c43..8389373a85583 100644 --- a/ggml-metal.m +++ 
b/ggml-metal.m @@ -1460,7 +1460,7 @@ void ggml_metal_graph_compute( GGML_ASSERT(src0t == GGML_TYPE_I32); - const int n_as = ne00; + const int n_as = ((int32_t *) dst->op_params)[1]; // TODO: make this more general GGML_ASSERT(n_as <= 8); diff --git a/ggml.c b/ggml.c index 1c81e79126a62..9982c2dade94e 100644 --- a/ggml.c +++ b/ggml.c @@ -4076,12 +4076,11 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, struct ggml_tensor * as[], + int n_as, struct ggml_tensor * ids, int id, struct ggml_tensor * b) { - int64_t n_as = ids->ne[0]; - GGML_ASSERT(ids->type == GGML_TYPE_I32); GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); GGML_ASSERT(ids->ne[1] == b->ne[1]); @@ -4099,6 +4098,7 @@ struct ggml_tensor * ggml_mul_mat_id( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); ggml_set_op_params_i32(result, 0, id); + ggml_set_op_params_i32(result, 1, n_as); result->op = GGML_OP_MUL_MAT_ID; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4106,8 +4106,7 @@ struct ggml_tensor * ggml_mul_mat_id( result->src[1] = b; // TODO: n_as is the selected experts, but it should be the total number of experts - //for (int64_t i = 0; i < n_as; i++) { - for (int64_t i = 0; i < 8; i++) { + for (int i = 0; i < n_as; i++) { struct ggml_tensor * a = as[i]; GGML_ASSERT(ggml_are_same_shape(as[0], a)); GGML_ASSERT(ggml_can_mul_mat(a, b)); @@ -9757,14 +9756,13 @@ static void ggml_compute_forward_mul_mat_id( } const struct ggml_tensor * ids = src0; - const int id = ggml_get_op_params_i32(dst, 0); + const int id = ggml_get_op_params_i32(dst, 0); + const int n_as = ggml_get_op_params_i32(dst, 1); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); - // TODO: this assert seems wrong? 
- //printf("row_id = %d, ids->ne[0] = %d, id = %d\n", row_id, ids->ne[0], id); - //GGML_ASSERT(row_id >= 0 && row_id < ids->ne[0]); + GGML_ASSERT(row_id >= 0 && row_id < n_as); const struct ggml_tensor * src0_row = dst->src[row_id + 2]; ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); diff --git a/ggml.h b/ggml.h index b154b6dae5184..bb09160b91fd6 100644 --- a/ggml.h +++ b/ggml.h @@ -1052,6 +1052,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, struct ggml_tensor * as[], + int n_as, struct ggml_tensor * ids, int id, struct ggml_tensor * b); diff --git a/llama.cpp b/llama.cpp index 3c4da6a1c3f60..3b2a6797971dd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4270,11 +4270,11 @@ struct llm_build_context { ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; cur_expert = ggml_mul(ctx0, - ggml_mul_mat_id(ctx0, ffn_up_exp, selected_experts, i, cur), + ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur), ggml_silu(ctx0, - ggml_mul_mat_id(ctx0, ffn_gate_exp, selected_experts, i, cur))); // [n_tokens, n_embd] + ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur))); // [n_tokens, n_embd] - cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] cur_expert = ggml_mul(ctx0, cur_expert, ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5b1b8cb7c7d48..dddc2b899138a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -343,6 +343,8 @@ struct test_case { ud->ok = false; } return true; + + GGML_UNUSED(index); }; ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); @@ -803,7 +805,7 @@ struct test_mul_mat_id : public test_case { } ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); - ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b); + ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, id, b); return out; } From 9064b1ca051c6ebff04a3ad9b77d9f8d309396a6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 14:04:54 +0200 Subject: [PATCH 11/47] ggml : fix ggml_get_rows to take into account ne02 / ne11 --- ggml.c | 63 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/ggml.c b/ggml.c index 9982c2dade94e..4bdb702480bc9 100644 --- a/ggml.c +++ b/ggml.c @@ -10342,20 +10342,27 @@ static void ggml_compute_forward_get_rows_q( return; } - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - assert( dst->ne[0] == nc); + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(type)); assert(ggml_nrows(dst) == nr); - assert(src0->nb[0] == ggml_type_size(type)); - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; + // TODO: multi-thread + for (int64_t i = 0; i < nr; ++i) { + const int64_t r = ((int32_t *) src1->data)[i]; + + const int64_t i02 = i/ne10; dequantize_row_q( - (const void *) ((char *) src0->data + r*src0->nb[1]), + (const 
void *) ((char *) src0->data + i02*nb02 + r*nb01), (float *) ((char *) dst->data + i*dst->nb[1]), nc); } } @@ -10371,19 +10378,25 @@ static void ggml_compute_forward_get_rows_f16( return; } - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); - assert( dst->ne[0] == nc); + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(ggml_fp16_t)); assert(ggml_nrows(dst) == nr); - assert(src0->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; + // TODO: multi-thread + for (int64_t i = 0; i < nr; ++i) { + const int64_t r = ((int32_t *) src1->data)[i]; + + const int64_t i02 = i/ne10; for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i02*nb02 + r*nb01))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); } } } @@ -10399,19 +10412,25 @@ static void ggml_compute_forward_get_rows_f32( return; } - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); - assert( dst->ne[0] == nc); + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(float)); assert(ggml_nrows(dst) == nr); - assert(src0->nb[0] == sizeof(float)); - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; + // TODO: multi-thread + for (int64_t i = 0; i < nr; ++i) { + const int64_t r = ((int32_t *) src1->data)[i]; + + const int64_t i02 = i/ne10; ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i*dst->nb[1]), - (float *) ((char *) src0->data + r*src0->nb[1])); + (float *) ((char *) src0->data + i02*nb02 + r*nb01)); } } From 2cbcba829f9c7e80a77473c0eadc7d14d3287681 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 14:18:42 +0200 Subject: [PATCH 12/47] metal : add more general support for ggml_get_rows + tests --- ggml-metal.m | 16 +++++----- ggml-metal.metal | 62 ++++++++++++++++++++++++++++++++++---- ggml.c | 6 ++-- tests/test-backend-ops.cpp | 19 ++++++------ 4 files changed, 78 insertions(+), 25 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8389373a85583..28c628958691b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -805,8 +805,9 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: case GGML_OP_CONCAT: case GGML_OP_ADD: case GGML_OP_MUL: @@ -828,7 +829,6 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { case GGML_OP_MUL_MAT_ID: return true; case GGML_OP_DIAG_MASK_INF: - case GGML_OP_GET_ROWS: { return op->ne[0] % 4 == 0; } @@ -1568,16 +1568,18 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false && "not implemented"); } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; [encoder setBytes:&nb01 
length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:7]; const int64_t n = ggml_nelements(src1); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_RMS_NORM: { diff --git a/ggml-metal.metal b/ggml-metal.metal index 2f8ea22d66226..6723200c77724 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3223,14 +3223,16 @@ kernel void kernel_get_rows( device float * dst, constant int64_t & ne00, constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, constant uint64_t & nb1, uint tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], - uint tptg[[threads_per_threadgroup]]) { - const int i = tgpig; - const int r = ((device int32_t *) src1)[i]; + uint tptg [[threads_per_threadgroup]]) { + const int64_t i = tgpig; + const int64_t r = ((device int32_t *) src1)[i]; - for (int ind = tiitg; ind < ne00/16; ind += tptg) { + for (int64_t ind = tiitg; ind < ne00/16; ind += tptg) { float4x4 temp; dequantize_func( ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); @@ -3238,6 +3240,52 @@ kernel void kernel_get_rows( } } +kernel void kernel_get_rows_f32( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg [[threads_per_threadgroup]]) { + const int64_t i = tgpig; + const int64_t r = ((device int32_t *) src1)[i]; + + const int64_t i02 = i/ne10; + + for (int ind = tiitg; ind < ne00; ind += tptg) { + ((device float *) ((device char *) dst + i*nb1))[ind] = + ((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + } +} + +kernel void kernel_get_rows_f16( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg [[threads_per_threadgroup]]) { + const int64_t i = tgpig; + const int64_t r = ((device int32_t *) src1)[i]; + + const int64_t i02 = i/ne10; + + for (int ind = tiitg; ind < ne00; ind += tptg) { + ((device float *) ((device char *) dst + i*nb1))[ind] = + ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + } +} + #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B #define BLOCK_SIZE_K 32 @@ -3490,11 +3538,13 @@ typedef void (get_rows_t)( device float * dst, constant int64_t & ne00, constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, constant uint64_t & nb1, uint, uint, uint); -template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; -template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; +//template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; +//template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t 
kernel_get_rows; template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows; diff --git a/ggml.c b/ggml.c index 4bdb702480bc9..5f94ede0067cc 100644 --- a/ggml.c +++ b/ggml.c @@ -10363,7 +10363,7 @@ static void ggml_compute_forward_get_rows_q( dequantize_row_q( (const void *) ((char *) src0->data + i02*nb02 + r*nb01), - (float *) ((char *) dst->data + i*dst->nb[1]), nc); + (float *) ((char *) dst->data + i*nb1), nc); } } @@ -10396,7 +10396,7 @@ static void ggml_compute_forward_get_rows_f16( for (int j = 0; j < nc; ++j) { ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i02*nb02 + r*nb01))[j]; - ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + ((float *) ((char *) dst->data + i*nb1))[j] = GGML_FP16_TO_FP32(v); } } } @@ -10429,7 +10429,7 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i02 = i/ne10; ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i*dst->nb[1]), + (float *) ((char *) dst->data + i*nb1), (float *) ((char *) src0->data + i02*nb02 + r*nb01)); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index dddc2b899138a..c98ca45e0eeeb 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -488,17 +488,18 @@ struct test_get_rows : public test_case { const int n; // cols const int m; // rows const int r; // rows to get + const int b; // batch size std::string vars() override { return VARS_TO_STR4(type, n, m, r); } - test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3) - : type(type), n(n), m(m), r(r) {} + test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1) + : type(type), n(n), m(m), r(r), b(b) {} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * in = ggml_new_tensor_2d(ctx, type, n, m); - ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, r); + ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b); + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); ggml_tensor * out = ggml_get_rows(ctx, in, rows); return out; } @@ -507,11 +508,11 @@ struct test_get_rows : public test_case { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (t->type == GGML_TYPE_I32) { // rows - std::vector data(r); - for (int i = 0; i < r; i++) { + std::vector data(r*b); + for (int i = 0; i < r*b; i++) { data[i] = rand() % m; } - ggml_backend_tensor_set(t, data.data(), 0, r * sizeof(int)); + ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int)); } else { init_tensor_uniform(t); } @@ -1125,8 +1126,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { - test_cases.emplace_back(new test_get_rows(type, 10, 5, 3)); - test_cases.emplace_back(new test_get_rows(type, 16, 5, 3)); + test_cases.emplace_back(new test_get_rows(type, 10, 5, 3, 7)); + test_cases.emplace_back(new test_get_rows(type, 16, 5, 3, 7)); } test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); From 06dfde3e946d45178b7b242adf9621058b0e3439 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 13:21:09 +0100 Subject: [PATCH 13/47] llama : add basic support for offloading moe with CUDA --- ggml-cuda.cu | 33 ++++++++++++++++++++++++--------- ggml.c | 1 - llama.cpp | 46 
+++++++++++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 19 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 04a5d2078941b..ba771870e41ae 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8242,15 +8242,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s // TODO: mmq/mmv support #endif - const struct ggml_tensor * ids = src0; - const int32_t id = dst->op_params[0]; - const int32_t n_as = dst->op_params[1]; + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + const struct ggml_tensor * ids = src0; + const int32_t id = ((int32_t *) dst->op_params)[0]; + const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + if (ids->backend == GGML_BACKEND_GPU) { + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + } else { + memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); + } const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; @@ -8264,7 +8270,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.ne[1] = 1; dst_row.ne[1] = 1; - src1_row.extra = &src1_row_extra; + if (src1->backend == GGML_BACKEND_GPU) { + src1_row.extra = &src1_row_extra; + } dst_row.extra = &dst_row_extra; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -8278,7 +8286,12 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; + if (src1->backend == GGML_BACKEND_GPU) { + src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; + } else { + src1_row.data = (char *) src1->data + i01*src1->nb[1]; + } + dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); @@ -8694,7 +8707,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_repeat; break; case GGML_OP_GET_ROWS: - func = ggml_cuda_get_rows; + if (ggml_is_contiguous(tensor->src[1])) { + func = ggml_cuda_get_rows; + } break; case GGML_OP_DUP: func = ggml_cuda_dup; diff --git a/ggml.c b/ggml.c index 5f94ede0067cc..07d23f4275b54 100644 --- a/ggml.c +++ b/ggml.c @@ -4105,7 +4105,6 @@ struct ggml_tensor * ggml_mul_mat_id( result->src[0] = ids; result->src[1] = b; - // TODO: n_as is the selected experts, but it should be the total number of experts for (int i = 0; i < n_as; i++) { struct ggml_tensor * a = as[i]; GGML_ASSERT(ggml_are_same_shape(as[0], a)); diff --git a/llama.cpp b/llama.cpp index 3b2a6797971dd..c14aab71f308a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4247,16 +4247,25 @@ struct llm_build_context { const int n_experts_per_tok = 2; ggml_tensor * 
logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] + cb(logits, "ffn_moe_logits", il); + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + cb(probs, "ffn_moe_probs", il); // select experts ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] - ggml_tensor * weights = - ggml_reshape_2d(ctx0, - ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts), + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); + cb(weights, "ffn_moe_weights", il); + + weights = ggml_reshape_2d(ctx0, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] - weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + cb(weights, "ffn_moe_weights_norm", il); // compute expert outputs ggml_tensor * moe_out; @@ -4269,19 +4278,30 @@ struct llm_build_context { ggml_tensor ** ffn_gate_exp = (ggml_tensor **) model.layers[il].ffn_gate_exp; ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; - cur_expert = ggml_mul(ctx0, - ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur), - ggml_silu(ctx0, - ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur))); // [n_tokens, n_embd] + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur); + cb(cur_up, "ffn_up", il); + + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur); + cb(cur_gate, "ffn_gate", il); + + cur_gate = ggml_silu(ctx0, cur_gate); + cb(cur_gate, "ffn_silu", il); + + cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] + cb(cur_expert, "ffn_gate_par", il); cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cb(cur_expert, "ffn_down", il); + cur_expert = ggml_mul(ctx0, cur_expert, ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + cb(cur_expert, "ffn_moe_weighted", il); if (i == 0) { moe_out = cur_expert; } else { moe_out = ggml_add(ctx0, moe_out, cur_expert); + cb(moe_out, "ffn_moe_out", il); } } @@ -5540,6 +5560,14 @@ static const std::unordered_map k_offload_map { "ffn_relu", OFFLOAD_FUNC }, { "ffn_sqr(relu)", OFFLOAD_FUNC }, + { "ffn_moe_logits", OFFLOAD_FUNC }, + { "ffn_moe_probs", OFFLOAD_FUNC }, + { "ffn_moe_weights", OFFLOAD_FUNC_NOP }, + { "ffn_moe_weights_sum", OFFLOAD_FUNC }, + { "ffn_moe_weights_norm", OFFLOAD_FUNC }, + { "ffn_moe_weighted", OFFLOAD_FUNC }, + { "ffn_moe_out", OFFLOAD_FUNC }, + { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, From 7e2006b0c08e1a47d9032693b32676ee7527030d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 14:24:58 +0200 Subject: [PATCH 14/47] metal : add/mul/div use general kernel when src1 not cont --- ggml-metal.m | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 28c628958691b..00dc4b0e1813a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1003,14 +1003,13 @@ void ggml_metal_graph_compute( case GGML_OP_MUL: case GGML_OP_DIV: { - GGML_ASSERT(ggml_is_contiguous(src0)); - 
GGML_ASSERT(ggml_is_contiguous(src1)); - bool bcast_row = false; int64_t nb = ne00; - if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + // src1 is a row GGML_ASSERT(ne11 == 1); From 8c5b66eeaae396878efc86a605a5b15063ea5d69 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Dec 2023 15:30:34 +0200 Subject: [PATCH 15/47] metal : reduce the kernel launches for ggml_mul_mat_id --- ggml-metal.m | 50 +++++++++++++++++++++++++++++------------------- ggml-metal.metal | 26 ++++++++++++++++++------- 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 00dc4b0e1813a..a84b88dda782c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1495,6 +1495,9 @@ void ggml_metal_graph_compute( const int idx = ((int32_t *) dst->op_params)[0]; + // batch size + GGML_ASSERT(ne01 == ne11); + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && @@ -1515,19 +1518,25 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory - [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:3]; - [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:4]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:5]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:6]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:13]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:14]; - [encoder setBytes:&idx length:sizeof(idx) atIndex:15]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; // TODO: how to make this an array? 
read Metal docs for (int j = 0; j < n_as; ++j) { struct ggml_tensor * src_cur = dst->src[2 + j]; @@ -1535,18 +1544,19 @@ void ggml_metal_graph_compute( size_t offs_src_cur = 0; id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); - [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j]; + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; } [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) { - [encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2]; + [encoder dispatchThreadgroups:MTLSizeMake( (1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + //[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + //for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) { + // [encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0]; + // [encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1]; + // [encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } + //} } } break; case GGML_OP_GET_ROWS: diff --git a/ggml-metal.metal b/ggml-metal.metal index 6723200c77724..f25e813c20caf 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3474,19 +3474,22 @@ kernel void kernel_mul_mm(device const uchar * src0, template kernel void kernel_mul_mm_id( - device const int32_t * ids, + device const uchar * ids, device const uchar * src1, - device float * dst, + device uchar * dst, + constant int64_t & nbi1, constant int64_t & ne00, constant int64_t & ne02, constant int64_t & nb01, constant int64_t & nb02, constant int64_t & ne12, + constant int64_t & ne13, constant int64_t & nb10, constant int64_t & nb11, constant int64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant int64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -3504,10 +3507,16 @@ kernel void kernel_mul_mm_id( uint sgitg[[simdgroup_index_in_threadgroup]]) { device const uchar * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + kernel_mul_mm_impl( - src0[ids[idx]], - src1, - dst, + src0[id], + src1 + bid*nb11, + (device float *) (dst + bid*nb1), ne00, ne02, nb01, @@ -3589,19 +3598,22 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; typedef void (mat_mm_id_t)( - device const int32_t * ids, + device const uchar * ids, device const uchar * src1, - device float * dst, + device uchar * dst, + constant int64_t & nbi1, constant int64_t & ne00, constant int64_t & ne02, constant int64_t & nb01, constant int64_t & nb02, constant int64_t & ne12, + constant int64_t & ne13, constant int64_t & nb10, constant int64_t & nb11, constant int64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, + constant int64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, From ac3f7d8e23f1b4785de6e9d2c40d499d2ca94518 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 19:19:03 +0100 Subject: [PATCH 16/47] ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D --- ggml.c | 25 
++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 07d23f4275b54..fb6ba1fc3bf83 100644 --- a/ggml.c +++ b/ggml.c @@ -4734,7 +4734,8 @@ struct ggml_tensor * ggml_get_rows( struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(ggml_is_matrix(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(b->ne[3] == 1); + GGML_ASSERT(b->type == GGML_TYPE_I32); bool is_node = false; @@ -4744,7 +4745,7 @@ struct ggml_tensor * ggml_get_rows( // TODO: implement non F32 return //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1]); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -10414,7 +10415,6 @@ static void ggml_compute_forward_get_rows_f32( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); assert(ne0 == nc); assert(ne02 == ne11); @@ -10422,14 +10422,17 @@ static void ggml_compute_forward_get_rows_f32( assert(ggml_nrows(dst) == nr); // TODO: multi-thread - for (int64_t i = 0; i < nr; ++i) { - const int64_t r = ((int32_t *) src1->data)[i]; - - const int64_t i02 = i/ne10; - - ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i*nb1), - (float *) ((char *) src0->data + i02*nb02 + r*nb01)); + // TODO: same impl for get_rows_q and get_rows_f16 + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), + (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); + } + } } } From 2e4db48291760fa87c343e006d555bf4b42e965e Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 22:38:22 +0100 Subject: [PATCH 17/47] ggml : update get_rows f16 and q --- Makefile | 5 +++++ ggml.c | 39 +++++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index a1a6cae5474fd..1b141a626fb3c 100644 --- a/Makefile +++ b/Makefile @@ -396,6 +396,11 @@ ifdef LLAMA_CUBLAS MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math + +ifdef LLAMA_DEBUG + NVCCFLAGS += -lineinfo +endif + ifdef LLAMA_CUDA_NVCC NVCC = $(LLAMA_CUDA_NVCC) else diff --git a/ggml.c b/ggml.c index fb6ba1fc3bf83..696d8d26ebd99 100644 --- a/ggml.c +++ b/ggml.c @@ -4086,7 +4086,7 @@ struct ggml_tensor * ggml_mul_mat_id( GGML_ASSERT(ids->ne[1] == b->ne[1]); GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2); - GGML_ASSERT(id >= 0 && id < n_as); + GGML_ASSERT(id >= 0 && id < ids->ne[0]); bool is_node = false; @@ -10345,7 +10345,7 @@ static void ggml_compute_forward_get_rows_q( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); + const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; @@ -10356,14 +10356,16 @@ static 
void ggml_compute_forward_get_rows_q( assert(ggml_nrows(dst) == nr); // TODO: multi-thread - for (int64_t i = 0; i < nr; ++i) { - const int64_t r = ((int32_t *) src1->data)[i]; - - const int64_t i02 = i/ne10; + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - dequantize_row_q( - (const void *) ((char *) src0->data + i02*nb02 + r*nb01), - (float *) ((char *) dst->data + i*nb1), nc); + dequantize_row_q( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } + } } } @@ -10381,7 +10383,7 @@ static void ggml_compute_forward_get_rows_f16( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; - const int64_t nr = ggml_nelements(src1); + const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); assert(ne0 == nc); assert(ne02 == ne11); @@ -10389,14 +10391,15 @@ static void ggml_compute_forward_get_rows_f16( assert(ggml_nrows(dst) == nr); // TODO: multi-thread - for (int64_t i = 0; i < nr; ++i) { - const int64_t r = ((int32_t *) src1->data)[i]; - - const int64_t i02 = i/ne10; + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i02*nb02 + r*nb01))[j]; - ((float *) ((char *) dst->data + i*nb1))[j] = GGML_FP16_TO_FP32(v); + ggml_fp16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } } } } @@ -10415,6 +10418,7 @@ static void ggml_compute_forward_get_rows_f32( GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr); assert(ne0 == nc); assert(ne02 == ne11); @@ -10422,7 +10426,6 @@ static void ggml_compute_forward_get_rows_f32( assert(ggml_nrows(dst) == nr); // TODO: multi-thread - // TODO: same impl for get_rows_q and get_rows_f16 for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { for (int64_t i10 = 0; i10 < ne10; ++i10) { From 62b95f93d018f610711b496715a57cb67b59ce06 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 22:39:34 +0100 Subject: [PATCH 18/47] cuda : support non-contiguous src1 in get_rows --- ggml-cuda.cu | 134 ++++++++++++++++++++++++------------- llama.cpp | 26 ++++--- tests/test-backend-ops.cpp | 61 ++++++++++------- 3 files changed, 142 insertions(+), 79 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ba771870e41ae..f890d228976f9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1686,31 +1686,39 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest } template -static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) { - const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int row = blockDim.y*blockIdx.y + threadIdx.y; - - if (col >= ncols) { +static __global__ void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + 
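// ----------------------------------------------------------------------------
// Illustrative reference, not part of the patch: the generalized get_rows above
// (CPU) and below (CUDA) copies one src0 row per index in src1 and is batched
// over the second and third dims of the index tensor. The same semantics for
// contiguous f32 data, with hypothetical names (get_rows_ref); the real
// implementations additionally honor the nb* byte strides so that src1 views
// with gaps (non-contiguous indices) keep working.
#include <cstdint>

// src0 : [ne00, ne01, nr1, nr2]  rows of length ne00
// rows : [nr0,  nr1,  nr2]       int32 indices into src0's second dimension
// dst  : [ne00, nr0,  nr1, nr2]
static void get_rows_ref(const float * src0, int64_t ne00, int64_t ne01,
                         const int32_t * rows, int64_t nr0, int64_t nr1, int64_t nr2,
                         float * dst) {
    for (int64_t i2 = 0; i2 < nr2; ++i2) {
        for (int64_t i1 = 0; i1 < nr1; ++i1) {
            for (int64_t i0 = 0; i0 < nr0; ++i0) {
                const int64_t r = rows[(i2*nr1 + i1)*nr0 + i0];                 // row to fetch
                const float * src_row = src0 + ((i2*nr1 + i1)*ne01 + r)*ne00;   // selected src0 row
                float       * dst_row = dst  + ((i2*nr1 + i1)*nr0 + i0)*ne00;   // destination row
                for (int64_t c = 0; c < ne00; ++c) {
                    dst_row[c] = src_row[c];
                }
            }
        }
    }
}
// ----------------------------------------------------------------------------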
size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { return; } - const int r = y[row]; + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - // copy x[r*ncols + col] to dst[row*ncols + col] - const int xi = r*ncols + col; - const int di = row*ncols + col; + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - const int ib = xi/qk; // block index - const int iqs = (xi%qk)/qr; // quant index - const int iybs = di - di%qk; // y block start index + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index const int y_offset = qr == 1 ? 1 : qk/2; // dequantize dfloat2 v; - dequantize_kernel(x, ib, iqs, v); + dequantize_kernel(src0_row, ib, iqs, v); - dst[iybs + iqs + 0] = v.x; - dst[iybs + iqs + y_offset] = v.y; + dst_row[iybs + iqs + 0] = v.x; + dst_row[iybs + iqs + y_offset] = v.y; } template @@ -5055,11 +5063,35 @@ static __global__ void im2col_f32_f16( } template -static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { +static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(block_num_x, nrows, 1); - k_get_rows<<>>(x, y, dst, ncols); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + k_get_rows<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + (void) dst; } template @@ -5071,7 +5103,6 @@ struct bin_bcast_cuda { GGML_TENSOR_BINARY_OP_LOCALS - int nr0 = ne10/ne0; int nr1 = ne11/ne1; int nr2 = ne12/ne2; @@ -5119,26 +5150,28 @@ struct bin_bcast_cuda { int64_t ne12 = cne1[2]; int64_t ne13 = cne1[3]; - //size_t nb0 = cnb0[0]; + size_t nb0 = cnb0[0]; size_t nb1 = cnb0[1]; size_t nb2 = cnb0[2]; size_t nb3 = cnb0[3]; - //size_t nb10 = cnb1[0]; + size_t nb10 = cnb1[0]; size_t nb11 = cnb1[1]; size_t nb12 = cnb1[2]; size_t nb13 = cnb1[3]; - //size_t s0 = nb0 / sizeof(src1_t); + size_t s0 = nb0 / sizeof(src1_t); size_t s1 = nb1 / sizeof(src1_t); size_t s2 = nb2 / sizeof(src1_t); size_t s3 = nb3 / sizeof(src1_t); - //size_t s10 = nb10 / sizeof(src1_t); + size_t s10 = nb10 / sizeof(src1_t); size_t s11 = nb11 / sizeof(src1_t); size_t s12 = nb12 / sizeof(src1_t); size_t s13 = nb13 / sizeof(src1_t); + 
GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); const int block_size = 128; @@ -6449,36 +6482,34 @@ static void ggml_cuda_op_get_rows( GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(dst)); - const int ncols = src0->ne[0]; - const int nrows = ggml_nelements(src1); + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); const int32_t * src1_i32 = (const int32_t *) src1_d; switch (src0->type) { case GGML_TYPE_F16: - get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda<1, 1, convert_f16>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_F32: - get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda<1, 1, convert_f32>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q4_0: - get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q4_1: - get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q5_0: - get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q5_1: - get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q8_0: - get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; default: // TODO: k-quants @@ -8286,11 +8317,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - if (src1->backend == GGML_BACKEND_GPU) { - src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; - } else { - src1_row.data = (char *) src1->data + i01*src1->nb[1]; - } + src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; @@ -8707,9 +8735,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_repeat; break; case GGML_OP_GET_ROWS: - if (ggml_is_contiguous(tensor->src[1])) { - func = ggml_cuda_get_rows; - } + func = ggml_cuda_get_rows; break; case GGML_OP_DUP: func = ggml_cuda_dup; @@ -9215,6 +9241,21 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten } return true; } break; + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -9222,7 +9263,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten case GGML_OP_TRANSPOSE: case GGML_OP_NORM: case GGML_OP_REPEAT: - case GGML_OP_GET_ROWS: case GGML_OP_DUP: case GGML_OP_ADD: case 
GGML_OP_MUL: @@ -9298,7 +9338,9 @@ static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * use UNUSED(params); } -extern "C" int ggml_backend_cuda_reg_devices() { +extern "C" int ggml_backend_cuda_reg_devices(); + +int ggml_backend_cuda_reg_devices() { int device_count = ggml_cuda_get_device_count(); //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization for (int i = 0; i < device_count; i++) { diff --git a/llama.cpp b/llama.cpp index c14aab71f308a..ee6e28226d52d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4254,12 +4254,13 @@ struct llm_build_context { // select experts ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); + ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); cb(weights, "ffn_moe_weights", il); - weights = ggml_reshape_2d(ctx0, weights, - n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] + weights = ggml_reshape_2d(ctx0, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); cb(weights_sum, "ffn_moe_weights_sum", il); @@ -4268,7 +4269,7 @@ struct llm_build_context { cb(weights, "ffn_moe_weights_norm", il); // compute expert outputs - ggml_tensor * moe_out; + ggml_tensor * moe_out = nullptr; for (int i = 0; i < n_experts_per_tok; ++i) { ggml_tensor * cur_expert; @@ -4279,19 +4280,19 @@ struct llm_build_context { ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur); - cb(cur_up, "ffn_up", il); + cb(cur_up, "ffn_moe_up", il); ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur); - cb(cur_gate, "ffn_gate", il); + cb(cur_gate, "ffn_moe_gate", il); cur_gate = ggml_silu(ctx0, cur_gate); - cb(cur_gate, "ffn_silu", il); + cb(cur_gate, "ffn_moe_silu", il); cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] - cb(cur_expert, "ffn_gate_par", il); + cb(cur_expert, "ffn_moe_gate_par", il); cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_down", il); + cb(cur_expert, "ffn_moe_down", il); cur_expert = ggml_mul(ctx0, cur_expert, ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); @@ -5562,10 +5563,15 @@ static const std::unordered_map k_offload_map { "ffn_moe_logits", OFFLOAD_FUNC }, { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC_NOP }, + { "ffn_moe_argsort", OFFLOAD_FUNC }, + { "ffn_moe_weights", OFFLOAD_FUNC }, { "ffn_moe_weights_sum", OFFLOAD_FUNC }, { "ffn_moe_weights_norm", OFFLOAD_FUNC }, { "ffn_moe_weighted", OFFLOAD_FUNC }, + { "ffn_moe_up", OFFLOAD_FUNC }, + { "ffn_moe_gate", OFFLOAD_FUNC }, + { "ffn_moe_gate_par", OFFLOAD_FUNC }, + { "ffn_moe_down", OFFLOAD_FUNC }, { "ffn_moe_out", OFFLOAD_FUNC }, { "l_out", OFFLOAD_FUNC }, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c98ca45e0eeeb..7b958eb89ebf4 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -489,17 +489,21 @@ struct test_get_rows : public test_case { const int m; // rows const int r; // rows to get const int b; // batch size + const bool v; // view (non-contiguous src1) 
std::string vars() override { - return VARS_TO_STR4(type, n, m, r); + return VARS_TO_STR6(type, n, m, r, b, v); } - test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1) - : type(type), n(n), m(m), r(r), b(b) {} + test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) + : type(type), n(n), m(m), r(r), b(b), v(v) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b); ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + if (v) { + rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); + } ggml_tensor * out = ggml_get_rows(ctx, in, rows); return out; } @@ -507,6 +511,7 @@ struct test_get_rows : public test_case { void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } // rows std::vector data(r*b); for (int i = 0; i < r*b; i++) { @@ -773,9 +778,10 @@ struct test_mul_mat_id : public test_case { const int64_t m; const int64_t n; const int64_t k; + const bool v; // view (non-contiguous ids) std::string vars() override { - return VARS_TO_STR7(type_a, type_b, n_mats, id, m, n, k); + return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v); } double max_nmse_err() override { @@ -793,9 +799,9 @@ struct test_mul_mat_id : public test_case { test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, int n_mats = 2, int id = 0, - int64_t m = 32, int64_t n = 32, int64_t k = 32) + int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false) : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), - m(m), n(n), k(k) {} + m(m), n(n), k(k), v(v) {} ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) @@ -805,8 +811,11 @@ struct test_mul_mat_id : public test_case { mats.push_back(a); } ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); + if (v) { + ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0); + } ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); - ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, id, b); + ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? 
id/2 : id, b); return out; } @@ -815,11 +824,12 @@ struct test_mul_mat_id : public test_case { std::default_random_engine rng(rd()); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } // ids for (int64_t r = 0; r < ggml_nrows(t); r++) { std::vector data(t->ne[0]); for (int i = 0; i < t->ne[0]; i++) { - data[i] = i; + data[i] = i % n_mats; } std::shuffle(data.begin(), data.end(), rng); ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); @@ -1120,14 +1130,27 @@ enum test_mode { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { std::vector> test_cases; + const ggml_type all_types[] = { + GGML_TYPE_F32, GGML_TYPE_F16, + GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, + GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, + GGML_TYPE_Q8_0, + GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, + GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K + }; + // unary ops for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { test_cases.emplace_back(new test_unary((ggml_unary_op) op)); } - for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { - test_cases.emplace_back(new test_get_rows(type, 10, 5, 3, 7)); - test_cases.emplace_back(new test_get_rows(type, 16, 5, 3, 7)); + for (ggml_type type : all_types) { + for (int b : {1, 7}) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v)); + } + } } test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); @@ -1183,16 +1206,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); } - const ggml_type all_types[] = { - GGML_TYPE_F32, GGML_TYPE_F16, - GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, - GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, - GGML_TYPE_Q8_0, - GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K - }; - for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { // FIXME: CPU crashes on f16xf16 @@ -1216,9 +1229,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { - for (int n_mats : {1, 2, 4}) { + for (int n_mats : {2, 4, 8}) { for (int id = 0; id < n_mats; id++) { - test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256)); + for (bool v : {false, true}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v)); + } } } } From 0710b0f726f6f42ac6abf7cdcf8dd78b3260e222 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Dec 2023 23:29:47 +0100 Subject: [PATCH 19/47] llama : offload missing ffn_moe_silu --- llama.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llama.cpp b/llama.cpp index ee6e28226d52d..4ac46193caf7c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5570,6 +5570,7 @@ static const std::unordered_map k_offload_map { "ffn_moe_weighted", OFFLOAD_FUNC }, { "ffn_moe_up", OFFLOAD_FUNC }, { "ffn_moe_gate", OFFLOAD_FUNC }, + { "ffn_moe_silu", OFFLOAD_FUNC }, { "ffn_moe_gate_par", OFFLOAD_FUNC }, { "ffn_moe_down", OFFLOAD_FUNC }, { "ffn_moe_out", OFFLOAD_FUNC }, From 016f9bb55a215b0ad5494c4fb43d86a9a6634afa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 09:38:21 +0200 Subject: [PATCH 20/47] metal : fix ggml_get_rows to work with non-cont src1 --- ggml-metal.m | 9 
+++--- ggml-metal.metal | 75 +++++++++++++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index a84b88dda782c..58d6e567f50a2 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1584,11 +1584,12 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5]; [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:10]; - const int64_t n = ggml_nelements(src1); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_RMS_NORM: { diff --git a/ggml-metal.metal b/ggml-metal.metal index f25e813c20caf..6026316e5060f 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3219,69 +3219,89 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg template kernel void kernel_get_rows( device const void * src0, - device const int * src1, + device const char * src1, device float * dst, constant int64_t & ne00, constant uint64_t & nb01, constant uint64_t & nb02, constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, constant uint64_t & nb1, - uint tgpig[[threadgroup_position_in_grid]], + constant uint64_t & nb2, + uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], - uint tptg [[threads_per_threadgroup]]) { - const int64_t i = tgpig; - const int64_t r = ((device int32_t *) src1)[i]; + uint3 tptg [[threads_per_threadgroup]]) { + //const int64_t i = tgpig; + //const int64_t r = ((device int32_t *) src1)[i]; + + const int64_t i10 = tgpig.x; + const int64_t i11 = tgpig.y; - for (int64_t ind = tiitg; ind < ne00/16; ind += tptg) { + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; + + const int64_t i02 = i11; + + for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) { float4x4 temp; dequantize_func( - ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); - *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; + ((device const block_q *) ((device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp); + *(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp; } } kernel void kernel_get_rows_f32( device const void * src0, - device const int * src1, + device const char * src1, device float * dst, constant int64_t & ne00, constant uint64_t & nb01, constant uint64_t & nb02, constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, constant uint64_t & nb1, - uint tgpig[[threadgroup_position_in_grid]], + constant uint64_t & nb2, + uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], - uint tptg [[threads_per_threadgroup]]) { - const int64_t i = tgpig; - const int64_t r = ((device int32_t *) src1)[i]; + uint3 tptg [[threads_per_threadgroup]]) { + const int64_t i10 = tgpig.x; + const int64_t i11 = tgpig.y; + + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; - const int64_t i02 = i/ne10; + 
const int64_t i02 = i11; - for (int ind = tiitg; ind < ne00; ind += tptg) { - ((device float *) ((device char *) dst + i*nb1))[ind] = + for (int ind = tiitg; ind < ne00; ind += tptg.x) { + ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = ((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; } } kernel void kernel_get_rows_f16( device const void * src0, - device const int * src1, + device const char * src1, device float * dst, constant int64_t & ne00, constant uint64_t & nb01, constant uint64_t & nb02, constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, constant uint64_t & nb1, - uint tgpig[[threadgroup_position_in_grid]], + constant uint64_t & nb2, + uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], - uint tptg [[threads_per_threadgroup]]) { - const int64_t i = tgpig; - const int64_t r = ((device int32_t *) src1)[i]; + uint3 tptg [[threads_per_threadgroup]]) { + const int64_t i10 = tgpig.x; + const int64_t i11 = tgpig.y; - const int64_t i02 = i/ne10; + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; - for (int ind = tiitg; ind < ne00; ind += tptg) { - ((device float *) ((device char *) dst + i*nb1))[ind] = + const int64_t i02 = i11; + + for (int ind = tiitg; ind < ne00; ind += tptg.x) { + ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; } } @@ -3543,14 +3563,17 @@ kernel void kernel_mul_mm_id( typedef void (get_rows_t)( device const void * src0, - device const int * src1, + device const char * src1, device float * dst, constant int64_t & ne00, constant uint64_t & nb01, constant uint64_t & nb02, constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, constant uint64_t & nb1, - uint, uint, uint); + constant uint64_t & nb2, + uint3, uint, uint3); //template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; //template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; From 6cfb31f9ea6daff306ac45bfffae046619e76f86 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 10:59:13 +0200 Subject: [PATCH 21/47] metal : add indirect mat-vec kernels for all quantization types --- ggml-metal.m | 210 ++++++++- ggml-metal.metal | 1127 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 1255 insertions(+), 82 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 58d6e567f50a2..75929bc460e0d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -102,6 +102,21 @@ GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32); + //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16); + GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32); + //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row); + //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4); + GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); @@ 
-354,6 +369,21 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32); + //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16); + GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32); + //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row); + //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4); + GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); @@ -454,6 +484,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32); + //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16); + GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32); + //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row); + //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4); + GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); @@ -1491,17 +1536,22 @@ void ggml_metal_graph_compute( // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = 0; + int ne11_mm_min = 1; const int idx = ((int32_t *) dst->op_params)[0]; // batch size GGML_ASSERT(ne01 == ne11); + const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne11 > ne11_mm_min) { + // !!! + // TODO: for now, always use mat-vec kernels until we figure out how to improve the + // indirect matrix multiplication + // !!! 
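// ----------------------------------------------------------------------------
// Illustrative reference, not part of the patch: every mul_mv_id kernel added
// above computes the ggml_mul_mat_id contraction, i.e. a matrix-vector product
// where the matrix is picked per src1 row out of the n_as expert matrices,
// using one entry of the ids tensor. For contiguous f32 data, with hypothetical
// names (mul_mat_id_ref):
#include <cstdint>

// as  : n_as matrices, each storing m rows of length k
// ids : n_ids expert indices per column of b (the graph passes 'id' to pick one)
// b   : n columns of length k
// dst : n columns of length m
static void mul_mat_id_ref(const float * const * as, int64_t k, int64_t m,
                           const int32_t * ids, int64_t n_ids, int id,
                           const float * b, int64_t n,
                           float * dst) {
    for (int64_t i = 0; i < n; ++i) {
        const int32_t e = ids[i*n_ids + id];          // expert chosen for column i
        const float * a = as[e];
        for (int64_t j = 0; j < m; ++j) {
            float sum = 0.0f;
            for (int64_t c = 0; c < k; ++c) {
                sum += a[j*k + c] * b[i*k + c];
            }
            dst[i*m + j] = sum;
        }
    }
}
// ----------------------------------------------------------------------------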
+ if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) { switch (src2->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; @@ -1517,7 +1567,6 @@ void ggml_metal_graph_compute( case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } - const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1549,14 +1598,153 @@ void ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - //[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - //for (int64_t i01 = 0; i01 < src0->ne[1]; i01++) { - // [encoder setBuffer:id_src0 offset:offs_src0 + i01*nb01 atIndex:0]; - // [encoder setBuffer:id_src1 offset:offs_src1 + i01*nb11 atIndex:1]; - // [encoder setBuffer:id_dst offset:offs_dst + i01*nb1 atIndex:2]; + // TODO: processing one row at a time (ne11 -> 1) is not efficient + [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + // use custom matrix x vector kernel + switch (src2t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32]; + nrows = 4; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + nth0 = 32; + nth1 = 1; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32]; + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32]; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32]; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32]; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32]; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32]; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32]; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32]; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32]; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32]; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32]; + } break; + default: + { + 
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_ASSERT(false && "not implemented"); + } + }; + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; + [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6]; + [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8]; + [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; + [encoder setBytes:&r2 length:sizeof(r2) atIndex:20]; + [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; + [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; + // TODO: how to make this an array? read Metal docs + for (int j = 0; j < n_as; ++j) { + struct ggml_tensor * src_cur = dst->src[2 + j]; + + size_t offs_src_cur = 0; + id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); + + [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; + } - //} + if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || + src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || + src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q3_K) { +#ifdef GGML_QKK_64 + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#else + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; +#endif + } + else if (src2t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src2t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (_ne1 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } } } break; case GGML_OP_GET_ROWS: diff --git a/ggml-metal.metal b/ggml-metal.metal index 6026316e5060f..067c5779d757b 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -731,7 +731,7 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre // giard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. 
template -void mul_vec_q_n_f32( +void mul_vec_q_n_f32_impl( device const void * src0, device const float * src1, device float * dst, @@ -813,7 +813,7 @@ kernel void kernel_mul_mv_q4_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -832,7 +832,7 @@ kernel void kernel_mul_mv_q4_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -851,7 +851,7 @@ kernel void kernel_mul_mv_q5_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -870,28 +870,28 @@ kernel void kernel_mul_mv_q5_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); } #define NB_Q8_0 8 -kernel void kernel_mul_mv_q8_0_f32( +void kernel_mul_mv_q8_0_f32_impl( device const void * src0, device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nr = N_DST; const int nsg = N_SIMDGROUP; const int nw = N_SIMDWIDTH; @@ -945,9 +945,29 @@ kernel void kernel_mul_mv_q8_0_f32( } } +[[host_name("kernel_mul_mv_q8_0_f32")]] +kernel void kernel_mul_mv_q8_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); +} + #define N_F32_F32 4 -kernel void 
kernel_mul_mv_f32_f32( +void kernel_mul_mv_f32_f32_impl( device const char * src0, device const char * src1, device float * dst, @@ -965,8 +985,8 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1025,6 +1045,32 @@ kernel void kernel_mul_mv_f32_f32( } } +[[host_name("kernel_mul_mv_f32_f32")]] +kernel void kernel_mul_mv_f32_f32( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); +} + #define N_F16_F16 4 kernel void kernel_mul_mv_f16_f16( @@ -1105,7 +1151,7 @@ kernel void kernel_mul_mv_f16_f16( } } -kernel void kernel_mul_mv_f16_f32_1row( +void kernel_mul_mv_f16_f32_1row_impl( device const char * src0, device const char * src1, device float * dst, @@ -1123,8 +1169,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1161,12 +1207,37 @@ kernel void kernel_mul_mv_f16_f32_1row( dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; } } +} +[[host_name("kernel_mul_mv_f16_f32_1row")]] +kernel void kernel_mul_mv_f16_f32_1row( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); } #define N_F16_F32 4 -kernel void kernel_mul_mv_f16_f32( +void kernel_mul_mv_f16_f32_impl( device const char * src0, device const char * src1, device float * dst, @@ -1184,8 +1255,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1244,6 +1315,32 @@ kernel void kernel_mul_mv_f16_f32( } } 
+[[host_name("kernel_mul_mv_f16_f32")]] +kernel void kernel_mul_mv_f16_f32( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); +} + // Assumes row size (ne00) is a multiple of 4 kernel void kernel_mul_mv_f16_f32_l4( device const char * src0, @@ -2064,19 +2161,19 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { //====================================== dot products ========================= -kernel void kernel_mul_mv_q2_K_f32( +void kernel_mul_mv_q2_K_f32_impl( device const void * src0, device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2214,8 +2311,8 @@ kernel void kernel_mul_mv_q2_K_f32( } } -#if QK_K == 256 -kernel void kernel_mul_mv_q3_K_f32( +[[host_name("kernel_mul_mv_q2_K_f32")]] +kernel void kernel_mul_mv_q2_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -2229,8 +2326,29 @@ kernel void kernel_mul_mv_q3_K_f32( constant uint & r2 [[buffer(17)]], constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + +#if QK_K == 256 +void kernel_mul_mv_q3_K_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; @@ -2373,19 +2491,19 @@ kernel void kernel_mul_mv_q3_K_f32( } } #else -kernel void kernel_mul_mv_q3_K_f32( +void kernel_mul_mv_q3_K_f32_impl( device const void * src0, device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & 
ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2450,20 +2568,41 @@ kernel void kernel_mul_mv_q3_K_f32( } #endif +[[host_name("kernel_mul_mv_q3_K_f32")]] +kernel void kernel_mul_mv_q3_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + #if QK_K == 256 -kernel void kernel_mul_mv_q4_K_f32( +void kernel_mul_mv_q4_K_f32_impl( device const void * src0, device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01 [[buffer(4)]], - constant int64_t & ne02 [[buffer(5)]], - constant int64_t & ne10 [[buffer(9)]], - constant int64_t & ne12 [[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2564,19 +2703,19 @@ kernel void kernel_mul_mv_q4_K_f32( } } #else -kernel void kernel_mul_mv_q4_K_f32( +void kernel_mul_mv_q4_K_f32_impl( device const void * src0, device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2660,7 +2799,8 @@ kernel void kernel_mul_mv_q4_K_f32( } #endif -kernel void kernel_mul_mv_q5_K_f32( +[[host_name("kernel_mul_mv_q4_K_f32")]] +kernel void kernel_mul_mv_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -2677,6 +2817,26 @@ kernel void kernel_mul_mv_q5_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint 
sgitg[[simdgroup_index_in_threadgroup]]) { + kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + +void kernel_mul_mv_q5_K_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; @@ -2836,10 +2996,10 @@ kernel void kernel_mul_mv_q5_K_f32( dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } - } -kernel void kernel_mul_mv_q6_K_f32( +[[host_name("kernel_mul_mv_q5_K_f32")]] +kernel void kernel_mul_mv_q5_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -2853,8 +3013,28 @@ kernel void kernel_mul_mv_q6_K_f32( constant uint & r2 [[buffer(17)]], constant uint & r3 [[buffer(18)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + +void kernel_mul_mv_q6_K_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const uint8_t kmask1 = 0x03; const uint8_t kmask2 = 0x0C; @@ -2945,7 +3125,28 @@ kernel void kernel_mul_mv_q6_K_f32( } } -//============================= templates and their specializations ============================= +[[host_name("kernel_mul_mv_q6_K_f32")]] +kernel void kernel_mul_mv_q6_K_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & r2 [[buffer(17)]], + constant uint & r3 [[buffer(18)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + +//============================= templates and their specializations ============================= // NOTE: this is not dequantizing - we are simply fitting the template template @@ -3561,6 +3762,10 @@ kernel void kernel_mul_mm_id( #define QK_NL 4 #endif +// +// get rows +// + typedef void (get_rows_t)( device const void * src0, device const char * src1, @@ -3588,6 +3793,10 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; template 
[[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; +// +// matrix-matrix multiplication +// + typedef void (mat_mm_t)( device const uchar * src0, device const uchar * src1, @@ -3620,6 +3829,10 @@ template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; +// +// indirect matrix-matrix multiplication +// + typedef void (mat_mm_id_t)( device const uchar * ids, device const uchar * src1, @@ -3663,3 +3876,775 @@ template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; + +// +// matrix-vector multiplication +// + +[[host_name("kernel_mul_mv_id_f32_f32")]] +kernel void kernel_mul_mv_id_f32_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_f32_f32_impl( + src0[id], + src1 + bid*nb11, + (device float *) (dst + bid*nb1), + ne00, + ne01, + ne02, + nb00, + nb01, + nb02, + ne10, + ne11, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg); +} + +[[host_name("kernel_mul_mv_id_f16_f32")]] +kernel void kernel_mul_mv_id_f16_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint 
tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_f16_f32_impl( + src0[id], + src1 + bid*nb11, + (device float *) (dst + bid*nb1), + ne00, + ne01, + ne02, + nb00, + nb01, + nb02, + ne10, + ne11, + ne12, + nb10, + nb11, + nb12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg); +} + +[[host_name("kernel_mul_mv_id_q8_0_f32")]] +kernel void kernel_mul_mv_id_q8_0_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q8_0_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q4_0_f32")]] +kernel void kernel_mul_mv_id_q4_0_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + mul_vec_q_n_f32_impl( + src0[id], + (device const float *) (src1 + 
bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q4_1_f32")]] +kernel void kernel_mul_mv_id_q4_1_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + mul_vec_q_n_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q5_0_f32")]] +kernel void kernel_mul_mv_id_q5_0_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + mul_vec_q_n_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q5_1_f32")]] +kernel void kernel_mul_mv_id_q5_1_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant 
uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + mul_vec_q_n_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q2_K_f32")]] +kernel void kernel_mul_mv_id_q2_K_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q2_K_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q3_K_f32")]] +kernel void kernel_mul_mv_id_q3_K_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const 
char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q3_K_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q4_K_f32")]] +kernel void kernel_mul_mv_id_q4_K_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q4_K_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q5_K_f32")]] +kernel void kernel_mul_mv_id_q5_K_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const 
char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q5_K_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_q6_K_f32")]] +kernel void kernel_mul_mv_id_q6_K_f32( + device const char * ids, + device const char * src1, + device uchar * dst, + constant int64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_q6_K_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + (device float *) ( dst + bid*nb1), + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} From d1259b7b35f5c29154645344e781a8e894b7a4fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 13:00:13 +0200 Subject: [PATCH 22/47] llama : do not quantize expert gating tensors --- llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama.cpp b/llama.cpp index 4ac46193caf7c..0a5f755ca569d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8443,6 +8443,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= !params->only_copy; + // do not quantize expert gating tensors + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + enum ggml_type new_type; void * new_data; size_t new_size; From e640cbe05551650975291589cdf059066afab873 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 13:57:54 +0200 Subject: [PATCH 23/47] llama : add n_expert and n_expert_used to hparams + change quants --- convert.py | 51 ++++++++++++------ ggml.c | 2 +- ggml.h | 2 +- gguf-py/gguf/constants.py | 2 + gguf-py/gguf/gguf_writer.py | 6 +++ llama.cpp | 100 ++++++++++++++++++++++++------------ 6 files changed, 110 insertions(+), 53 deletions(-) diff --git a/convert.py b/convert.py index 7cc59a8437e6c..d36f25946e9cf 100755 --- a/convert.py +++ b/convert.py @@ -151,14 +151,16 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: @dataclass class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - 
f_norm_eps: float + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + n_experts: int | None = None + n_experts_used: int | None = None + f_norm_eps: float | None = None rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None @@ -255,6 +257,9 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) + n_experts = None + n_experts_used = None + # hack to determine LLaMA v1 vs v2 vs CodeLlama if config.get("rope_theta") == 1000000: # CodeLlama @@ -262,20 +267,20 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: elif config["norm_eps"] == 1e-05: # LLaMA v2 n_ctx = 4096 + elif config["moe"]: + # Mixtral + n_ctx = 32768 else: # LLaMA v1 n_ctx = 2048 - # print model keys - for k in model.keys(): - print(k) + if "layers.0.feed_forward.w1.weight" in model: + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - # check if MoE - if "layers.0.feed_forward.experts.0.w1.weight" in model: + if config.get("moe"): n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_ctx = 32768 - else: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] + n_experts = config["moe"]["num_experts"] + n_experts_used = config["moe"]["num_experts_per_tok"] return Params( n_vocab = model["tok_embeddings.weight"].shape[0], @@ -285,6 +290,8 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_ff = n_ff, n_head = (n_head := config["n_heads"]), n_head_kv = config.get("n_kv_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, f_norm_eps = config["norm_eps"], f_rope_freq_base = config.get("rope_theta"), ) @@ -843,7 +850,17 @@ def add_meta_arch(self, params: Params) -> None: self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_head_count (params.n_head) self.gguf.add_head_count_kv (params.n_head_kv) - self.gguf.add_layer_norm_rms_eps (params.f_norm_eps) + + if params.n_experts: + self.gguf.add_expert_count(params.n_experts) + + if params.n_experts_used: + self.gguf.add_expert_used_count(params.n_experts_used) + + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) diff --git a/ggml.c b/ggml.c index 696d8d26ebd99..6f5493096bbc0 100644 --- a/ggml.c +++ b/ggml.c @@ -4075,7 +4075,7 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, - struct ggml_tensor * as[], + struct ggml_tensor * const as[], int n_as, struct ggml_tensor * ids, int id, diff --git a/ggml.h b/ggml.h index bb09160b91fd6..849a7e7ad604c 100644 --- a/ggml.h +++ b/ggml.h @@ -1051,7 +1051,7 @@ extern "C" { // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) GGML_API struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, - struct ggml_tensor * as[], + struct ggml_tensor * const as[], int n_as, struct ggml_tensor * ids, int id, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 59c2d24e21261..12133882be2c4 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -38,6 +38,8 @@ class LLM: FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + 
EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index b8ec977c8f3fa..73e02160750b2 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -339,6 +339,12 @@ def add_max_alibi_bias(self, bias: float) -> None: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) + + def add_expert_used_count(self, count: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count) + def add_layer_norm_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) diff --git a/llama.cpp b/llama.cpp index 0a5f755ca569d..e2a01902e8920 100644 --- a/llama.cpp +++ b/llama.cpp @@ -91,7 +91,8 @@ #define LLAMA_ATTRIBUTE_FORMAT(...) #endif -#define LLAMA_MAX_NODES 8192 +#define LLAMA_MAX_NODES 8192 +#define LLAMA_MAX_EXPERTS 8 // // logging @@ -231,6 +232,8 @@ enum llm_kv { LLM_KV_FEED_FORWARD_LENGTH, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, + LLM_KV_EXPERT_COUNT, + LLM_KV_EXPERT_USED_COUNT, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -281,6 +284,8 @@ static std::map LLM_KV_NAMES = { { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, + { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -1176,6 +1181,8 @@ struct llama_hparams { uint32_t n_layer; uint32_t n_rot; uint32_t n_ff; + uint32_t n_expert = 0; + uint32_t n_expert_used = 0; float f_norm_eps; float f_norm_rms_eps; @@ -1190,15 +1197,18 @@ struct llama_hparams { float f_max_alibi_bias; bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + if (this->n_expert != other.n_expert) return true; + if (this->n_expert_used != other.n_expert_used) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; @@ -1282,9 +1292,9 @@ struct llama_layer { // ff MoE struct ggml_tensor * ffn_gate_inp; - struct ggml_tensor * ffn_gate_exp[8]; - struct ggml_tensor * ffn_down_exp[8]; - struct ggml_tensor * 
ffn_up_exp[8]; + struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS]; + struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS]; + struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS]; // ff bias struct ggml_tensor * ffn_down_b; // b2 @@ -2458,6 +2468,16 @@ static void llm_load_hparams( ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff); ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head); ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + + GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); + GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); + if (hparams.n_expert > 0) { + GGML_ASSERT(hparams.n_expert_used > 0); + } else { + GGML_ASSERT(hparams.n_expert_used == 0); + } // n_head_kv is optional, default to n_head hparams.n_head_kv = hparams.n_head; @@ -2889,6 +2909,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); + LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -3046,10 +3068,16 @@ static void llm_load_tensors( layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); if (layer.ffn_gate_inp == nullptr) { + GGML_ASSERT(hparams.n_expert == 0); + GGML_ASSERT(hparams.n_expert_used == 0); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); } else { + GGML_ASSERT(hparams.n_expert > 0); + GGML_ASSERT(hparams.n_expert_used > 0); + // MoE branch for (int x = 0; x < 8; ++x) { layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); @@ -3073,7 +3101,7 @@ static void llm_load_tensors( ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } else { vram_weights += ggml_nbytes(layer.ffn_gate_inp); - for (int x = 0; x < 8; ++x) { + for (uint32_t x = 0; x < hparams.n_expert; ++x) { vram_weights += ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]); } @@ -4058,6 +4086,8 @@ struct llm_build_context { const int64_t n_head_kv; const int64_t n_embd_head; const int64_t n_embd_gqa; + const int64_t n_expert; + const int64_t n_expert_used; const float freq_base; const float freq_scale; @@ -4099,6 +4129,8 @@ struct llm_build_context { n_head_kv (hparams.n_head_kv), n_embd_head (hparams.n_embd_head()), n_embd_gqa (hparams.n_embd_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), @@ -4242,10 +4274,6 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, 
"ffn_norm", il); - // TODO: param - const int n_experts = 8; - const int n_experts_per_tok = 2; - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] cb(logits, "ffn_moe_logits", il); @@ -4253,14 +4281,14 @@ struct llm_build_context { cb(probs, "ffn_moe_probs", il); // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] cb(selected_experts->src[0], "ffn_moe_argsort", il); ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_experts, n_tokens), selected_experts); + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); cb(weights, "ffn_moe_weights", il); - weights = ggml_reshape_2d(ctx0, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); cb(weights_sum, "ffn_moe_weights_sum", il); @@ -4271,18 +4299,13 @@ struct llm_build_context { // compute expert outputs ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_experts_per_tok; ++i) { + for (int i = 0; i < n_expert_used; ++i) { ggml_tensor * cur_expert; - // TODO: fix - ggml_tensor ** ffn_up_exp = (ggml_tensor **) model.layers[il].ffn_up_exp; - ggml_tensor ** ffn_gate_exp = (ggml_tensor **) model.layers[il].ffn_gate_exp; - ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers[il].ffn_down_exp; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, ffn_up_exp, n_experts, selected_experts, i, cur); + ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur); cb(cur_up, "ffn_moe_up", il); - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur); + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur); cb(cur_gate, "ffn_moe_gate", il); cur_gate = ggml_silu(ctx0, cur_gate); @@ -4291,7 +4314,7 @@ struct llm_build_context { cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] cb(cur_expert, "ffn_moe_gate_par", il); - cur_expert = ggml_mul_mat_id(ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd] cb(cur_expert, "ffn_moe_down", il); cur_expert = ggml_mul(ctx0, cur_expert, @@ -8192,11 +8215,9 @@ static void llama_convert_tensor_internal( workers.clear(); } -static ggml_type get_k_quant_type( - quantize_state_internal & qs, - ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype -) { +static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); + // TODO: avoid hardcoded tensor names - use the TN_* constants const llm_arch arch = qs.model.arch; const auto tn = LLM_TN(arch); @@ -8230,7 +8251,18 @@ static ggml_type get_k_quant_type( // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; } + if (qs.model.hparams.n_expert == 8) { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + // TODO: 
explore better strategies + new_type = GGML_TYPE_Q8_0; + } ++qs.i_attention_wv; + } else if (name.find("attn_k.weight") != std::string::npos) { + if (qs.model.hparams.n_expert == 8) { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + // TODO: explore better strategies + new_type = GGML_TYPE_Q8_0; + } } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { From cefebb3660f66458b6c94331c8a872a4211513fb Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 10 Dec 2023 13:11:39 +0100 Subject: [PATCH 24/47] test-backend-ops : add moe test --- tests/test-backend-ops.cpp | 128 +++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7b958eb89ebf4..2f7ea4edfe4c2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -51,7 +51,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } - if (tensor->type == GGML_TYPE_F32) { + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); @@ -233,6 +233,10 @@ static bool ggml_is_view_op(enum ggml_op op) { struct test_case { virtual ~test_case() {} + virtual std::string op_desc(ggml_tensor * t) { + return ggml_op_desc(t); + } + virtual std::string vars() { return ""; } @@ -240,7 +244,7 @@ struct test_case { virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; virtual double max_nmse_err() { - return 1e-6; + return 1e-7; } virtual void initialize_tensors(ggml_context * ctx) { @@ -270,13 +274,13 @@ struct test_case { ggml_tensor * out = build_graph(ctx); - if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) { - //printf(" %s: skipping\n", ggml_op_desc(out)); + if (op_name != nullptr && op_desc(out) != op_name) { + //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; } - printf(" %s(%s): ", ggml_op_desc(out), vars().c_str()); + printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); fflush(stdout); // check if backends support op @@ -317,7 +321,7 @@ struct test_case { for (size_t i = 0; i < f1.size(); i++) { // check for nans if (std::isnan(f1[i]) || std::isnan(f2[i])) { - printf("NaN at index %zu ", i); + printf("[%s] NaN at index %zu ", ggml_op_desc(t1), i); ud->ok = false; return true; } @@ -325,21 +329,32 @@ struct test_case { if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) { if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) { if (std::signbit(f1[i]) != std::signbit(f2[i])) { - printf("inf sign mismatch: %f %f ", f1[i], f2[i]); + printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]); ud->ok = false; return true; } } else { - printf("inf mismatch: %f %f ", f1[i], f2[i]); + printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]); ud->ok = false; return true; } } } + //if (t1->op == GGML_OP_SOFT_MAX) { + // printf("[%s] ", ggml_op_desc(t1)); + // for (int i = 0; i < f1.size(); i++) { + // printf("(%x, %x) ", *(uint32_t*)&f1[i], *(uint32_t*)&f2[i]); + // } + // printf("\n"); + //} double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { - printf("NMSE = %f ", err); + printf("[%s] NMSE = %f ", ggml_op_desc(t1), err); + //for (int i = 0; i < f1.size(); 
i++) { + // printf("(%f, %f) ", f1[i], f2[i]); + //} + //printf("\n"); ud->ok = false; } return true; @@ -374,13 +389,13 @@ struct test_case { ggml_tensor * out = build_graph(ctx); - if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) { - //printf(" %s: skipping\n", ggml_op_desc(out)); + if (op_name != nullptr && op_desc(out) != op_name) { + //printf(" %s: skipping\n", op_desc(out).c_str()); ggml_free(ctx); return true; } - int len = printf(" %s(%s): ", ggml_op_desc(out), vars().c_str()); + int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); fflush(stdout); // check if backends support op @@ -1122,6 +1137,91 @@ struct test_sum_rows : public test_case { } }; +struct test_moe : public test_case { + const int n_experts = 8; + const int n_experts_per_tok = 2; + const int n_tokens = 1; + const int n_embd = 4096; + const int n_ff = 14336; + + std::string op_desc(ggml_tensor * t) override { + return "MOE"; + GGML_UNUSED(t); + } + + std::string vars() override { + return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff); + } + + test_moe() { + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts); + + std::vector ffn_up_exp(n_experts); + std::vector ffn_gate_exp(n_experts); + std::vector ffn_down_exp(n_experts); + + for (int i = 0; i < n_experts; ++i) { + ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + } + + ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + + ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_tokens, num_experts] + ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_tokens, num_experts] + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights = ggml_get_rows(ctx, + ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts); + printf("get rows args %ld %ld %ld %ld, %ld %ld %ld %ld\n", + weights->src[0]->ne[0], weights->src[0]->ne[1], weights->src[0]->ne[2], weights->src[0]->ne[3], + weights->src[1]->ne[0], weights->src[1]->ne[1], weights->src[1]->ne[2], weights->src[1]->ne[3]); + + + weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] + + ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); + + weights = ggml_div(ctx, weights, weights_sum); // [n_tokens, num_experts_per_tok] + + // compute expert outputs + ggml_tensor * moe_out = nullptr; + + for (int i = 0; i < n_experts_per_tok; ++i) { + ggml_tensor * cur_expert; + + ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur); + + ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur); + + cur_gate = ggml_silu(ctx, cur_gate); + + cur_expert = ggml_mul(ctx, cur_up, cur_gate); // [n_tokens, n_embd] + + cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] + + cur_expert = ggml_mul(ctx, cur_expert, + ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx, moe_out, cur_expert); + } + } + + cur = moe_out; + + return cur; + } +}; + enum test_mode { 
MODE_TEST, MODE_PERF, @@ -1140,11 +1240,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K }; + test_cases.emplace_back(new test_moe()); + // unary ops for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { test_cases.emplace_back(new test_unary((ggml_unary_op) op)); } + test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false)); for (ggml_type type : all_types) { for (int b : {1, 7}) { for (bool v : {false, true}) { @@ -1265,6 +1368,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_concat()); for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) { + test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); } From 8614aa736d745699ef5308cb291251ce7278cbc5 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 10 Dec 2023 13:12:11 +0100 Subject: [PATCH 25/47] cuda : fix get_rows when ncols is odd --- ggml-cuda.cu | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f890d228976f9..5a5b4e8ba3434 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1721,6 +1721,32 @@ static __global__ void k_get_rows( dst_row[iybs + iqs + y_offset] = v.y; } +template +static __global__ void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = blockIdx.x*blockDim.x + threadIdx.x; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; @@ -5083,6 +5109,8 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg const size_t s12 = nb12 / ggml_element_size(src1); //const size_t s13 = nb13 / ggml_element_size(src1); + GGML_ASSERT(ne00 % 2 == 0); + k_get_rows<<>>( src0_dd, src1_dd, dst_dd, ne00, /*ne01, ne02, ne03,*/ @@ -5094,6 +5122,38 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg (void) dst; } +template +static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 
/ ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + k_get_rows_float<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + (void) dst; +} + template struct bin_bcast_cuda { template @@ -6491,10 +6551,10 @@ static void ggml_cuda_op_get_rows( switch (src0->type) { case GGML_TYPE_F16: - get_rows_cuda<1, 1, convert_f16>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_F32: - get_rows_cuda<1, 1, convert_f32>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); break; case GGML_TYPE_Q4_0: get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); From 65923a8ede3ad5467264039fac8040976c1ca139 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 14:17:46 +0200 Subject: [PATCH 26/47] convert : determine n_ctx correctly --- convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index d36f25946e9cf..532c8b5abb86e 100755 --- a/convert.py +++ b/convert.py @@ -261,15 +261,15 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_experts_used = None # hack to determine LLaMA v1 vs v2 vs CodeLlama - if config.get("rope_theta") == 1000000: + if config.get("moe"): + # Mixtral + n_ctx = 32768 + elif config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: # LLaMA v2 n_ctx = 4096 - elif config["moe"]: - # Mixtral - n_ctx = 32768 else: # LLaMA v1 n_ctx = 2048 From b0b83dd9e2f94aee4d6c32329e08eaca6f68a912 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 14:30:38 +0200 Subject: [PATCH 27/47] metal : fix ggml_mul_mat_id for F32 --- ggml-metal.m | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 75929bc460e0d..cca9244e6fbe7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1612,7 +1612,6 @@ void ggml_metal_graph_compute( { GGML_ASSERT(src1t == GGML_TYPE_F32); [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32]; - nrows = 4; } break; case GGML_TYPE_F16: { From 54ba2634108787f71016e8329b93778957f801f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Dec 2023 15:27:41 +0200 Subject: [PATCH 28/47] test-backend-ops : make experts more evenly probable (test_moe) --- tests/test-backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2f7ea4edfe4c2..539f7f71b3bda 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1172,7 +1172,7 @@ struct test_moe : public test_case { ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_tokens, num_experts] - ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_tokens, num_experts] + ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd)); // [n_tokens, num_experts] // select experts ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] From 54d254bbed5c95338737c963984a7b1ff830e617 Mon Sep 17 
00:00:00 2001 From: slaren Date: Sun, 10 Dec 2023 21:52:11 +0100 Subject: [PATCH 29/47] test-backend-ops : cleanup, add moe test for batches --- tests/test-backend-ops.cpp | 67 ++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 539f7f71b3bda..138fd298271c7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -20,8 +20,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m size_t size = ggml_nelements(tensor); std::vector data(size); - std::random_device rd; - #if 0 std::default_random_engine generator(rd()); std::uniform_real_distribution distribution(min, max); @@ -31,6 +29,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } #endif auto init_thread = [&](size_t start, size_t end) { + std::random_device rd; std::default_random_engine generator(rd()); std::uniform_real_distribution distribution(min, max); @@ -341,13 +340,6 @@ struct test_case { } } - //if (t1->op == GGML_OP_SOFT_MAX) { - // printf("[%s] ", ggml_op_desc(t1)); - // for (int i = 0; i < f1.size(); i++) { - // printf("(%x, %x) ", *(uint32_t*)&f1[i], *(uint32_t*)&f2[i]); - // } - // printf("\n"); - //} double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { printf("[%s] NMSE = %f ", ggml_op_desc(t1), err); @@ -447,8 +439,9 @@ struct test_case { return size; }; for (int i = 0; i < gf->n_nodes; i++) { - if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) + if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { continue; + } mem += tensor_op_size(gf->nodes[i]); } @@ -1137,15 +1130,17 @@ struct test_sum_rows : public test_case { } }; +// Mixtral MOE struct test_moe : public test_case { - const int n_experts = 8; - const int n_experts_per_tok = 2; - const int n_tokens = 1; - const int n_embd = 4096; - const int n_ff = 14336; + const int n_experts; + const int n_experts_per_tok; + const int n_tokens; + const int n_embd; + const int n_ff; std::string op_desc(ggml_tensor * t) override { return "MOE"; + GGML_UNUSED(t); } @@ -1153,7 +1148,8 @@ struct test_moe : public test_case { return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff); } - test_moe() { + test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336) + : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) { } ggml_tensor * build_graph(ggml_context * ctx) override { @@ -1171,24 +1167,20 @@ struct test_moe : public test_case { ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); - ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_tokens, num_experts] - ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd)); // [n_tokens, num_experts] + ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); + ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd)); // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok] + ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); ggml_tensor * weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts); - printf("get rows args %ld %ld %ld %ld, %ld %ld %ld %ld\n", - weights->src[0]->ne[0], weights->src[0]->ne[1], weights->src[0]->ne[2], weights->src[0]->ne[3], 
- weights->src[1]->ne[0], weights->src[1]->ne[1], weights->src[1]->ne[2], weights->src[1]->ne[3]); - - weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok] + weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens); ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); - weights = ggml_div(ctx, weights, weights_sum); // [n_tokens, num_experts_per_tok] + weights = ggml_div(ctx, weights, weights_sum); // compute expert outputs ggml_tensor * moe_out = nullptr; @@ -1202,9 +1194,9 @@ struct test_moe : public test_case { cur_gate = ggml_silu(ctx, cur_gate); - cur_expert = ggml_mul(ctx, cur_up, cur_gate); // [n_tokens, n_embd] + cur_expert = ggml_mul(ctx, cur_up, cur_gate); - cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd] + cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert); cur_expert = ggml_mul(ctx, cur_expert, ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); @@ -1240,8 +1232,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K }; - test_cases.emplace_back(new test_moe()); - // unary ops for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { test_cases.emplace_back(new test_unary((ggml_unary_op) op)); @@ -1374,6 +1364,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_sum_rows()); + test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 14336)); + test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); + // run tests if (mode == MODE_TEST) { ggml_backend_t backend_cpu = ggml_backend_cpu_init(); @@ -1389,14 +1382,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op ggml_backend_free(backend_cpu); return n_ok == test_cases.size(); - } else if (mode == MODE_PERF) { + } + + if (mode == MODE_PERF) { for (auto & test : test_cases) { test->eval_perf(backend, op_name); } return true; - } else { - GGML_ASSERT(false); } + + GGML_ASSERT(false); + return false; } static void usage(char ** argv) { @@ -1469,11 +1465,12 @@ int main(int argc, char ** argv) { } printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count()); + if (n_ok != ggml_backend_reg_get_count()) { printf("\033[1;31mFAIL\033[0m\n"); return 1; - } else { - printf("\033[1;32mOK\033[0m\n"); - return 0; } + + printf("\033[1;32mOK\033[0m\n"); + return 0; } From f1380d7897128724e07d15caf382c4986e4e9d27 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 10 Dec 2023 22:58:31 +0100 Subject: [PATCH 30/47] test-backend-ops : add cpy from f32 -> all types test --- ggml-cuda.cu | 25 ++++++++++++++++++++++++- tests/test-backend-ops.cpp | 24 ++++++++++++++++-------- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5a5b4e8ba3434..382897d59fce5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9316,6 +9316,30 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten return false; } } break; + case GGML_OP_CPY: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + 
return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + return false; + } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -9331,7 +9355,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: - case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 138fd298271c7..0273a4309fd1d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -70,23 +70,27 @@ static std::vector tensor_to_float(const ggml_tensor * t) { std::vector buf(ggml_nbytes(t)); ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); + ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); + // access elements by index to avoid gaps in views for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { for (int64_t i2 = 0; i2 < t->ne[2]; i2++) { for (int64_t i1 = 0; i1 < t->ne[1]; i1++) { - for (int64_t i0 = 0; i0 < t->ne[0]; i0++) { + for (int64_t i0 = 0; i0 < t->ne[0]; i0 += ggml_blck_size(t->type)) { size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; - float v; if (t->type == GGML_TYPE_F16) { - v = (float) ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]); + tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i])); } else if (t->type == GGML_TYPE_F32) { - v = *(float *) &buf[i]; + tv.push_back(*(float *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { - v = *(int32_t *) &buf[i]; + tv.push_back((float)*(int32_t *) &buf[i]); + } else if (ggml_is_quantized(t->type)) { + std::vector vq(ggml_blck_size(t->type)); + tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); + tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false); } - tv.push_back(v); } } } @@ -320,7 +324,7 @@ struct test_case { for (size_t i = 0; i < f1.size(); i++) { // check for nans if (std::isnan(f1[i]) || std::isnan(f2[i])) { - printf("[%s] NaN at index %zu ", ggml_op_desc(t1), i); + printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]); ud->ok = false; return true; } @@ -1253,7 +1257,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2})); test_cases.emplace_back(new test_dup()); - test_cases.emplace_back(new test_cpy()); + + for (ggml_type type : all_types) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 100, 100, 1})); + } + test_cases.emplace_back(new test_cont()); auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr) { From b0029815e44d8f7402f97fe76473b9b541411d28 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 02:43:52 +0100 Subject: [PATCH 31/47] test-backend-ops : fix dequantize block offset --- tests/test-backend-ops.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0273a4309fd1d..345805ea2e0e4 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -71,13 +71,14 @@ static std::vector tensor_to_float(const ggml_tensor * t) { ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); + size_t bs = ggml_blck_size(t->type); // access elements by index to avoid gaps in views for (int64_t i3 = 0; i3 < t->ne[3]; 
i3++) { for (int64_t i2 = 0; i2 < t->ne[2]; i2++) { for (int64_t i1 = 0; i1 < t->ne[1]; i1++) { - for (int64_t i0 = 0; i0 < t->ne[0]; i0 += ggml_blck_size(t->type)) { - size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0]; + for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) { + size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0]; if (t->type == GGML_TYPE_F16) { tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i])); } else if (t->type == GGML_TYPE_F32) { From 8cbaed1d9a1400576f8424920ca82f1d8c9404cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Dec 2023 08:55:16 +0200 Subject: [PATCH 32/47] llama : fix hard-coded number of experts --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e2a01902e8920..b9216f957e8f4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3079,7 +3079,7 @@ static void llm_load_tensors( GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch - for (int x = 0; x < 8; ++x) { + for (uint32_t x = 0; x < hparams.n_expert; ++x) { layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); From ffda94c87f55e98aea7de5c61c7dba79c160a89f Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 12:15:31 +0100 Subject: [PATCH 33/47] test-backend-ops : simplify and disable slow tests to avoid CI timeout --- tests/test-backend-ops.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 345805ea2e0e4..e6e3e76501d1f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1260,7 +1260,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_dup()); for (ggml_type type : all_types) { - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 100, 100, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1})); } test_cases.emplace_back(new test_cont()); @@ -1298,8 +1298,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); + //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); + //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); test_cases.emplace_back(new test_scale()); @@ -1374,7 +1374,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_sum_rows()); test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 14336)); - test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); + //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); // run tests if (mode == MODE_TEST) { From 33e50f1b538a92555c88d8fa241ca24f65d4163e Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 12:27:48 +0100 Subject: [PATCH 34/47] test-backend-ops : disable MOE test with thread sanitizer --- tests/test-backend-ops.cpp | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e6e3e76501d1f..dcbc6d3803344 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1373,8 +1373,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_sum_rows()); +#if !defined(__SANITIZE_THREAD__) + // FIXME: these tests use too much memory with thread sanitizer test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 14336)); //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); +#endif // run tests if (mode == MODE_TEST) { From 296c945de5fa1d36aa3680b58a84096733869c04 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 16:53:25 +0100 Subject: [PATCH 35/47] cuda : fix mul_mat_id with multi gpu --- ggml-cuda.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 382897d59fce5..9e1acd3f19e5f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8361,11 +8361,16 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.ne[1] = 1; dst_row.ne[1] = 1; - if (src1->backend == GGML_BACKEND_GPU) { - src1_row.extra = &src1_row_extra; - } + src1_row.nb[2] = src1_row.nb[1]; + dst_row.nb[2] = dst_row.nb[1]; + + src1_row.nb[3] = src1_row.nb[1]; + dst_row.nb[3] = dst_row.nb[1]; + + src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); @@ -8381,6 +8386,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.data = (char *) src1->data + i01*src1->nb[1]; dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); } From 7dc75e3923f1553175a89848ece21d9041c311e4 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 20:00:28 +0100 Subject: [PATCH 36/47] convert : use 1e6 rope_freq_base for mixtral --- convert.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 532c8b5abb86e..e1a73a652d69a 100755 --- a/convert.py +++ b/convert.py @@ -259,6 +259,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_experts = None n_experts_used = None + f_rope_freq_base = None # hack to determine LLaMA v1 vs v2 vs CodeLlama if config.get("moe"): @@ -281,6 +282,8 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] n_experts = config["moe"]["num_experts"] n_experts_used = config["moe"]["num_experts_per_tok"] + f_rope_freq_base = 1e6 + return Params( n_vocab = model["tok_embeddings.weight"].shape[0], @@ -293,7 +296,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_experts = n_experts, n_experts_used = n_experts_used, f_norm_eps = config["norm_eps"], - f_rope_freq_base = config.get("rope_theta"), + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), ) @staticmethod From f1cbfabd642a18f6db0435ea67a3f5c890d801bc Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 11 Dec 2023 20:02:55 +0100 Subject: [PATCH 37/47] convert : fix style --- convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert.py b/convert.py 
index e1a73a652d69a..5949ddbe8f301 100755 --- a/convert.py +++ b/convert.py @@ -284,7 +284,6 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 - return Params( n_vocab = model["tok_embeddings.weight"].shape[0], n_embd = config["dim"], From 6a419f4d195bb1e3d0da9d8930712c7201b408dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Dec 2023 13:04:33 +0200 Subject: [PATCH 38/47] convert : support safetensors format --- convert.py | 14 ++++++++++++-- gguf-py/gguf/tensor_mapping.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index 5949ddbe8f301..482858e453c95 100755 --- a/convert.py +++ b/convert.py @@ -42,6 +42,7 @@ ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 + # # data types # @@ -235,6 +236,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + n_experts = None + n_experts_used = None + + if "num_local_experts" in config: + n_experts = config["num_local_experts"] + n_experts_used = config["num_experts_per_tok"] + return Params( n_vocab = config["vocab_size"], n_embd = config["hidden_size"], @@ -243,6 +251,8 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: n_ff = config["intermediate_size"], n_head = (n_head := config["num_attention_heads"]), n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, f_norm_eps = config["rms_norm_eps"], f_rope_freq_base = config.get("rope_theta"), rope_scaling_type = rope_scaling_type, @@ -257,7 +267,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_experts = None + n_experts = None n_experts_used = None f_rope_freq_base = None @@ -280,7 +290,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: if config.get("moe"): n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] + n_experts = config["moe"]["num_experts"] n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 18f75cf69eeda..0115ea1c605b1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -150,7 +150,8 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral ), # Feed-forward up @@ -169,7 +170,8 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral + "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral ), # Feed-forward gate @@ -180,7 +182,8 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral + "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral ), # Feed-forward down @@ -198,7 +201,8 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_DOWN_EXP: ( - 
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral + "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral + "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral ), MODEL_TENSOR.ATTN_Q_NORM: ( From a742d9f9b7559db8043c3856e8c186cb3c03c0c8 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 12 Dec 2023 12:46:33 +0100 Subject: [PATCH 39/47] gguf-py : bump version --- gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index e6374bfe898a4..9789c2c877165 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.6.0" +version = "0.7.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 08eb99179a301850ed7aaaf1143e0e20ca50c234 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Dec 2023 14:14:15 +0200 Subject: [PATCH 40/47] metal : add cpy f16 -> f32 kernel --- convert.py | 10 +++++----- ggml-metal.m | 36 ++++++++++++++++++++++++++++++++---- ggml-metal.metal | 45 +++++++++++++++++++++++++++++++++++++++++++-- llama.cpp | 8 ++++---- 4 files changed, 84 insertions(+), 15 deletions(-) diff --git a/convert.py b/convert.py index 482858e453c95..19387fda4dee4 100755 --- a/convert.py +++ b/convert.py @@ -63,10 +63,10 @@ class UnquantizedDataType(DataType): pass -DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) -DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) -DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) -DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) +DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) +DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) +DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) +DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) @dataclass(frozen=True) @@ -996,7 +996,7 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 diff --git a/ggml-metal.m b/ggml-metal.m index cca9244e6fbe7..8276d9fb6d716 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -155,6 +155,7 @@ //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); GGML_METAL_DECL_KERNEL(cpy_f16_f16); + GGML_METAL_DECL_KERNEL(cpy_f16_f32); GGML_METAL_DECL_KERNEL(concat); GGML_METAL_DECL_KERNEL(sqr); GGML_METAL_DECL_KERNEL(sum_rows); @@ -424,6 +425,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); GGML_METAL_ADD_KERNEL(cpy_f16_f16); + GGML_METAL_ADD_KERNEL(cpy_f16_f32); GGML_METAL_ADD_KERNEL(concat); GGML_METAL_ADD_KERNEL(sqr); GGML_METAL_ADD_KERNEL(sum_rows); @@ -539,6 +541,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { 
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); GGML_METAL_DEL_KERNEL(cpy_f16_f16); + GGML_METAL_DEL_KERNEL(cpy_f16_f32); GGML_METAL_DEL_KERNEL(concat); GGML_METAL_DEL_KERNEL(sqr); GGML_METAL_DEL_KERNEL(sum_rows); @@ -867,12 +870,37 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { case GGML_OP_ROPE: case GGML_OP_IM2COL: case GGML_OP_ARGSORT: - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: return true; + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_CONT: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + switch (op->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return true; + default: + return false; + } + case GGML_TYPE_F16: + switch (op->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return true; + default: + return false; + } + default: + return false; + }; + } case GGML_OP_DIAG_MASK_INF: { return op->ne[0] % 4 == 0; @@ -2021,7 +2049,7 @@ void ggml_metal_graph_compute( { switch (dstt) { case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; - case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break; default: GGML_ASSERT(false && "not implemented"); }; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 067c5779d757b..c246e86458745 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1698,8 +1698,8 @@ template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_ar template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; kernel void kernel_cpy_f16_f16( - device const half * src0, - device half * dst, + device const half * src0, + device half * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -1738,6 +1738,47 @@ kernel void kernel_cpy_f16_f16( } } +kernel void kernel_cpy_f16_f32( + device const half * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + kernel void kernel_cpy_f32_f16( device const float * src0, device half * dst, diff --git a/llama.cpp b/llama.cpp index b9216f957e8f4..cc45cf52a4cf2 100644 --- 
a/llama.cpp +++ b/llama.cpp @@ -4277,23 +4277,23 @@ struct llm_build_context { ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] cb(logits, "ffn_moe_logits", il); - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] + ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] cb(probs, "ffn_moe_probs", il); // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] + ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] cb(selected_experts->src[0], "ffn_moe_argsort", il); ggml_tensor * weights = ggml_get_rows(ctx0, ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); cb(weights, "ffn_moe_weights", il); - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); cb(weights_sum, "ffn_moe_weights_sum", il); - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] + weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] cb(weights, "ffn_moe_weights_norm", il); // compute expert outputs From a51bc0c1c05ef4189b9424bdf29e73cf25b26724 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Dec 2023 15:55:42 +0200 Subject: [PATCH 41/47] metal : fix binary ops for ne10 % 4 != 0 --- ggml-metal.m | 2 +- tests/test-backend-ops.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8276d9fb6d716..090c84f591c40 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1080,7 +1080,7 @@ void ggml_metal_graph_compute( int64_t nb = ne00; - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0) { + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { GGML_ASSERT(ggml_is_contiguous(src0)); // src1 is a row diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index dcbc6d3803344..8f4b470bc621c 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1272,6 +1272,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op }; add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1}); From ea4402bb0e2e70737545b6869f9eff995b988a36 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Dec 2023 17:03:38 +0200 Subject: [PATCH 42/47] test-backend-ops : add one more sum_rows test --- tests/test-backend-ops.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8f4b470bc621c..44830b4d4da30 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1372,7 +1372,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); } - test_cases.emplace_back(new test_sum_rows()); + test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, {10, 10, 10, 10})); + test_cases.emplace_back(new 
test_sum_rows(GGML_TYPE_F32, {2, 1, 1, 1})); #if !defined(__SANITIZE_THREAD__) // FIXME: these tests use too much memory with thread sanitizer From 90c12e6b3cebfa7fec9ab2bb239cf509d0b828a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 12 Dec 2023 20:05:58 +0200 Subject: [PATCH 43/47] ggml : do not use BLAS with ggml_mul_mat_id --- ggml.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 6f5493096bbc0..4dbacbb495cc1 100644 --- a/ggml.c +++ b/ggml.c @@ -9508,8 +9508,11 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) + // all the experts for each batch element and the processing would become incredibly slow // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && + if (dst->op != GGML_OP_MUL_MAT_ID && + ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && //src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && From 82e4f64578dc3db40185e6b91195e73c9e995952 Mon Sep 17 00:00:00 2001 From: Radek Pilar Date: Tue, 12 Dec 2023 20:04:10 +0100 Subject: [PATCH 44/47] convert-hf : support for mixtral-instruct (#4428) * convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct * convert : use sentencepiece tokenizer for Mixtral-instruct * convert : make flake8 happy --- convert-hf-to-gguf.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index bced1f5617a0f..e46a7813a78e9 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -77,8 +77,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(n_embd) if (n_ff := self.hparams.get("intermediate_size")) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_head")) is not None: + if (n_head := self.hparams.get("num_attention_heads")) is not None: self.gguf_writer.add_head_count(n_head) + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + + if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) def write_tensors(self): @@ -170,6 +180,8 @@ def from_model_architecture(model_architecture): return StableLMModel if model_architecture == "QWenLMHeadModel": return QwenModel + if model_architecture == "MixtralForCausalLM": + return MixtralModel return Model def _is_model_safetensors(self) -> bool: @@ -207,6 +219,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.STABLELM if arch == "QWenLMHeadModel": return gguf.MODEL_ARCH.QWEN + if arch == "MixtralForCausalLM": + return gguf.MODEL_ARCH.LLAMA raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -837,6 +851,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(1e-5) +class MixtralModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + class QwenModel(Model): @staticmethod def token_bytes_to_string(b): From 
ab558ac2b3b892cfff5b9f1467bdc74a9d1d8d71 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 13 Dec 2023 10:54:17 +0200 Subject: [PATCH 45/47] metal : fix soft_max kernels ref: https://github.com/ggerganov/ggml/pull/621/commits/1914017863d2f9ab8ecc0281cc2a56d683668b92 --- ggml-metal.m | 4 +++- ggml-metal.metal | 16 ++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 090c84f591c40..1aa3424d21f4c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1269,6 +1269,8 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; } [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; @@ -1520,7 +1522,7 @@ void ggml_metal_graph_compute( else if (src0t == GGML_TYPE_Q6_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - int64_t ny = (ne11 + nrows - 1)/nrows; + const int64_t ny = (ne11 + nrows - 1)/nrows; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } diff --git a/ggml-metal.metal b/ggml-metal.metal index c246e86458745..8b76f969ce428 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -347,9 +347,9 @@ kernel void kernel_soft_max( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - device const float * pmask = src1 ? src1 + i01*ne00 : nullptr; - device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; + device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max float lmax = -INFINITY; @@ -386,6 +386,8 @@ kernel void kernel_soft_max( } float sum = simd_sum(lsum); + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ntg > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = 0.0f; @@ -428,9 +430,9 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr; - device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * pmask = src1 != src0 ? 
(device const float4 *)(src1 + i01*ne00) : nullptr; + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); // parallel max float4 lmax4 = -INFINITY; @@ -468,6 +470,8 @@ kernel void kernel_soft_max_4( } const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; + threadgroup_barrier(mem_flags::mem_threadgroup); + float sum = simd_sum(lsum); if (ntg > N_SIMDWIDTH) { if (sgitg == 0) { From 109e7aa8ac7e4afb8f38ac34ba8872bebbb3a026 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 13 Dec 2023 10:55:17 +0200 Subject: [PATCH 46/47] metal : limit kernels to not use more than the allowed threads --- ggml-metal.m | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 1aa3424d21f4c..1dcfa6eddbfa5 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1080,6 +1080,8 @@ void ggml_metal_graph_compute( int64_t nb = ne00; + id pipeline = nil; + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { GGML_ASSERT(ggml_is_contiguous(src0)); @@ -1088,21 +1090,23 @@ void ggml_metal_graph_compute( nb = ne00 / 4; switch (dst->op) { - case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break; - case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break; - case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break; + case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break; + case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break; + case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break; default: GGML_ASSERT(false); } bcast_row = true; } else { switch (dst->op) { - case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break; - case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break; - case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break; + case GGML_OP_ADD: pipeline = ctx->pipeline_add; break; + case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break; + case GGML_OP_DIV: pipeline = ctx->pipeline_div; break; default: GGML_ASSERT(false); } } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1137,7 +1141,7 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } else { - const int nth = MIN(1024, ne0); + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } From e1241d9b461816b679eaf6951631287687a18f66 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 13 Dec 2023 13:56:45 +0200 Subject: [PATCH 47/47] metal : switch to execution barriers + fix one of the barriers --- ggml-metal.metal | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml-metal.metal b/ggml-metal.metal index 8b76f969ce428..773fac124b0c4 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -385,8 +385,11 @@ kernel void kernel_soft_max( pdst[i00] = exp_psrc0; } + // This barrier fixes a failing test + // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 + threadgroup_barrier(mem_flags::mem_none); + float sum = simd_sum(lsum); - threadgroup_barrier(mem_flags::mem_threadgroup); if (ntg > N_SIMDWIDTH) { if (sgitg == 0) { @@ -470,9 +473,13 @@ kernel 
void kernel_soft_max_4( } const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - threadgroup_barrier(mem_flags::mem_threadgroup); + + // This barrier fixes a failing test + // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 + threadgroup_barrier(mem_flags::mem_none); float sum = simd_sum(lsum); + if (ntg > N_SIMDWIDTH) { if (sgitg == 0) { buf[tiisg] = 0.0f;
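
As a plain-language summary of the expert routing that these patches implement and that test_moe exercises: the router tensor ffn_gate_inp produces per-expert logits for each token, a softmax turns them into probabilities, the top n_expert_used experts are selected per token, their weights are renormalised to sum to 1, each selected expert runs a SwiGLU feed-forward (silu(gate) * up, then down), and the expert outputs are summed with those weights. The NumPy sketch below only illustrates that flow; the function and array names (moe_forward, w_gate_inp, ...) and the shape conventions are assumptions made for illustration, not the ggml API, and test_moe additionally scales the router logits by 1/sqrt(n_embd) via ggml_soft_max_ext before the softmax.

# moe_sketch.py -- illustrative only; names and shapes are assumptions, not part of any patch.
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def moe_forward(x, w_gate_inp, w_gate, w_up, w_down, n_used):
    # x:          (n_tokens, n_embd)
    # w_gate_inp: (n_experts, n_embd)       -- router, analogous to ffn_gate_inp
    # w_gate/w_up:(n_experts, n_ff, n_embd)  -- per-expert SwiGLU weights
    # w_down:     (n_experts, n_embd, n_ff)
    logits = x @ w_gate_inp.T                          # (n_tokens, n_experts)
    probs = softmax(logits)
    sel = np.argsort(-probs, axis=-1)[:, :n_used]      # top-k expert ids per token
    w = np.take_along_axis(probs, sel, axis=-1)
    w = w / w.sum(axis=-1, keepdims=True)              # renormalise, like ggml_div(weights, weights_sum)
    out = np.zeros_like(x)
    for t in range(x.shape[0]):
        for i in range(n_used):
            e = sel[t, i]
            h = silu(x[t] @ w_gate[e].T) * (x[t] @ w_up[e].T)   # SwiGLU
            out[t] += w[t, i] * (h @ w_down[e].T)               # weighted expert output
    return out

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_tokens, n_embd, n_ff, n_experts, n_used = 3, 8, 16, 4, 2
    x = rng.standard_normal((n_tokens, n_embd))
    y = moe_forward(
        x,
        rng.standard_normal((n_experts, n_embd)),
        rng.standard_normal((n_experts, n_ff, n_embd)),
        rng.standard_normal((n_experts, n_ff, n_embd)),
        rng.standard_normal((n_experts, n_embd, n_ff)),
        n_used)
    print(y.shape)  # (3, 8): one mixed expert output per token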