From a76e73b8445c443aa8ec249ffbe401715318147b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Dec 2023 13:30:20 +0200 Subject: [PATCH] llama : support quantum K cache (#4312) * llama : support quantum K cache (wip) * metal : add F32 -> Q8_0 copy kernel * cuda : add F32 -> Q8_0 copy kernel ggml-ci * cuda : use mmv kernel for quantum cache ops * llama : pass KV cache type through API * llama : fix build ggml-ci * metal : add F32 -> Q4_0 copy kernel * metal : add F32 -> Q4_1 copy kernel * cuda : wip * cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels * llama-bench : support type_k/type_v * metal : use mm kernel only for quantum KV cache * cuda : add comment * llama : remove memory_f16 and kv_f16 flags --------- Co-authored-by: slaren --- common/common.cpp | 44 ++++- common/common.h | 6 +- examples/llama-bench/llama-bench.cpp | 111 +++++++++--- examples/quantize-stats/quantize-stats.cpp | 1 - examples/server/server.cpp | 4 - ggml-cuda.cu | 186 ++++++++++++++++++-- ggml-metal.m | 32 +++- ggml-metal.metal | 192 +++++++++++++++++++++ llama.cpp | 62 +++++-- llama.h | 11 +- 10 files changed, 575 insertions(+), 74 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d379ceca0c7edd..3ced8f7da1be72 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -279,8 +279,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -499,6 +497,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.infill = true; } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { params.dump_kv_cache = true; + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + } else if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + } else if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; } else if (arg == "--multiline-input") { params.multiline_input = true; } else if (arg == "--simple-io") { @@ -799,8 +803,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -841,6 +843,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --verbose-prompt print prompt before generation\n"); printf(" -dkvc, --dump-kv-cache\n"); printf(" verbose print of the KV cache\n"); + printf(" -nkvo, --no-kv-offload\n"); + printf(" disable KV offload\n"); + printf(" -ctk TYPE, --cache-type-k TYPE\n"); + printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); + printf(" -ctv TYPE, --cache-type-v TYPE\n"); + printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); @@ -905,6 +913,29 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & return mparams; } +static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + throw std::runtime_error("Invalid cache type: " + s); +} + struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto cparams = llama_context_default_params(); @@ -914,7 +945,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -926,6 +956,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.type_k = kv_cache_type_from_str(params.cache_type_k); + cparams.type_v = kv_cache_type_from_str(params.cache_type_v); + return cparams; } @@ -1337,7 +1370,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); diff --git a/common/common.h b/common/common.h index 74244b3d52dae1..d92c89b62170ab 100644 --- a/common/common.h +++ b/common/common.h @@ -106,7 +106,6 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode @@ -132,9 +131,12 @@ struct gpt_params { bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes + std::string cache_type_k = "f16"; // KV cache data type for the K + std::string cache_type_v = "f16"; // KV cache data type for the V + // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string image = ""; // path to an image file }; bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a3c783c57fef10..d0f96946c61e28 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -54,6 +54,13 @@ static std::vector split(const std::string & str, char delim) { return values; } +template +static std::vector transform_to_str(const std::vector & values, F f) { + std::vector str_values; + std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); + return str_values; +} + template static T avg(const std::vector & v) { if (v.empty()) { @@ -127,7 +134,8 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector n_batch; - std::vector f32_kv; + std::vector type_k; + std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; std::vector main_gpu; @@ -143,7 +151,8 @@ static const cmd_params cmd_params_defaults = { /* n_prompt */ {512}, /* n_gen */ {128}, /* n_batch */ {512}, - /* f32_kv */ {false}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, /* main_gpu */ {0}, @@ -163,7 +172,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); @@ -174,9 +184,32 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); +} +static ggml_type ggml_type_from_name(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + + return GGML_TYPE_COUNT; } + static cmd_params parse_cmd_params(int argc, char ** argv) { cmd_params params; std::string arg; @@ -225,13 +258,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "--memory-f32") { + } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); - params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -322,7 +380,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } + if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } + if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } @@ -337,7 +396,8 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_threads; int n_gpu_layers; int main_gpu; @@ -366,7 +426,8 @@ struct cmd_params_instance { cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; - cparams.f16_kv = !f32_kv; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.mul_mat_q = mul_mat_q; return cparams; @@ -381,7 +442,8 @@ static std::vector get_cmd_params_instances_int(const cmd_p for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) for (const auto & nt : params.n_threads) { cmd_params_instance instance = { @@ -389,7 +451,8 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_prompt = */ n_prompt, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -411,7 +474,8 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { @@ -423,7 +487,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -442,7 +507,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ 0, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .type_k = */ tk, + /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -490,7 +556,8 @@ struct test { uint64_t model_n_params; int n_batch; int n_threads; - bool f32_kv; + ggml_type type_k; + ggml_type type_v; int n_gpu_layers; int main_gpu; bool mul_mat_q; @@ -509,7 +576,8 @@ struct test { model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; n_threads = inst.n_threads; - f32_kv = inst.f32_kv; + type_k = inst.type_k; + type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; mul_mat_q = inst.mul_mat_q; @@ -572,7 +640,7 @@ struct test { "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "f16_kv", + "n_batch", "n_threads", "type_k", "type_v", "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -622,7 +690,7 @@ struct test { std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), + std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -806,8 +874,11 @@ struct markdown_printer : public printer { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.push_back("n_batch"); } - if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) { - fields.push_back("f16_kv"); + if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { + fields.push_back("type_k"); + } + if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { + fields.push_back("type_v"); } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.push_back("main_gpu"); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index dd76b1ceef134d..7fb61d26a7115d 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -322,7 +322,6 @@ int main(int argc, char ** argv) { auto cparams = llama_context_default_params(); cparams.n_ctx = 256; cparams.seed = 1; - cparams.f16_kv = false; ctx = llama_new_context_with_model(model, cparams); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3ab73545843d1c..44ecba5fd99cd4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2109,10 +2109,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.yarn_beta_slow = std::stof(argv[i]); } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5bab74c3a9b2af..b8e0663acaa4ea 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7,6 +7,7 @@ #include #include #include +#include #if defined(GGML_USE_HIPBLAS) #include @@ -4560,6 +4561,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = fmaxf(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j]*id; + + dsti->qs[j] = roundf(x0); + } +} + +static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_0/2 + j]*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->dm.x = d; + dsti->dm.y = vmin; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (xi[0 + j] - vmin)*id; + const float x1 = (xi[QK4_1/2 + j] - vmin)*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +template +static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; + + if (i >= ne) { + return; + } + + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = (i - i02*ne01*ne00 - i01*ne00); + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / max(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); @@ -5738,6 +5849,39 @@ static void ggml_cpy_f32_f16_cuda( (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); } +static void ggml_cpy_f32_q8_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_cpy_f32_q4_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_cpy_f32_q4_1_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + static void ggml_cpy_f16_f16_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, @@ -6082,20 +6226,21 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( const enum ggml_type type = src->type; const int64_t ts = ggml_type_size(type); const int64_t bs = ggml_blck_size(type); - int64_t i1_diff = i1_high - i1_low; + const int64_t i1_diff = i1_high - i1_low; const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; - if (nb0 == ts && nb1 == ts*ne0/bs) { + if (nb0 == ts && nb1 == ts*(ne0/bs)) { return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); } if (nb0 == ts) { - return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); + return cudaMemcpy2DAsync(dst_ptr, ts*(ne0/bs), x, nb1, ts*(ne0/bs), i1_diff, kind, stream); } + GGML_ASSERT(bs == 1 && "TODO: implement bs != 1"); for (int64_t i1 = 0; i1 < i1_diff; i1++) { const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + void * rd = (void *) (dst_ptr + i1*ts*ne0); // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + cudaError_t r = cudaMemcpy2DAsync(rd, ts, rx, nb0, ts, ne0, kind, stream); if (r != cudaSuccess) { return r; } } return cudaSuccess; @@ -6463,6 +6608,8 @@ inline void ggml_cuda_op_mul_mat_vec_q( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { + GGML_ASSERT(ggml_nrows(src1) == 1); + const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; @@ -6522,7 +6669,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( size_t ash; dfloat * src1_dfloat = nullptr; // dfloat == half - bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; @@ -7092,10 +7240,9 @@ static void ggml_cuda_op_mul_mat( const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); - const bool src1_is_contiguous = ggml_is_contiguous(src1); - const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? - ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + + const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); @@ -7220,7 +7367,7 @@ static void ggml_cuda_op_mul_mat( const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs; + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); @@ -7687,10 +7834,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; #endif // GGML_CUDA_FORCE_DMMV if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_nrows(src1) > 1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); @@ -7762,14 +7910,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -7780,6 +7931,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg } static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? ggml_cuda_cpy(src0, dst, nullptr); (void) src1; } diff --git a/ggml-metal.m b/ggml-metal.m index d82598d3b80bcc..65e1a240fce59b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -118,6 +118,11 @@ GGML_METAL_DECL_KERNEL(im2col_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); + GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); + GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); + GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); + //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); + //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); GGML_METAL_DECL_KERNEL(cpy_f16_f16); GGML_METAL_DECL_KERNEL(concat); GGML_METAL_DECL_KERNEL(sqr); @@ -324,6 +329,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(im2col_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); + GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); + GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); + //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); + //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); GGML_METAL_ADD_KERNEL(cpy_f16_f16); GGML_METAL_ADD_KERNEL(concat); GGML_METAL_ADD_KERNEL(sqr); @@ -425,6 +435,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(im2col_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); + GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); + GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); + //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); + //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); GGML_METAL_DEL_KERNEL(cpy_f16_f16); GGML_METAL_DEL_KERNEL(concat); GGML_METAL_DEL_KERNEL(sqr); @@ -1114,7 +1129,7 @@ void ggml_metal_graph_compute( !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && ne00 % 32 == 0 && ne00 >= 64 && - ne11 > ne11_mm_min) { + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; @@ -1549,14 +1564,23 @@ void ggml_metal_graph_compute( case GGML_OP_CPY: case GGML_OP_CONT: { - const int nth = MIN(1024, ne00); + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + int nth = MIN(1024, ne00/ggml_blck_size(src0->type)); switch (src0t) { case GGML_TYPE_F32: { + GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0); + switch (dstt) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; + //case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; + //case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; default: GGML_ASSERT(false && "not implemented"); }; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 9a79f815f3a720..9f5ffcbafe8fc0 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3,6 +3,7 @@ using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) #define QK4_0 32 #define QR4_0 2 @@ -1460,6 +1461,197 @@ kernel void kernel_cpy_f32_f32( } } +kernel void kernel_cpy_f32_q8_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0; + + device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK8_0].d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst_data[i00/QK8_0].qs[j] = round(x0); + } + } +} + +kernel void kernel_cpy_f32_q4_0( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0; + + device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_0].d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst_data[i00/QK4_0].qs[j] = xi0; + dst_data[i00/QK4_0].qs[j] |= xi1 << 4; + } + } +} + +kernel void kernel_cpy_f32_q4_1( + device const float * src0, + device void * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1; + + device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst_data[i00/QK4_1].d = d; + dst_data[i00/QK4_1].m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst_data[i00/QK4_1].qs[j] = xi0; + dst_data[i00/QK4_1].qs[j] |= xi1 << 4; + } + } +} + kernel void kernel_concat( device const char * src0, device const char * src1, diff --git a/llama.cpp b/llama.cpp index a97a110657fc55..89cf0a75aba245 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1250,6 +1250,7 @@ struct llama_cparams { float yarn_beta_slow; bool mul_mat_q; + bool offload_kqv; }; struct llama_layer { @@ -1528,7 +1529,8 @@ struct llama_context { static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, - ggml_type wtype, + ggml_type ktype, + ggml_type vtype, uint32_t n_ctx, int n_gpu_layers) { const uint32_t n_embd = hparams.n_embd_gqa(); @@ -1546,7 +1548,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); + cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -1566,8 +1568,17 @@ static bool llama_kv_cache_init( ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.v, "cache_v"); - (void) n_gpu_layers; + const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); + GGML_UNUSED(offload); + + for (int i = 0; i < (int) n_layer; i++) { + ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); #ifdef GGML_USE_CUBLAS if (ggml_cublas_loaded()) { size_t vram_kv_cache = 0; @@ -3646,9 +3657,9 @@ static void llm_build_k_shift( ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv.k, n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv.k)*n_embd_head, - ggml_element_size(kv.k)*n_embd_gqa, - ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), + ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); @@ -3675,8 +3686,8 @@ static void llm_build_kv_store( //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed cb(v_cur_t, "v_cur_t", il); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, + (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, @@ -3835,9 +3846,9 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k, n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv.k)*n_embd_gqa, - ggml_element_size(kv.k)*n_embd_head, - ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); + ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + 0); cb(k, "k", il); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); @@ -5775,6 +5786,7 @@ static int llama_decode_internal( // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); @@ -8895,8 +8907,9 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, + /*.type_k =*/ GGML_TYPE_F16, + /*.type_v =*/ GGML_TYPE_F16, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, }; @@ -9048,19 +9061,36 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + const ggml_type type_k = params.type_k; + const ggml_type type_v = params.type_v; + + GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0); // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); + size_t memory_size_k = 0; + size_t memory_size_v = 0; + + for (auto & k : ctx->kv_self.k_l) { + memory_size_k += ggml_nbytes(k); + } + + for (auto & v : ctx->kv_self.v_l) { + memory_size_v += ggml_nbytes(v); + } + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } // resized during inference diff --git a/llama.h b/llama.h index 77efc2d73c5b1e..4548a312be1c9e 100644 --- a/llama.h +++ b/llama.h @@ -191,11 +191,14 @@ extern "C" { float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size + enum ggml_type type_k; // data type for K cache + enum ggml_type type_v; // data type for V cache + // Keep the booleans together to avoid misalignment during copy-by-value. - bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache, fp32 otherwise - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool embedding; // embedding mode only + bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) + bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool embedding; // embedding mode only }; // model quantization parameters