diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 990690e0c5b7..d49da39a0afc 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -4,12 +4,14 @@ on: branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' mr: target-branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 7594c0843acb..de4d6b428e99 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -837,11 +837,13 @@ y = embedding(x) | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_term | float | [num_output] | +| weight_data_int8_scales| float | [1] | # Exp ``` diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp index ddda6b8bf199..2b9f8a60042c 100644 --- a/src/layer/embed.cpp +++ b/src/layer/embed.cpp @@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd) input_dim = pd.get(1, 0); bias_term = pd.get(2, 0); weight_data_size = pd.get(3, 0); + int8_scale_term = pd.get(18, 0); return 0; } @@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb) return -100; } +#if NCNN_INT8 + if (int8_scale_term) + { + weight_data_int8_scale = mb.load(1, 1)[0]; + } +#endif // NCNN_INT8 + return 0; } -int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) { - int words = static_cast<int>(bottom_blob.total()); + const int num_output = top_blob.w; + const int words = top_blob.h; - top_blob.create(num_output, words, 4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* bias_ptr = bias_data; - // num_output 
#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < words; q++) { @@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) con const float* em = (const float*)weight_data + num_output * word_index; - memcpy(outptr, em, num_output * sizeof(float)); + if (bias_ptr) + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] + bias_ptr[p]; + } + } + else + { + memcpy(outptr, em, num_output * sizeof(float)); + } + } +} + +#if NCNN_INT8 +static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) +{ + const int num_output = top_blob.w; + const int words = top_blob.h; + + const float* bias_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < words; q++) + { + float* outptr = top_blob.row(q); + + int word_index = ((const int*)bottom_blob)[q]; - if (bias_term) + if (word_index < 0) + word_index = 0; + if (word_index >= input_dim) + word_index = input_dim - 1; + + const float descale_em = 1.f / weight_data_int8_scale; + + const signed char* em = (const signed char*)weight_data + num_output * word_index; + + if (bias_ptr) { for (int p = 0; p < num_output; p++) { - outptr[p] += bias_data[p]; + outptr[p] = em[p] * descale_em + bias_ptr[p]; } } + else + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] * descale_em; + } + } + } +} +#endif // NCNN_INT8 + +int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int words = static_cast<int>(bottom_blob.total()); + + top_blob.create(num_output, words, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if NCNN_INT8 + if (int8_scale_term) + { + embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt); + } + else +#endif // NCNN_INT8 + { + embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt); } 
return 0; diff --git a/src/layer/embed.h b/src/layer/embed.h index 8e2366567163..b94c2b17bee4 100644 --- a/src/layer/embed.h +++ b/src/layer/embed.h @@ -38,9 +38,15 @@ class Embed : public Layer int weight_data_size; + int int8_scale_term; + // model Mat weight_data; Mat bias_data; + +#if NCNN_INT8 + float weight_data_int8_scale; +#endif }; } // namespace ncnn diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6c8939fc7c7e..e2ddc32a00dc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) +ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) diff --git a/tests/test_embed.cpp b/tests/test_embed.cpp new file mode 100644 index 000000000000..9c007ee5d7e7 --- /dev/null +++ b/tests/test_embed.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static int test_embed(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + + std::vector<ncnn::Mat> weights(bias ? 
2 : 1); + weights[0] = RandomMat(num_output * input_dim); + if (bias) + weights[1] = RandomMat(num_output); + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_0() +{ + return 0 + || test_embed(128, 128, 128, 0) + || test_embed(128, 128, 128, 1) + || test_embed(127, 127, 127, 0) + || test_embed(127, 127, 127, 1) + || test_embed(124, 124, 124, 0) + || test_embed(124, 124, 124, 1); +} + +#if NCNN_INT8 +static int test_embed_int8(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + pd.set(18, 2); + + std::vector<ncnn::Mat> weights(bias ? 3 : 2); + weights[0] = RandomS8Mat(num_output * input_dim); + if (bias) + { + weights[1] = RandomMat(num_output); + weights[2] = RandomMat(1, 100.f, 200.f); + } + else + { + weights[1] = RandomMat(1, 100.f, 200.f); + } + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_1() +{ + return 0 + || test_embed_int8(128, 128, 128, 0) + || test_embed_int8(128, 128, 128, 1) + || test_embed_int8(127, 127, 127, 0) + || test_embed_int8(127, 127, 127, 1) + || test_embed_int8(124, 124, 124, 0) + || test_embed_int8(124, 124, 124, 1); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_embed_0() || test_embed_1(); +#else + return test_embed_0(); +#endif +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 4f445cfe2a4d..39157c453ece 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ 
-1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 1=%d", input_dim) fprintf_param_value(" 2=%d", bias_term) fprintf_param_value(" 3=%d", weight_data_size) + fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->weight_data, bp); fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + ncnn::Mat weight_data_int8_scales(1); + weight_data_int8_scales[0] = op->weight_data_int8_scale; + fwrite_weight_data(weight_data_int8_scales, bp, 90, 100); + } +#endif // NCNN_INT8 } else if (layer->type == "Exp") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 4d19ceb6f166..5e92b333aa57 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter int quantize_lstm(); int quantize_gru(); + int quantize_embed(); + int fuse_requantize(); }; @@ -562,6 +564,55 @@ int NetQuantize::quantize_gru() return 0; } +int NetQuantize::quantize_embed() +{ + for (size_t i = 0; i < layers.size(); i++) + { + if (layers[i]->type != "Embed") + continue; + + // Embed - quantize weight from fp32 to int8 + ncnn::Embed* embed = (ncnn::Embed*)layers[i]; + + fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); + + // TODO move to ncnn2table + + const int num_output = embed->num_output; + const int input_dim = embed->input_dim; + + ncnn::Mat weight_data_int8_scales(1); + { + const float* ptr = embed->weight_data; + float absmax = 0.f; + for (int i = 0; i < embed->weight_data.w; i++) + { + absmax = std::max(absmax, (float)fabs(ptr[i])); + } + + weight_data_int8_scales[0] = absmax == 0.f ? 
1.f : 127 / absmax; + } + + { + ncnn::Mat weight_data_int8; + + ncnn::Option opt_q = opt; + opt_q.blob_allocator = embed->weight_data.allocator; + opt_q.use_packing_layout = false; + ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q); + if (weight_data_int8.empty()) + return -100; + + embed->weight_data = weight_data_int8; + } + + embed->int8_scale_term = 2; + embed->weight_data_int8_scale = weight_data_int8_scales[0]; + } + + return 0; +} + int NetQuantize::fuse_requantize() { const size_t layer_count = layers.size(); @@ -809,6 +860,7 @@ int main(int argc, char** argv) quantizer.quantize_rnn(); quantizer.quantize_lstm(); quantizer.quantize_gru(); + quantizer.quantize_embed(); quantizer.fuse_requantize();