
quantize gemm
nihui committed Sep 20, 2024
1 parent 86b03df commit 7f2d1da
Showing 1 changed file with 109 additions and 0 deletions.
tools/quantize/ncnn2int8.cpp: 109 additions, 0 deletions
@@ -134,6 +134,7 @@ class NetQuantize : public ModelWriter
    int quantize_gru();

    int quantize_embed();
    int quantize_gemm();

    int fuse_requantize();
};
@@ -613,6 +614,113 @@ int NetQuantize::quantize_embed()
    return 0;
}

int NetQuantize::quantize_gemm()
{
    for (size_t i = 0; i < layers.size(); i++)
    {
        if (layers[i]->type != "Gemm")
            continue;

        // Gemm - quantize weight from fp32 to int8
        ncnn::Gemm* gemm = (ncnn::Gemm*)layers[i];

        fprintf(stderr, "quantize_gemm %s\n", gemm->name.c_str());

        // TODO move to ncnn2table

        if (gemm->constantA)
        {
            if (gemm->transA == 1)
            {
                // transpose for easier quantization
                ncnn::Mat A_data_transposed(gemm->constantK * gemm->constantM);
                for (int i = 0; i < gemm->constantM; i++)
                {
                    float* ptr = (float*)A_data_transposed + i * gemm->constantK;
                    for (int j = 0; j < gemm->constantK; j++)
                    {
                        ptr[j] = gemm->A_data[j * gemm->constantM + i];
                    }
                }
                gemm->A_data = A_data_transposed;
                gemm->transA = 0;
            }

            gemm->A_data_int8_scales.create(gemm->constantM);
            for (int i = 0; i < gemm->constantM; i++)
            {
                float absmax = 0.f;

                const float* ptr = (const float*)gemm->A_data + i * gemm->constantK;
                for (int j = 0; j < gemm->constantK; j++)
                {
                    absmax = std::max(absmax, (float)fabs(ptr[j]));
                }

                gemm->A_data_int8_scales[i] = absmax == 0.f ? 1.f : 127 / absmax;
            }

            ncnn::Mat A_data = gemm->A_data.reshape(gemm->constantK, gemm->constantM);
            ncnn::Mat A_data_int8;

            ncnn::Option opt_q = opt;
            opt_q.blob_allocator = A_data.allocator;
            opt_q.use_packing_layout = false;
            ncnn::quantize_to_int8(A_data, A_data_int8, gemm->A_data_int8_scales, opt_q);
            if (A_data_int8.empty())
                return -100;

            gemm->A_data = A_data_int8.reshape(gemm->constantK * gemm->constantM);
        }

        if (gemm->constantB)
        {
            if (gemm->transB == 0)
            {
                // transpose for easier quantization
                ncnn::Mat B_data_transposed(gemm->constantK * gemm->constantN);
                for (int i = 0; i < gemm->constantN; i++)
                {
                    float* ptr = (float*)B_data_transposed + i * gemm->constantK;
                    for (int j = 0; j < gemm->constantK; j++)
                    {
                        ptr[j] = gemm->B_data[j * gemm->constantN + i];
                    }
                }
                gemm->B_data = B_data_transposed;
                gemm->transB = 1;
            }

            const float* ptr = gemm->B_data;
            float absmax = 0.f;
            for (int j = 0; j < gemm->B_data.w; j++)
            {
                absmax = std::max(absmax, (float)fabs(ptr[j]));
            }

            gemm->B_data_int8_scale = absmax == 0.f ? 1.f : 127 / absmax;

            ncnn::Mat B_data_int8_scales(1);
            B_data_int8_scales[0] = gemm->B_data_int8_scale;

            ncnn::Mat B_data_int8;

            ncnn::Option opt_q = opt;
            opt_q.blob_allocator = gemm->B_data.allocator;
            opt_q.use_packing_layout = false;
            ncnn::quantize_to_int8(gemm->B_data, B_data_int8, B_data_int8_scales, opt_q);
            if (B_data_int8.empty())
                return -100;

            gemm->B_data = B_data_int8;
        }

        gemm->int8_scale_term = 2;
    }

    return 0;
}

int NetQuantize::fuse_requantize()
{
    const size_t layer_count = layers.size();
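A note on the scheme in this hunk: after the transposes, each row of A and each row of the (now transposed) B is a contiguous run of constantK values, so computing per-row statistics reduces to a linear scan. A gets one scale per output row (constantM scales), while B shares a single scale across the whole tensor. The following standalone sketch — illustration only, not ncnn API — shows the arithmetic these per-row scales imply, assuming round-to-nearest with saturation to [-127, 127]:

// Minimal sketch of symmetric per-row int8 quantization.
// Standalone illustration, not ncnn code; assumes round-to-nearest
// and saturation to [-127, 127].
#include <algorithm>
#include <cmath>
#include <vector>

static signed char float2int8_sketch(float v)
{
    int q = (int)std::round(v);
    return (signed char)std::min(std::max(q, -127), 127);
}

// Quantize an M x K row-major matrix, one scale per row.
void quantize_per_row(const std::vector<float>& x, int M, int K,
                      std::vector<signed char>& q, std::vector<float>& scales)
{
    q.resize(M * K);
    scales.resize(M);
    for (int i = 0; i < M; i++)
    {
        float absmax = 0.f;
        for (int j = 0; j < K; j++)
            absmax = std::max(absmax, std::fabs(x[i * K + j]));

        // scale maps the largest magnitude in the row to 127
        scales[i] = absmax == 0.f ? 1.f : 127 / absmax;

        for (int j = 0; j < K; j++)
            q[i * K + j] = float2int8_sketch(x[i * K + j] * scales[i]);
    }
}

Since scale = 127 / absmax maps the largest magnitude in a row to ±127, the original value is recovered as q / scale; that identity is what makes the per-row scales composable at inference time.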
@@ -861,6 +969,7 @@ int main(int argc, char** argv)
    quantizer.quantize_lstm();
    quantizer.quantize_gru();
    quantizer.quantize_embed();
    quantizer.quantize_gemm();

    quantizer.fuse_requantize();

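For intuition on how the stored scales are consumed downstream, here is a toy int8 GEMM — hypothetical code with a made-up name (gemm_int8_sketch), not ncnn's inference kernel — showing that an int32 accumulator over int8 products converts back to fp32 by dividing out both scales:

// Toy int8 GEMM, purely illustrative of the dequantization identity.
// C[i][j] ~= sum_k A[i][k] * B[j][k]   (B stored transposed, transB = 1)
#include <vector>

void gemm_int8_sketch(const std::vector<signed char>& qA, // M x K
                      const std::vector<signed char>& qB, // N x K (transposed)
                      const std::vector<float>& scaleA,   // one scale per row of A
                      float scaleB,                       // single scale for all of B
                      int M, int N, int K,
                      std::vector<float>& C)
{
    C.resize(M * N);
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            int acc = 0; // int32 accumulator of int8 products
            for (int k = 0; k < K; k++)
                acc += qA[i * K + k] * qB[j * K + k];

            // q = round(x * scale)  =>  x ~= q / scale, so divide out both scales
            C[i * N + j] = acc / (scaleA[i] * scaleB);
        }
    }
}

This sketch also suggests why the commit transposes B when transB == 0: with transB = 1, both operands are walked along contiguous K-length rows in the inner loop.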
