From f365ea71dc45b3aaa2e0809d1c3ac55d79421237 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Wed, 12 Jun 2019 00:35:09 +0000
Subject: [PATCH] [Relay] [Quantization] WIP - Prototyping the quantized
 convolution op

Goal - Act as a medium of discussion for pull request #2351

Features
- New quantized conv2D op in Relay
- Python API interface to instantiate the Relay op
- Infer Type implemented
- Lowering of quantized_conv op to low-level Relay ops

Discussion points
- Does the namespace look correct?
  - Relay op is called 'relay.op.nn._quantize.quantized_conv2d'
  - The idea is that any op under the '_quantize' namespace will go through a rewrite.
- Should we reuse Conv2DRel and Conv2DAttrs?
  - Tried prototyping. Found it hard to derive from the Conv2DAttrs struct.
  - Infer Type has a param field. This needs to come from the right datatype.

Missing implementation
- Lowering of quantized conv into conv+cast is incomplete.
  - Will work on it async. This is orthogonal to the discussion.
---
 include/tvm/relay/attrs/nn_quantize.h |   3 +
 src/relay/pass/quantize_rewrite.cc    | 118 +++++++++++++++++++++++++-
 2 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/include/tvm/relay/attrs/nn_quantize.h b/include/tvm/relay/attrs/nn_quantize.h
index 07029e8d76e2d..9cfa87d52507e 100644
--- a/include/tvm/relay/attrs/nn_quantize.h
+++ b/include/tvm/relay/attrs/nn_quantize.h
@@ -52,6 +52,7 @@ struct QuantizedConv2DAttrs : public tvm::AttrsNode<QuantizedConv2DAttrs> {
   double input_scale;
   double kernel_scale;
   double output_scale;
+  bool use_integer_computation_for_scale_handling;
 
   TVM_DECLARE_ATTRS(QuantizedConv2DAttrs, "relay.attrs.QuantizedConv2DAttrs") {
     TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
@@ -106,6 +107,8 @@ struct QuantizedConv2DAttrs : public tvm::AttrsNode<QuantizedConv2DAttrs> {
       .describe("The scale of the kernel tensor.");
     TVM_ATTR_FIELD(output_scale)
       .describe("The scale of the output tensor.");
+    TVM_ATTR_FIELD(use_integer_computation_for_scale_handling).set_default(false)
+      .describe("When true, integer computation is used to handle the output scale.");
   }
 
diff --git a/src/relay/pass/quantize_rewrite.cc b/src/relay/pass/quantize_rewrite.cc
index 38022dd15f3c7..a64c3e758a2bb 100644
--- a/src/relay/pass/quantize_rewrite.cc
+++ b/src/relay/pass/quantize_rewrite.cc
@@ -31,12 +31,62 @@ namespace tvm {
 namespace relay {
 
+Expr ConvolveQuantizedTensors(const Expr& quantized_data,
+    const Expr& quantized_kernel, const QuantizedConv2DAttrs*& param) {
+  // TODO(janimesh) - Who should decide the accumulation dtype?
+  if (param->input_zero_point == 0 && param->kernel_zero_point == 0) {
+    Expr int8_conv = Conv2D(quantized_data,
+        quantized_kernel,
+        param->strides,
+        param->padding,
+        param->dilation,
+        param->groups,
+        param->channels,
+        param->kernel_size,
+        param->data_layout,
+        param->kernel_layout,
+        param->out_layout,
+        Int(32));
+    return int8_conv;
+  }
+  LOG(FATAL) << "Only symmetric quantization supported";
+  return Expr();  // to hide the warning.
+}
+
+Expr ScaleHandling(const Expr& convolved_tensor,
+    const QuantizedConv2DAttrs*& param) {
+  // The scale handling can be done in many ways.
+  // 1) Floating point handling
+  //    Here we can multiply the convolved_tensor by the scale, round to the
+  //    nearest integer and then cast back to int32.
+  // 2) Integer only scale handling
+  //    Here, the computation is converted to a fixed point computation by
+  //    computing an output multiplier and shift. This is useful if the target
+  //    device does not support floating point computations or they are very
+  //    expensive.
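+  //    For example (illustration only, not used by the code below): with
+  //    input_scale = 0.5, kernel_scale = 0.1 and output_scale = 0.25, the
+  //    real multiplier is 0.5 * 0.1 / 0.25 = 0.2, which can be expressed as
+  //    the fixed point pair (round(0.8 * 2^31), right shift by 2), since
+  //    0.2 = 0.8 * 2^-2.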
+
+  if (param->use_integer_computation_for_scale_handling == false) {
+    double multiplier = (param->input_scale * param->kernel_scale) /
+                        param->output_scale;
+    auto scalar_multiplier = MakeConstantScalar(Float(32), multiplier);
+    auto casted_convolved_tensor = Cast(convolved_tensor, Float(32));
+    auto scaled_fp32_tensor = Multiply(casted_convolved_tensor, scalar_multiplier);
+    auto scaled_rounded_fp32_tensor = Round(scaled_fp32_tensor);
+    auto scaled_tensor = Cast(scaled_rounded_fp32_tensor, Int(32));
+    return scaled_tensor;
+  }
+  LOG(FATAL) << "Only floating point scale handling is supported for now.";
+  return Expr();  // to hide the warning.
+}
+
+Expr ReQuantize(const Expr& scaled_output,
+    const QuantizedConv2DAttrs*& param) {
+  Expr requantized_output = Cast(scaled_output, param->out_dtype);
+  return requantized_output;
+}
+
 Expr QuantizedConv2DForwardRewrite(const Call& ref_call,
     const Array<Expr>& new_args, const NodeRef& ctx) {
-  // TODO(janimesh) - This is not the right calculation. This only serves as a
-  // prototype to discuss the flow of lowering of quantization ops and
-  // namespaces.
   CHECK_EQ(new_args.size(), 2);
   Expr quantized_data = new_args[0];
   Expr quantized_kernel = new_args[1];
@@ -62,6 +112,68 @@ Expr QuantizedConv2DForwardRewrite(const Call& ref_call,
   // TODO(janimesh) - Look at the literature and use the right scale
   // calculations.
   return int8_conv;
+
+  // Check for current quantization support.
+  CHECK_EQ(param->input_zero_point, 0)
+      << "Encountered a non-zero zero point."
+      << " Only symmetric quantization is supported for now.";
+  CHECK_EQ(param->kernel_zero_point, 0)
+      << "Encountered a non-zero zero point."
+      << " Only symmetric quantization is supported for now.";
+  CHECK_EQ(param->output_zero_point, 0)
+      << "Encountered a non-zero zero point."
+      << " Only symmetric quantization is supported for now.";
+  CHECK_EQ(param->use_integer_computation_for_scale_handling, false)
+      << "Only floating point computation is currently supported for scale"
+      << " handling. Please set use_integer_computation_for_scale_handling to"
+      << " false if the HW supports floating point arithmetic.";
+
+  // Lowering of the quantized_convolution.
+  //
+  // For FP32, the conv output is
+  //   C = conv(A, W)
+  // or, C(n, oc, oh, ow) = A(n, ic, oh + r, ow + s) * W(oc, ic, r, s)
+  // where ic, r, s are the reduce axes.
+  //
+  // For quantized convolution, each tensor is represented in a quantized format
+  //   A = scale_a x (QA - zp_A)
+  // where QA is the quantized tensor, and scale_a and zp_A are the
+  // quantization params.
+  //
+  // For symmetric quantization, the zp_* for all tensors is 0.
+  // So, the quantized convolution becomes
+  //
+  //   scale_c * QC(n, oc, oh, ow) =
+  //       scale_a * QA(n, ic, oh + r, ow + s) x
+  //       scale_w * QW(oc, ic, r, s)
+  //
+  // So, to get the quantized tensor C, the computation is
+  //
+  //   QC(n, oc, oh, ow) = (scale_a * scale_w)/scale_c x
+  //       QA(n, ic, oh + r, ow + s) x QW(oc, ic, r, s)
+  //
+  // or,
+  //   QC = K * conv(QA, QW)
+  //
+  // For asymmetric computation, we can perform similar unrolling. We can find
+  // more details at
+  // https://discuss.tvm.ai/t/tf-lite-quantized-conv2d-operator-conversion/2651/8?u=janimesh
+
+  // The above computation is arranged in the following functions
+  // 1) ConvolveQuantizedTensors
+  //    a) For symmetric, conv(QA, QW).
+  //    b) For asymmetric, it involves 4 terms (sketched in the note after
+  //       this list).
+  // 2) ScaleHandling
+  //    a) Takes the convolved output and scales it.
+  //    b) Can support both float and integer computation.
+  // 3) Requantize
+  //    a) Converts the intermediate dtype back to int8.
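+
+  // Note (illustrative sketch, not implemented here): the 4 terms in 1(b)
+  // come from substituting QA - zp_a and QW - zp_w into the convolution and
+  // expanding over the reduce axes (ic, r, s):
+  //
+  //   conv(QA - zp_a, QW - zp_w) = conv(QA, QW)
+  //       - zp_w * sum_{ic, r, s} QA(n, ic, oh + r, ow + s)
+  //       - zp_a * sum_{ic, r, s} QW(oc, ic, r, s)
+  //       + zp_a * zp_w * (IC * R * S)
+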
+  Expr convolved_tensor = ConvolveQuantizedTensors(quantized_data,
+      quantized_kernel,
+      param);
+  Expr scaled_output = ScaleHandling(convolved_tensor, param);
+  Expr requantized_output = ReQuantize(scaled_output, param);
+  // TODO(janimesh) - Look at the literature and use the right scale
+  // calculations.
+  return requantized_output;
 }
 
 RELAY_REGISTER_OP("nn_quantized.quantized_conv2d")
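
For reviewers who want to sanity-check the lowering arithmetic spelled out in the comments above, below is a minimal, self-contained C++ sketch (not part of the patch) of the symmetric path: int32 accumulation, floating point scale handling, then a plain cast back to int8. All shapes, the NCHW/OIHW layouts and the scale values are made up for illustration, and saturation/clipping is deliberately omitted, mirroring the current WIP lowering.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int N = 1, IC = 2, H = 4, W = 4, OC = 2, KH = 3, KW = 3;
  const int OH = H - KH + 1, OW = W - KW + 1;
  const double input_scale = 0.5, kernel_scale = 0.1, output_scale = 0.25;

  // Made-up int8 data (NCHW) and kernel (OIHW) tensors.
  std::vector<std::int8_t> qa(N * IC * H * W), qw(OC * IC * KH * KW);
  for (int i = 0; i < static_cast<int>(qa.size()); ++i)
    qa[i] = static_cast<std::int8_t>(i % 7 - 3);
  for (int i = 0; i < static_cast<int>(qw.size()); ++i)
    qw[i] = static_cast<std::int8_t>(i % 5 - 2);

  // 1) ConvolveQuantizedTensors: int8 x int8 products accumulated in int32.
  std::vector<std::int32_t> acc(N * OC * OH * OW, 0);
  for (int n = 0; n < N; ++n)
    for (int oc = 0; oc < OC; ++oc)
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
          for (int ic = 0; ic < IC; ++ic)
            for (int r = 0; r < KH; ++r)
              for (int s = 0; s < KW; ++s)
                acc[((n * OC + oc) * OH + oh) * OW + ow] +=
                    static_cast<std::int32_t>(qa[((n * IC + ic) * H + oh + r) * W + ow + s]) *
                    static_cast<std::int32_t>(qw[((oc * IC + ic) * KH + r) * KW + s]);

  // 2) ScaleHandling (floating point path): scale by
  //    input_scale * kernel_scale / output_scale and round.
  // 3) ReQuantize: cast back to int8 (no clipping, as in the WIP lowering).
  const double multiplier = input_scale * kernel_scale / output_scale;
  std::vector<std::int8_t> qc(acc.size());
  for (std::size_t i = 0; i < acc.size(); ++i)
    qc[i] = static_cast<std::int8_t>(std::lround(acc[i] * multiplier));

  std::cout << "qc[0] = " << static_cast<int>(qc[0]) << "\n";
  return 0;
}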