
[QNN EP] Update to QNN SDK 2.24.0 #21463

Merged
merged 9 commits on Jul 24, 2024
@@ -1,9 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cassert>
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/common/safeint.h"
@@ -24,6 +26,11 @@ class LayerNormOpBuilder : public BaseOpBuilder {
const logging::Logger& logger) const override final ORT_MUST_USE_RESULT;

protected:
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const override ORT_MUST_USE_RESULT;
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -55,6 +62,91 @@ Status LayerNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
}

Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
ORT_UNUSED_PARAMETER(do_op_validation);

const auto& inputs = node_unit.Inputs();
const auto input_count = inputs.size();
constexpr size_t X_IDX = 0;
constexpr size_t SCALE_IDX = 1;
constexpr size_t BIAS_IDX = 2;

// Input[0] (X, required)
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[X_IDX], logger, input_names));

// Input[1] (scale, required)
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[SCALE_IDX], logger, input_names));

// Input[2] (bias, optional)
const bool has_bias_input = input_count > BIAS_IDX && inputs[BIAS_IDX].node_arg.Exists();
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[BIAS_IDX], logger, input_names));
}

#if QNN_API_VERSION_MAJOR == 2 && QNN_API_VERSION_MINOR == 17
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.24 (QNN API version 2.17) has a validation bug for implicit bias inputs, so provide
// an explicit bias of all 0 (quantized int32).
TensorInfo x_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[X_IDX], x_input_info));

TensorInfo scale_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[SCALE_IDX], scale_input_info));

if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";

// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1);  // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
}
}
#endif

return Status::OK();
}
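The zero-bias workaround above derives the dummy bias's quantization parameters from the other two inputs: its scale is the product of X's per-tensor scale and each per-channel scale of the scale (gamma) input, with all zero-points fixed at 0. A minimal standalone sketch of that arithmetic with made-up values (not code from this PR):

// Sketch only: bias_scale[i] = x_scale * gamma_scale[i]; zero-points stay 0.
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  const float x_scale = 0.02f;                                    // hypothetical per-tensor scale of input[0] (X)
  const std::vector<float> gamma_scales = {0.5f, 0.25f, 0.125f};  // hypothetical per-channel scales of input[1]

  std::vector<float> bias_scales(gamma_scales.size());
  std::vector<int32_t> bias_offsets(gamma_scales.size(), 0);
  for (size_t i = 0; i < gamma_scales.size(); ++i) {
    bias_scales[i] = x_scale * gamma_scales[i];                   // {0.01f, 0.005f, 0.0025f}
  }
  return 0;
}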

Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
106 changes: 105 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -30,13 +30,117 @@
return *this;
}

// Construct per-tensor quantization params.
QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) {
params_.encodingDefinition = QNN_DEFINITION_DEFINED;
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
params_.scaleOffsetEncoding.scale = scale;
params_.scaleOffsetEncoding.offset = offset;
}

// Construct a per-channel quantization param.
QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> scales, gsl::span<const int32_t> offsets,
int32_t axis, bool is_int4) {
assert(scales.size() == offsets.size()); // Logic error if sizes don't match.
const uint32_t num_elems = static_cast<uint32_t>(scales.size());
params_.encodingDefinition = QNN_DEFINITION_DEFINED;

if (is_int4) {
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET;
params_.bwAxisScaleOffsetEncoding.numElements = num_elems;
params_.bwAxisScaleOffsetEncoding.axis = axis;
params_.bwAxisScaleOffsetEncoding.bitwidth = 4;

// Deep copy to the scales[] and offsets[] arrays
if (num_elems > 0) {
const size_t num_scale_bytes = num_elems * sizeof(float);
const size_t num_zp_bytes = num_elems * sizeof(int32_t);
const size_t num_bytes = num_scale_bytes + num_zp_bytes;
constexpr std::uintptr_t align = alignof(float);
static_assert(alignof(float) == alignof(int32_t));

per_channel_data_ = std::make_unique<char[]>(num_bytes + align);
char* scales_begin = ALIGN_PTR_UP(per_channel_data_.get(), align, char*);
char* zps_begin = scales_begin + num_scale_bytes;

std::memcpy(scales_begin, scales.data(), num_scale_bytes);
std::memcpy(zps_begin, offsets.data(), num_zp_bytes);
params_.bwAxisScaleOffsetEncoding.scales = reinterpret_cast<float*>(scales_begin);
params_.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast<int32_t*>(zps_begin);
} else {
params_.bwAxisScaleOffsetEncoding.scales = nullptr;
params_.bwAxisScaleOffsetEncoding.offsets = nullptr;
}
} else {
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET;
params_.axisScaleOffsetEncoding.numScaleOffsets = num_elems;
params_.axisScaleOffsetEncoding.axis = axis;

// Deep copy to the scaleOffset data.
if (num_elems > 0) {
const size_t num_bytes = num_elems * sizeof(Qnn_ScaleOffset_t);
constexpr std::uintptr_t align = alignof(Qnn_ScaleOffset_t);
per_channel_data_ = std::make_unique<char[]>(num_bytes + align);
Qnn_ScaleOffset_t* aligned_dst = ALIGN_PTR_UP(per_channel_data_.get(), align, Qnn_ScaleOffset_t*);

for (size_t i = 0; i < static_cast<uint32_t>(num_elems); i++) {
aligned_dst[i].offset = offsets[i];
aligned_dst[i].scale = scales[i];
}

params_.axisScaleOffsetEncoding.scaleOffset = aligned_dst;
} else {
params_.axisScaleOffsetEncoding.scaleOffset = nullptr;
}
}
}
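Both branches of this constructor deep-copy the caller's spans into a single heap allocation owned by per_channel_data_, so the raw pointers stored in params_ stay valid for the wrapper's lifetime regardless of what the caller does with its buffers. ALIGN_PTR_UP (defined elsewhere in this file, not shown in the diff) rounds the char buffer up to the element alignment; over-allocating by align bytes guarantees the rounded pointer still fits. A rough sketch of that alignment idea, assuming a power-of-two alignment:

// Sketch only: over-allocate by `align` bytes, then round the pointer up to the next multiple of `align`.
#include <cstddef>
#include <cstdint>
#include <memory>

int main() {
  constexpr std::uintptr_t align = alignof(float);
  const std::size_t num_bytes = 12 * sizeof(float);

  std::unique_ptr<char[]> storage = std::make_unique<char[]>(num_bytes + align);
  const std::uintptr_t raw = reinterpret_cast<std::uintptr_t>(storage.get());
  char* aligned = reinterpret_cast<char*>((raw + align - 1) & ~(align - 1));
  // `aligned` can now be reinterpret_cast to float* and hold the deep-copied scales.
  return 0;
}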

// Get a copy of scales. Works for both per-tensor and per-channel.
Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector<float>& scales) const {
ORT_RETURN_IF_NOT(params_.encodingDefinition == QNN_DEFINITION_DEFINED, "Unquantized qparams does not have scales");

switch (params_.quantizationEncoding) {
case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET:
scales.resize(1);
scales[0] = params_.scaleOffsetEncoding.scale;
break;
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET:
scales.resize(1);
scales[0] = params_.bwScaleOffsetEncoding.scale;
break;
case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
const uint32_t num_elems = params_.axisScaleOffsetEncoding.numScaleOffsets;
scales.resize(num_elems);

if (num_elems > 0) {
gsl::span<const Qnn_ScaleOffset_t> scale_offsets(params_.axisScaleOffsetEncoding.scaleOffset, num_elems);

for (size_t i = 0; i < num_elems; i++) {
scales[i] = scale_offsets[i].scale;
}
}
break;
}
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
const uint32_t num_elems = params_.bwAxisScaleOffsetEncoding.numElements;
scales.resize(num_elems);

// Deep copy the scales[] and offsets[] arrays
if (num_elems > 0) {
gsl::span<const float> src_scales(params_.bwAxisScaleOffsetEncoding.scales, num_elems);
for (size_t i = 0; i < num_elems; i++) {
scales[i] = src_scales[i];
}
}
break;
}
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ", params_.quantizationEncoding);

}

return Status::OK();
}
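Taken together, the new per-channel constructor and GetScales round-trip scale values. A hypothetical usage sketch (ExampleRoundTrip is not a function in this PR; Status, ORT_RETURN_IF_ERROR, and the wrapper type are assumed from this file's includes):

// Sketch only: build a per-channel (non-int4) quant param wrapper and read its scales back.
Status ExampleRoundTrip() {
  const std::vector<float> scales = {0.1f, 0.2f};
  const std::vector<int32_t> offsets = {0, 0};
  QnnQuantParamsWrapper qparams(scales, offsets, /*axis*/ 0, /*is_int4*/ false);

  std::vector<float> copied;
  ORT_RETURN_IF_ERROR(qparams.GetScales(copied));  // copied == {0.1f, 0.2f}
  return Status::OK();
}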

QnnQuantParamsWrapper QnnQuantParamsWrapper::Copy() const {
return QnnQuantParamsWrapper(*this);
}
@@ -199,7 +303,7 @@

params_.encodingDefinition = QNN_DEFINITION_DEFINED;
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET;
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(*(ort_quant_params->axis));
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
params_.bwAxisScaleOffsetEncoding.bitwidth = 4;
params_.bwAxisScaleOffsetEncoding.numElements = static_cast<uint32_t>(num_elems);

onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
@@ -3,6 +3,7 @@

#pragma once
#include <memory>
#include <vector>
#include "QnnTypes.h"
#include "core/common/common.h"
#include <gsl/gsl>
@@ -26,6 +27,9 @@ class QnnQuantParamsWrapper {
// Construct a per-tensor quantization param (SCALE_OFFSET)
QnnQuantParamsWrapper(float scale, int32_t offset);

// Construct a per-channel quantization param.
QnnQuantParamsWrapper(gsl::span<const float> scales, gsl::span<const int32_t> offsets, int32_t axis, bool is_int4);

Qnn_QuantizeParams_t& Get() { return params_; }
const Qnn_QuantizeParams_t& Get() const { return params_; }

@@ -54,6 +58,9 @@ class QnnQuantParamsWrapper {
(params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET));
}

// Get a copy of scales. Works for both per-tensor and per-channel.
Status GetScales(/*out*/ std::vector<float>& scales) const;

// Handle transposing of a per-channel quantized tensor. The quantization parameter's axis
// must be transposed using the inverse permutation of the Transpose.
template <typename IntType>