diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index c99ec8b69a3..adb22b1d79b 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -193,6 +193,8 @@ int entry(int argc, char **argv)
              "This will convert single input Transpose to Reshape");
   add_switch(arser, "--expand_broadcast_const", "This will expand broadcastable constant inputs");
   add_switch(arser, "--unroll_unidirseqlstm", "Unroll UnidirectionalSequenceLSTM operator.");
+  add_switch(arser, "--compress_weights_huffman",
+             "Lossless weights compression with Huffman encoding.");
   add_switch(arser, "--convert_nchw_to_nhwc",
              "Experimental: This will convert NCHW operators to NHWC under the assumption that "
              "input model is NCHW.");
@@ -343,6 +345,7 @@ int entry(int argc, char **argv)
   option_str_to_enum["decompose_softmax"] = Algorithms::DecomposeSoftmaxPass;
   option_str_to_enum["expand_broadcast_const"] = Algorithms::ExpandBroadcastConst;
   option_str_to_enum["unroll_unidirseqlstm"] = Algorithms::UnrollUnidirSeqLSTM;
+  option_str_to_enum["compress_weights_huffman"] = Algorithms::CompressWeightsHuffman;
   // clang-format on

   if (arser.get<bool>("--verbose"))
diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
index f118ee22c24..91ca85380c4 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -18,6 +18,7 @@
 #define LUCI_INTERPRETER_CORE_TENSOR_H

 #include "luci_interpreter/core/DataType.h"
+#include
 #include
 #include
@@ -146,6 +147,8 @@ class Tensor

   void resize(const Shape &new_shape);

+  void resize(const Shape &new_shape, size_t raw_size);
+
   void set_data_buffer(uint8_t *buffer)
   {
     if (buffer == nullptr)
@@ -173,11 +176,21 @@ class Tensor

   void set_offset(int32_t offset) { _offset = offset; }

+  luci::CompressionType get_compression() const { return _compression; }
+
+  void set_compression(luci::CompressionType compression) { _compression = compression; }
+
+  size_t get_raw_size(void) const { return _raw_size; }
+  void set_raw_size(size_t size) { _raw_size = size; }
+
 private:
   DataType _element_type;
   Shape _shape;
   AffineQuantization _quantization;
   uint8_t *_data = nullptr;
+  // Used for compressed/sparse tensors when size != WxHxLxD
+  size_t _raw_size{0};
+
   std::string _name;
   bool _data_allocated = false;
   // Write of tensor is reported to registered Observers only if this tensor is observable
@@ -190,6 +203,8 @@ class Tensor
   // Used by static memory manager.
   // Stores the offset from the beginning of the allocated memory buffer.
   int32_t _offset = -1;
+
+  luci::CompressionType _compression{luci::CompressionType::NONE};
 };

 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h b/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h
new file mode 100644
index 00000000000..6a8dd712b7c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__ +#define __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace luci_interpreter_pal +{ + +namespace huffman +{ +template struct Node +{ + Node *p_left = nullptr; + Node *p_right = nullptr; + T data; + unsigned int freq; +}; + +template class HuffmanDecoder +{ +private: + Node *root = nullptr; + std::unordered_map huffmanCode; + std::vector encoded_bitset{}; + std::size_t nodes_count = 0; + +private: + Node *allocateNode(T data, unsigned int freq, Node *p_left, Node *p_right) + { + Node *node = new Node; + node->data = data; + node->freq = freq; + node->p_left = p_left; + node->p_right = p_right; + nodes_count++; + return node; + } + + std::string exportHuffmanTreeToString(Node *node) + { + if (node == nullptr) + return ""; + if (!node->p_left && !node->p_right) + { + return "0" + std::bitset(node->data).to_string(); + } + std::string tmp = "1"; + tmp += exportHuffmanTreeToString(node->p_left); + tmp += exportHuffmanTreeToString(node->p_right); + return tmp; + } + + Node *importHuffmanTreeFromBoolVec(std::vector &vec, size_t &index) + { + if (vec.empty()) + return nullptr; + if (vec[index]) + { + index++; + Node *p_left = importHuffmanTreeFromBoolVec(vec, index); + Node *p_right = importHuffmanTreeFromBoolVec(vec, index); + return allocateNode(0, 0, p_left, p_right); + } + else if (vec[index] == false) + { + index++; + T tmp = 0; + for (size_t i = 0; i < sizeof(T) * CHAR_BIT; ++i) + { + if (vec[index++]) + tmp |= (1 << (sizeof(T) * CHAR_BIT - 1)) >> i; + } + + return allocateNode(tmp, 0, nullptr, nullptr); + } + return nullptr; + } + + Node *importHuffmanTreeFromString(std::string &str) + { + + if (str.substr(0, 1) == "1") + { + str = str.substr(1); + Node *p_left = importHuffmanTreeFromString(str); + Node *p_right = importHuffmanTreeFromString(str); + return allocateNode(0, 0, p_left, p_right); + } + else if (str.substr(0, 1) == "0") + { + str = str.substr(1); + std::bitset tmp(str.substr(0, sizeof(T) * CHAR_BIT)); + str = str.substr(sizeof(T) * CHAR_BIT); + return allocateNode(static_cast(tmp.to_ullong()), 0, nullptr, nullptr); + } + } + + void buildHuffmanTable(Node *node, const std::string str = "") + { + if (node == nullptr) + return; + + if (!node->p_left && !node->p_right) + { + huffmanCode[node->data] = str; + } + + buildHuffmanTable(node->p_left, str + "0"); + buildHuffmanTable(node->p_right, str + "1"); + } + + void decode(Node *node, std::string &str, std::vector &out_vec, size_t &index) + { + if (node == nullptr) + { + return; + } + + if (!node->p_left && !node->p_right) + { + out_vec.push_back(node->data); + return; + } + + if (str.size() == index) + return; + if (str[index] == '0') + { + decode(node->p_left, str, out_vec, ++index); + } + else + { + decode(node->p_right, str, out_vec, ++index); + } + } + + struct EncodedTreeAndData + { + std::vector tree_vec{}; + std::vector data_vec{}; + }; + + EncodedTreeAndData unpackArrayToEncodedTreeAndData(const uint8_t *pack_ptr) + { + constexpr auto kTreeSizeBytesN = 
sizeof(size_t); + constexpr auto kDataSizeBytesN = sizeof(size_t); + + const std::bitset tree_size_bitset( + *static_cast(static_cast(pack_ptr))); + const std::bitset data_size_bitset( + *static_cast(static_cast(pack_ptr + kTreeSizeBytesN))); + + const size_t kTreeSizeInBits = static_cast(tree_size_bitset.to_ullong()); + const size_t kDataSizeInBits = static_cast(data_size_bitset.to_ullong()); + + auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; + EncodedTreeAndData tree_and_data; + + const auto kTreeSizeInBytes = + kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT; + + for (size_t i = 0; i < kTreeSizeInBytes; ++i) + { + const auto kNumOfBits = + kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT; + for (size_t j = 0; j < kNumOfBits; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.tree_vec.push_back(true); + else + tree_and_data.tree_vec.push_back(false); + } + } + const auto kDataSizeInBytes = + kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT; + const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; + start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; + + for (size_t i = 0; i < kDataSizeInBytes; ++i) + { + const auto kNumOfBits = + kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT; + const auto kBitsInFirstByteToRead = + kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits; + for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j) + { + + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + if (kNumOfBits < CHAR_BIT - kOffsetInBits) + break; + const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits + ? kNumOfBits - (CHAR_BIT - kOffsetInBits) + : kOffsetInBits; + for (size_t j = 0; j < kBitsLeft; ++j) + { + + if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + return tree_and_data; + } + + EncodedTreeAndData unpackArrayToEncodedTreeAndData(const std::vector &packed_vec) + { + constexpr auto kTreeSizeBytesN = sizeof(size_t); + constexpr auto kDataSizeBytesN = sizeof(size_t); + const uint8_t *pack_ptr = packed_vec.data(); + const std::bitset tree_size_bitset( + *static_cast(static_cast(pack_ptr))); + const std::bitset data_size_bitset( + *static_cast(static_cast(pack_ptr + kTreeSizeBytesN))); + + const size_t kTreeSizeInBits = static_cast(tree_size_bitset.to_ullong()); + const size_t kDataSizeInBits = static_cast(data_size_bitset.to_ullong()); + + auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; + EncodedTreeAndData tree_and_data; + + const auto kTreeSizeInBytes = + kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT; + + for (size_t i = 0; i < kTreeSizeInBytes; ++i) + { + const auto kNumOfBits = + kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT; + for (size_t j = 0; j < kNumOfBits; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + const auto kDataSizeInBytes = + kDataSizeInBits % CHAR_BIT ? 
kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT; + const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; + start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; + + for (size_t i = 0; i < kDataSizeInBytes; ++i) + { + const auto kNumOfBits = + kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT; + const auto kBitsInFirstByteToRead = + kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits; + for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j) + { + + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + if (kNumOfBits < CHAR_BIT - kOffsetInBits) + break; + const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits + ? kNumOfBits - (CHAR_BIT - kOffsetInBits) + : kOffsetInBits; + for (size_t j = 0; j < kBitsLeft; ++j) + { + + if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + return tree_and_data; + } + +public: + void decode(Node *node, std::vector &vec, T *dst_ptr) + { + if (node == nullptr) + { + return; + } + + if (!node->p_left && !node->p_right) + { + *dst_ptr = node->data; + return; + } + + if (vec.size() == _decode_idx) + return; + if (vec[_decode_idx] == false) + { + ++_decode_idx; + decode(node->p_left, vec, dst_ptr); + } + else + { + ++_decode_idx; + decode(node->p_right, vec, dst_ptr); + } + } + +private: + size_t _decode_idx = 0; + EncodedTreeAndData _encoded_tree_and_data; + +public: + void init_decoder(const uint8_t *input) + { + size_t index = 0; + _encoded_tree_and_data = unpackArrayToEncodedTreeAndData(input); + root = importHuffmanTreeFromBoolVec(_encoded_tree_and_data.tree_vec, index); + } + + void reset_decode_idx(void) { _decode_idx = 0; } + + int decode_n(uint8_t *dst_ptr, size_t num) + { + size_t bytes_decoded = 0; + for (size_t i = 0; i < num && _decode_idx < _encoded_tree_and_data.data_vec.size(); ++i) + { + decode(root, _encoded_tree_and_data.data_vec, dst_ptr + bytes_decoded); + bytes_decoded++; + } + return bytes_decoded; + } + + HuffmanDecoder() = default; +}; +} // namespace huffman +} // namespace luci_interpreter_pal +#endif // __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__ diff --git a/compiler/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-interpreter/pal/linux/PALConv2d.h index 0ce83fc6e35..4d0f3a37774 100644 --- a/compiler/luci-interpreter/pal/linux/PALConv2d.h +++ b/compiler/luci-interpreter/pal/linux/PALConv2d.h @@ -19,6 +19,7 @@ #include #include +#include "HuffmanDecoder.h" namespace luci_interpreter_pal { @@ -84,6 +85,135 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS scratchpad_data, gemmlowp_context.get()); } +template +void ConvPerChannelHuffman(const tflite::ConvParams ¶ms, const int32_t *mult, + const int32_t *shifts, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32 *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + // Get parameters. 
+ const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + const int32_t filter_offset = params.weights_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + huffman::HuffmanDecoder decoder; + decoder.init_decoder(reinterpret_cast(filter_data)); + decoder.reset_decode_idx(); + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + auto group = out_channel / filters_per_group; + + // extract compressed filter + decoder.decode_n(reinterpret_cast(&scratchpad_data[0]), scratchpad_shape.FlatSize()); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + int32_t acc = 0; + + for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel) + { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = + scratchpad_data[(filter_y * filter_height + filter_x) * filter_width + + in_channel]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. 
int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // accumulator depth is smaller than 2^16. + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + } + } + + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = tflite::MultiplyByQuantizedMultiplier(acc, mult[out_channel], shifts[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast(acc); + } + } + } + } +} + static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, const int32_t *shifts, const tflite::RuntimeShape &input_shape, const int8 *input_data, const tflite::RuntimeShape &filter_shape, @@ -105,7 +235,8 @@ static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, const tflite::ConvParams ¶ms, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &filter_shape, - const tflite::RuntimeShape &output_shape) + const tflite::RuntimeShape &output_shape, + bool is_compressed = false) { const int32_t filter_height = filter_shape.Dims(1); const int32_t filter_width = filter_shape.Dims(2); @@ -117,7 +248,7 @@ static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, const bool need_non_dilated_scratchpad = params.stride_height != 1 || params.stride_width != 1 || filter_height != 1 || filter_width != 1; auto _need_scratchpad = input_data_type != luci_interpreter::DataType::S16 && - (need_dilated_scratchpad || need_non_dilated_scratchpad); + (need_dilated_scratchpad || need_non_dilated_scratchpad || is_compressed); if (_need_scratchpad) { diff --git a/compiler/luci-interpreter/src/SimpleMemoryManager.cpp b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp index a39c34a0ad8..bf13b0cc9a8 100644 --- a/compiler/luci-interpreter/src/SimpleMemoryManager.cpp +++ b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp @@ -29,12 +29,21 @@ void SimpleMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor) { release_memory(tensor); } - const auto element_size = getDataTypeSize(tensor.element_type()); + size_t bytes_to_allocate = 0; + if (tensor.get_raw_size() > 0) + { + bytes_to_allocate = tensor.get_raw_size(); + } + else + { + const auto element_size = getDataTypeSize(tensor.element_type()); - // Use large_num_elements to avoid overflow - const auto num_elements = tensor.shape().large_num_elements(); + // Use large_num_elements to avoid overflow + const auto num_elements = tensor.shape().large_num_elements(); + bytes_to_allocate = num_elements * element_size; + } - auto *data = new uint8_t[num_elements * element_size]; + auto *data = new uint8_t[bytes_to_allocate]; tensor.set_data_buffer(data); } diff --git a/compiler/luci-interpreter/src/core/Tensor.cpp b/compiler/luci-interpreter/src/core/Tensor.cpp index 3c3c5ffffe8..b7769174e23 100644 --- a/compiler/luci-interpreter/src/core/Tensor.cpp +++ b/compiler/luci-interpreter/src/core/Tensor.cpp @@ -45,14 +45,34 @@ void Tensor::writeData(const void *data_ptr, 
size_t data_size) { const size_t element_size = getDataTypeSize(element_type()); const int32_t num_elements = shape().num_elements(); - if (data_size != num_elements * element_size) + if (_raw_size > 0) { - throw std::invalid_argument("Invalid data size."); + if (data_size != _raw_size) + { + throw std::invalid_argument("Invalid data size."); + } + } + else + { + if (data_size != num_elements * element_size) + { + throw std::invalid_argument("Invalid data size."); + } } assert(data_ptr != nullptr); std::memcpy(data(), data_ptr, data_size); } -void Tensor::resize(const Shape &new_shape) { _shape = new_shape; } +void Tensor::resize(const Shape &new_shape) +{ + _shape = new_shape; + _raw_size = 0; +} + +void Tensor::resize(const Shape &new_shape, size_t raw_size) +{ + _shape = new_shape; + _raw_size = raw_size; +} } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp index 9aae9da2644..a5377408adc 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp +++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp @@ -117,9 +117,10 @@ void Conv2D::configure() params.dilation_height_factor = _params.dilation_height_factor; params.dilation_width_factor = _params.dilation_width_factor; auto scratchpad = getOutputTensors()[1]; + bool is_compressed = filter()->get_compression() != luci::CompressionType::NONE; luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), params, getTensorShape(input()), getTensorShape(filter()), - getTensorShape(output())); + getTensorShape(output()), is_compressed); switch (_params.activation) { @@ -145,20 +146,34 @@ void Conv2D::execute() const } throw std::runtime_error("luci-intp Conv2D(2) Unsupported type."); case DataType::U8: - if (filter()->scales().size() == 1) + if (filter()->get_compression() == luci::CompressionType::HUFFMAN) { - evalQuantized(); + evalQuantizedU8PerChannelHuffman(); } - else if (filter()->scales().size() > 1) + else { - LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4); - LUCI_INTERPRETER_CHECK(filter()->scales().size() == - static_cast(filter()->shape().dim(0))); - evalQuantizedPerChannel(); + if (filter()->scales().size() == 1) + { + evalQuantized(); + } + else if (filter()->scales().size() > 1) + { + LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4); + LUCI_INTERPRETER_CHECK(filter()->scales().size() == + static_cast(filter()->shape().dim(0))); + evalQuantizedPerChannel(); + } } break; case DataType::S8: - evalQuantizedS8PerChannel(); + if (filter()->get_compression() == luci::CompressionType::HUFFMAN) + { + evalQuantizedS8PerChannelHuffman(); + } + else + { + evalQuantizedS8PerChannel(); + } break; case DataType::S16: evalQuantizedS16(); @@ -321,6 +336,120 @@ void Conv2D::evalQuantizedPerChannel() const } } +// TODO: remove code duplication with S8 +void Conv2D::evalQuantizedU8PerChannelHuffman() const +{ + int32_t activation_min{}; + int32_t activation_max{}; + calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max); + + tflite::ConvParams params{}; + params.padding_values.height = _padding_height; + params.padding_values.width = _padding_width; + params.stride_height = _params.stride_height; + params.stride_width = _params.stride_width; + params.dilation_height_factor = _params.dilation_height_factor; + params.dilation_width_factor = _params.dilation_width_factor; + // The kernel expects filter zero points to be negated. 
+ params.input_offset = -input()->zero_point(); // Note the '-'. + params.weights_offset = -filter()->zero_point(); // Unused in tflite code + params.output_offset = output()->zero_point(); + params.quantized_activation_min = activation_min; + params.quantized_activation_max = activation_max; + + const std::vector effective_output_scales = + getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale()); + + std::vector quant_multipliers = + quantizeMultipliers(effective_output_scales); + + std::vector shifts; + std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts), + [](ChannelQuantMultipliers cm) { return cm.shift; }); + std::vector multipliers; + std::transform(quant_multipliers.begin(), quant_multipliers.end(), + std::back_inserter(multipliers), + [](ChannelQuantMultipliers cm) { return cm.multiplier; }); + + auto scratchpad = getOutputTensors()[1]; + uint8_t *scratchpad_data = nullptr; + + // Scratchpad used for decompression + const auto filter_shape = getTensorShape(filter()); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + auto scratchpad_shape = Shape({filter_height, filter_width, filter_input_depth}); + + if (scratchpad->is_allocatable()) + { + scratchpad->resize(scratchpad_shape); + scratchpad_data = scratchpad->data(); + } + luci_interpreter_pal::ConvPerChannelHuffman( + params, multipliers.data(), shifts.data(), getTensorShape(input()), + getTensorData(input()), getTensorShape(filter()), getTensorData(filter()), + getTensorShape(bias()), getTensorData(bias()), getTensorShape(output()), + getTensorData(output()), getTensorShape(scratchpad), scratchpad_data); +} + +void Conv2D::evalQuantizedS8PerChannelHuffman() const +{ + int32_t activation_min{}; + int32_t activation_max{}; + calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max); + + tflite::ConvParams params{}; + params.padding_values.height = _padding_height; + params.padding_values.width = _padding_width; + params.stride_height = _params.stride_height; + params.stride_width = _params.stride_width; + params.dilation_height_factor = _params.dilation_height_factor; + params.dilation_width_factor = _params.dilation_width_factor; + // The kernel expects filter zero points to be negated. + params.input_offset = -input()->zero_point(); // Note the '-'. 
+ params.weights_offset = 0; // Unused in tflite code + params.output_offset = output()->zero_point(); + params.quantized_activation_min = activation_min; + params.quantized_activation_max = activation_max; + + const std::vector effective_output_scales = + getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale()); + + std::vector quant_multipliers = + quantizeMultipliers(effective_output_scales); + + std::vector shifts; + std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts), + [](ChannelQuantMultipliers cm) { return cm.shift; }); + std::vector multipliers; + std::transform(quant_multipliers.begin(), quant_multipliers.end(), + std::back_inserter(multipliers), + [](ChannelQuantMultipliers cm) { return cm.multiplier; }); + + auto scratchpad = getOutputTensors()[1]; + int8_t *scratchpad_data = nullptr; + + // Scratchpad used for decompression + const auto filter_shape = getTensorShape(filter()); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + auto scratchpad_shape = Shape({filter_height, filter_width, filter_input_depth}); + + if (scratchpad->is_allocatable()) + { + scratchpad->resize(scratchpad_shape); + scratchpad_data = scratchpad->data(); + } + + luci_interpreter_pal::ConvPerChannelHuffman( + params, multipliers.data(), shifts.data(), getTensorShape(input()), + getTensorData(input()), getTensorShape(filter()), getTensorData(filter()), + getTensorShape(bias()), getTensorData(bias()), getTensorShape(output()), + getTensorData(output()), getTensorShape(scratchpad), scratchpad_data); +} + void Conv2D::evalQuantizedS8PerChannel() const { int32_t activation_min{}; diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-interpreter/src/kernels/Conv2D.h index 330bf3a2a69..096bd85f4db 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.h +++ b/compiler/luci-interpreter/src/kernels/Conv2D.h @@ -47,6 +47,8 @@ class Conv2D : public KernelWithParams void evalQuantizedPerChannel() const; void evalQuantizedS8PerChannel() const; void evalQuantizedS16() const; + void evalQuantizedS8PerChannelHuffman() const; + void evalQuantizedU8PerChannelHuffman() const; private: int32_t _padding_height{}; diff --git a/compiler/luci-interpreter/src/kernels/Utils.h b/compiler/luci-interpreter/src/kernels/Utils.h index e975585cdf3..422c0b4d7d8 100644 --- a/compiler/luci-interpreter/src/kernels/Utils.h +++ b/compiler/luci-interpreter/src/kernels/Utils.h @@ -137,7 +137,8 @@ Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_ inline double getQuantizedConvolutionMultipler(float input_scale, float filter_scale, float output_scale) { - const double input_product_scale = static_cast(input_scale * filter_scale); + const double input_product_scale = + static_cast(static_cast(input_scale) * static_cast(filter_scale)); LUCI_INTERPRETER_CHECK(input_product_scale >= 0); return input_product_scale / static_cast(output_scale); } diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp index cf83713d906..6e1399dd467 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp +++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp @@ -243,9 +243,11 @@ void GraphLoader::loadTensors() const void *const_data = getNodeData(const_node, &data_size); if (const_data != nullptr) { + tensor->set_raw_size(data_size); 
_memory_manager->allocate_memory(*tensor); tensor->writeData(const_data, data_size); } + tensor->set_compression(const_node->compression()); } else if (const auto *custom_out_node = dynamic_cast(node)) { @@ -258,6 +260,7 @@ void GraphLoader::loadTensors() const void *const_data = getNodeData(custom_node, &data_size); if (const_data != nullptr) { + tensor->set_raw_size(data_size); _memory_manager->allocate_memory(*tensor); tensor->writeData(const_data, data_size); } diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst index 8328948b937..d2ad2d41742 100644 --- a/compiler/luci-pass-value-py-test/test.lst +++ b/compiler/luci-pass-value-py-test/test.lst @@ -7,6 +7,7 @@ # eval(Net_Preactivation_BN_000 fuse_preactivation_batchnorm) : value diff exist # --> https://github.com/Samsung/ONE/issues/5782 +eval(Conv2D_U8_000 compress_weights_huffman) eval(FullyConnected_007 replace_non_const_fc_with_batch_matmul) eval(HardSwish_001 decompose_hardswish) eval(Net_Add_FloorMod_Gather_000 remove_gather_guard) diff --git a/compiler/luci/export/src/CircleExporterUtils.cpp b/compiler/luci/export/src/CircleExporterUtils.cpp index f6e380d7872..13889f17f89 100644 --- a/compiler/luci/export/src/CircleExporterUtils.cpp +++ b/compiler/luci/export/src/CircleExporterUtils.cpp @@ -25,6 +25,21 @@ namespace luci { +circle::CompressionType to_circle_compressiontype(luci::CompressionType type) +{ + switch (type) + { + case luci::CompressionType::UNDEFINED: + case luci::CompressionType::NONE: + return circle::CompressionType_NONE; + case luci::CompressionType::HUFFMAN: + return circle::CompressionType_HUFFMAN; + default: + INTERNAL_EXN_V("trying to convert unsupported luci::WeightCompression", + oops::to_uint32(type)); + } +} + circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func) { switch (func) diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h index 6d0ebd6cb29..970c7555a3f 100644 --- a/compiler/luci/export/src/CircleExporterUtils.h +++ b/compiler/luci/export/src/CircleExporterUtils.h @@ -32,6 +32,7 @@ inline constexpr uint64_t FLATBUFFERS_SIZE_MAX = 2147483648UL; // 2GB namespace luci { +circle::CompressionType to_circle_compressiontype(luci::CompressionType type); circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func); circle::TensorType to_circle_tensortype(loco::DataType type); circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode); diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp index fa585d06e81..8d8ab851c74 100644 --- a/compiler/luci/export/src/CircleTensorExporter.cpp +++ b/compiler/luci/export/src/CircleTensorExporter.cpp @@ -578,30 +578,48 @@ bool has_same_values(luci::CircleConst *lhs, luci::CircleConst *rhs) switch (lhs->dtype()) { case loco::DataType::FLOAT32: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::S4: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::S8: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::S16: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::S32: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::S64: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, 
rhs); case loco::DataType::U4: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::U8: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); case loco::DataType::BOOL: + if (lhs->size() != rhs->size()) + return false; return has_same_elements(lhs, rhs); default: @@ -668,8 +686,14 @@ void exportOpDefinedTensor(const CircleTensorInfo &info, FlatBufferBuilder &buil auto is_variable = info.is_variable(); - auto tensor_offset = CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset, - quantparam, is_variable, sparsityparam, shape_signature_offset); + luci::CircleConst *content = info.content(); + auto compression_type = circle::CompressionType_NONE; + if (content) + compression_type = to_circle_compressiontype(info.content()->compression()); + + auto tensor_offset = + CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset, quantparam, + is_variable, sparsityparam, shape_signature_offset, false, 0, compression_type); gd._tensors.push_back(tensor_offset); } diff --git a/compiler/luci/import/include/luci/Import/CircleImporterUtils.h b/compiler/luci/import/include/luci/Import/CircleImporterUtils.h new file mode 100644 index 00000000000..f96ec210747 --- /dev/null +++ b/compiler/luci/import/include/luci/Import/CircleImporterUtils.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CIRCLE_IMPORTER_UTILS_H__ +#define __CIRCLE_IMPORTER_UTILS_H__ + +#include + +#include + +#include + +namespace luci +{ + +luci::CompressionType from_circle_compressiontype(circle::CompressionType type); + +} // namespace luci + +#endif // __CIRCLE_IMPORTER_UTILS_H__ diff --git a/compiler/luci/import/src/CircleImporterUtils.cpp b/compiler/luci/import/src/CircleImporterUtils.cpp new file mode 100644 index 00000000000..2e4f97ef27d --- /dev/null +++ b/compiler/luci/import/src/CircleImporterUtils.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "luci/Import/CircleImporterUtils.h"
+
+#include
+
+namespace luci
+{
+
+luci::CompressionType from_circle_compressiontype(circle::CompressionType type)
+{
+  switch (type)
+  {
+    case circle::CompressionType_NONE:
+      return luci::CompressionType::NONE;
+    case circle::CompressionType_HUFFMAN:
+      return luci::CompressionType::HUFFMAN;
+    default:
+      INTERNAL_EXN_V("trying to convert unsupported luci::WeightCompression",
+                     oops::to_uint32(type));
+  }
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index 392f0c2a5b9..ccf3e0f7e5b 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -15,6 +15,7 @@
  */

 #include "luci/Import/CircleReader.h"
+#include

 #include
@@ -289,6 +290,11 @@ void copy_tensor_attributes(const circle::Tensor *tensor, CircleNode *node)
     if (sparsityparam)
       node->sparsityparam(std::move(sparsityparam));
   }
+
+  auto const_node = dynamic_cast<luci::CircleConst *>(node);
+  if (const_node)
+  {
+    const_node->compression(luci::from_circle_compressiontype(tensor->compression_type()));
+  }
 }

 std::string fb_string2std_string(const flatbuffers::String *fb_str)
diff --git a/compiler/luci/import/src/Nodes/CircleConst.cpp b/compiler/luci/import/src/Nodes/CircleConst.cpp
index 945a8dc98b7..83639ae2d59 100644
--- a/compiler/luci/import/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConst.cpp
@@ -57,7 +57,7 @@ void copy_data(const VectorWrapper<uint8_t> &raw_data, uint32_t num_elements,
   using T = typename loco::DataTypeImpl
<DT>::Type;

   // TODO calculate the exact buffer size of sparse tensor
-  if (const_node->sparsityparam())
+  if (const_node->sparsityparam() or const_node->compression() != luci::CompressionType::NONE)
   {
     num_elements = raw_data.size() / sizeof(T);
   }
diff --git a/compiler/luci/import/src/Nodes/CircleConv2D.cpp b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
index 8cbecdc003b..35b59e48b4a 100644
--- a/compiler/luci/import/src/Nodes/CircleConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
@@ -15,6 +15,7 @@
  */

 #include "luci/Import/Nodes/CircleConv2D.h"
+#include "luci/Import/CircleImporterUtils.h"

 #include
diff --git a/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h b/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h
new file mode 100644
index 00000000000..e1a83b01908
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
+#define __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
+
+namespace luci
+{
+
+enum class CompressionType
+{
+  UNDEFINED, // This is not defined by TFLite or Circle. This was added to
+             // prevent programming errors.
+  NONE,
+  HUFFMAN
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
index 3e9a274e0cd..2f59b73b3fa 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
@@ -17,6 +17,7 @@
 #ifndef __LUCI_IR_CIRCLECONST_H__
 #define __LUCI_IR_CIRCLECONST_H__

+#include "luci/IR/AttrWeightCompression.h"
 #include "luci/IR/CircleNodeDecl.h"
 #include "luci/IR/CircleOpcode.h"

@@ -42,10 +43,14 @@ class CircleConst final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLECONST>>
   template <loco::DataType DT> const typename loco::DataTypeImpl
<DT>::Type &scalar(void) const;
   template <loco::DataType DT> typename loco::DataTypeImpl<DT>
::Type &scalar(void); + CompressionType compression(void) const; + void compression(CompressionType c); + private: std::vector _data; // TODO use _data for STRING and remove _strings std::vector _strings; // for STRING type + CompressionType _compression{CompressionType::NONE}; }; } // namespace luci diff --git a/compiler/luci/lang/src/Nodes/CircleConst.cpp b/compiler/luci/lang/src/Nodes/CircleConst.cpp index c17a4e2c36d..54f23fbf175 100644 --- a/compiler/luci/lang/src/Nodes/CircleConst.cpp +++ b/compiler/luci/lang/src/Nodes/CircleConst.cpp @@ -21,6 +21,10 @@ namespace luci { +CompressionType CircleConst::compression(void) const { return _compression; } + +void CircleConst::compression(luci::CompressionType c) { _compression = c; } + template uint32_t CircleConst::size(void) const { assert(dtype() == DT); diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h index d4f675f36fe..8c90230252c 100644 --- a/compiler/luci/pass/include/luci/CircleOptimizer.h +++ b/compiler/luci/pass/include/luci/CircleOptimizer.h @@ -114,6 +114,7 @@ class CircleOptimizer final UnrollUnidirSeqLSTM, XpSepActFromTransposeConv, RemoveGatherGuard, + CompressWeightsHuffman }; enum AlgorithmParameters diff --git a/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h b/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h new file mode 100644 index 00000000000..f9f97791914 --- /dev/null +++ b/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_COMPRESS_WEIGHTS_PASS_H__ +#define __LUCI_COMPRESS_WEIGHTS_PASS_H__ + +#include + +namespace luci +{ + +/** + * @brief Class to generate FC/CONV with compressed weights + * + * To see the target Op pattern, please visit implementation. 
+ */ +struct CompressWeightsPass final : public logo::Pass +{ + const char *name(void) const final { return "luci::CompressWeightsPass"; } + + bool run(loco::Graph *g) final; +}; + +} // namespace luci + +#endif // __LUCI_COMPRESS_WEIGHTS_PASS_H__ diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index bf18b973d6d..9884d18de65 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -98,6 +98,7 @@ #include "luci/Pass/DecomposeSoftmaxPass.h" #include "luci/Pass/UnrollUnidirectionalSequenceLSTMPass.h" #include "luci/Pass/XpSepActFromTransposeConvPass.h" +#include "luci/Pass/CompressWeightsPass.h" // TODO add more passes #include "luci/Pass/CircleShapeInferencePass.h" @@ -313,7 +314,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const { phase.emplace_back(std::make_unique()); } - + if (_options->query(Options::Algorithm::CompressWeightsHuffman)) + { + phase.emplace_back(std::make_unique()); + } // clang-format off std::map (*)(void)> option_to_pass; @@ -389,7 +393,7 @@ void CircleOptimizer::optimize(loco::Graph *g) const option_to_pass[Options::Algorithm::XpSepActFromTransposeConv] = &createPassInstance; option_to_pass[Options::Algorithm::ForwardReshapeToUnaryOp] = &createPassInstance; option_to_pass[Options::Algorithm::ForwardTransposeOp] = &createPassInstance; - // clang-format on + // clang-format on for (auto const &m : option_to_pass) { diff --git a/compiler/luci/pass/src/CompressWeightsPass.cpp b/compiler/luci/pass/src/CompressWeightsPass.cpp new file mode 100644 index 00000000000..ba5204e2f6c --- /dev/null +++ b/compiler/luci/pass/src/CompressWeightsPass.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/CompressWeightsPass.h" +#include "helpers/HuffmanEncoder.h" +#include "helpers/NodeFiller.h" + +#include +#include + +#include +#include + +namespace +{ + +template class TypeSelector; + +template <> class TypeSelector +{ +public: + using Type = uint8_t; +}; +template <> class TypeSelector +{ +public: + using Type = int8_t; +}; + +template bool compress_weights_huffman(luci::CircleConv2D *conv2d) +{ + using T = typename TypeSelector
<DT>::Type;
+  assert(conv2d);
+
+  auto weights = loco::must_cast<luci::CircleConst *>(conv2d->filter());
+  if (weights->compression() != luci::CompressionType::NONE)
+    return false;
+
+  luci::huffman::HuffmanEncoder<T> encoder;
+  auto new_weights = luci::clone(weights);
+
+  std::vector<T> tmp_buf(weights->size<DT>());
+
+  for (size_t i = 0; i < weights->size<DT>(); ++i)
+  {
+    tmp_buf[i] = weights->at<DT>(i);
+  }
+
+  std::vector<uint8_t> encoded = encoder.encode(tmp_buf);
+
+  new_weights->dtype(DT);
+  new_weights->size<DT>(encoded.size());
+  new_weights->compression(luci::CompressionType::HUFFMAN);
+
+  for (size_t i = 0; i < new_weights->size<DT>(); ++i)
+  {
+    new_weights->at<DT>
(i) = encoded[i]; + } + conv2d->filter(new_weights); + + return true; +} + +} // namespace + +namespace luci +{ + +bool CompressWeightsPass::run(loco::Graph *g) +{ + bool changed = false; + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto conv2d = dynamic_cast(node); + if (not conv2d) + continue; + + auto filter = loco::must_cast(conv2d->filter()); + + if (filter->dtype() == loco::DataType::S8) + { + if (compress_weights_huffman(conv2d)) + changed = true; + } + else if (filter->dtype() == loco::DataType::U8) + { + if (compress_weights_huffman(conv2d)) + changed = true; + } + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/helpers/HuffmanDecoder.h b/compiler/luci/pass/src/helpers/HuffmanDecoder.h new file mode 100644 index 00000000000..cd9f0256c73 --- /dev/null +++ b/compiler/luci/pass/src/helpers/HuffmanDecoder.h @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__ +#define __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace luci +{ + +namespace huffman +{ + +template struct Node +{ + Node *p_left = nullptr; + Node *p_right = nullptr; + T data; +}; + +template class HuffmanDecoder +{ +private: + Node *root = nullptr; + std::unordered_map huffmanCode; + std::vector encoded_bitset{}; + std::size_t nodes_count = 0; + +private: + Node *allocateNode(T data, unsigned int freq, Node *p_left, Node *p_right) + { + Node *node = new Node; + node->data = data; + node->freq = freq; + node->p_left = p_left; + node->p_right = p_right; + nodes_count++; + return node; + } + + std::string exportHuffmanTreeToString(Node *node) + { + if (node == nullptr) + return ""; + if (!node->p_left && !node->p_right) + { + return "0" + std::bitset(node->data).to_string(); + } + std::string tmp = "1"; + tmp += exportHuffmanTreeToString(node->p_left); + tmp += exportHuffmanTreeToString(node->p_right); + return tmp; + } + + Node *importHuffmanTreeFromBoolVec(std::vector &vec, size_t &index) + { + if (vec.empty()) + return nullptr; + if (vec[index]) + { + index++; + Node *p_left = importHuffmanTreeFromBoolVec(vec, index); + Node *p_right = importHuffmanTreeFromBoolVec(vec, index); + return allocateNode(0, 0, p_left, p_right); + } + else if (vec[index] == false) + { + index++; + T tmp = 0; + for (size_t i = 0; i < sizeof(T) * CHAR_BIT; ++i) + { + if (vec[index++]) + tmp |= (1 << (sizeof(T) * CHAR_BIT - 1)) >> i; + } + + return allocateNode(tmp, 0, nullptr, nullptr); + } + return nullptr; + } + + Node *importHuffmanTreeFromString(std::string &str) + { + + if (str.substr(0, 1) == "1") + { + str = str.substr(1); + Node *p_left = importHuffmanTreeFromString(str); + Node *p_right = importHuffmanTreeFromString(str); + return allocateNode(0, 0, p_left, p_right); + } + else if (str.substr(0, 1) == "0") + { + str = str.substr(1); + std::bitset 
tmp(str.substr(0, sizeof(T) * CHAR_BIT)); + str = str.substr(sizeof(T) * CHAR_BIT); + return allocateNode(static_cast(tmp.to_ullong()), 0, nullptr, nullptr); + } + } + + void buildHuffmanTable(Node *node, const std::string str = "") + { + if (node == nullptr) + return; + + if (!node->p_left && !node->p_right) + { + huffmanCode[node->data] = str; + } + + buildHuffmanTable(node->p_left, str + "0"); + buildHuffmanTable(node->p_right, str + "1"); + } + + void decode(Node *node, std::string &str, std::vector &out_vec, size_t &index) + { + if (node == nullptr) + { + return; + } + + if (!node->p_left && !node->p_right) + { + out_vec.push_back(node->data); + return; + } + + if (str.size() == index) + return; + if (str[index] == '0') + { + decode(node->p_left, str, out_vec, ++index); + } + else + { + decode(node->p_right, str, out_vec, ++index); + } + } + + struct EncodedTreeAndData + { + std::vector tree_vec{}; + std::vector data_vec{}; + }; + + EncodedTreeAndData unpackArrayToEncodedTreeAndData(const uint8_t *pack_ptr) + { + constexpr auto kTreeSizeBytesN = sizeof(size_t); + constexpr auto kDataSizeBytesN = sizeof(size_t); + + const std::bitset tree_size_bitset( + *static_cast(static_cast(pack_ptr))); + const std::bitset data_size_bitset( + *static_cast(static_cast(pack_ptr + kTreeSizeBytesN))); + + const size_t kTreeSizeInBits = static_cast(tree_size_bitset.to_ullong()); + const size_t kDataSizeInBits = static_cast(data_size_bitset.to_ullong()); + + auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; + EncodedTreeAndData tree_and_data; + + const auto kTreeSizeInBytes = + kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT; + + for (size_t i = 0; i < kTreeSizeInBytes; ++i) + { + const auto kNumOfBits = + kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT; + for (size_t j = 0; j < kNumOfBits; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.tree_vec.push_back(true); + else + tree_and_data.tree_vec.push_back(false); + } + } + const auto kDataSizeInBytes = + kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT; + const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; + start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; + + for (size_t i = 0; i < kDataSizeInBytes; ++i) + { + const auto kNumOfBits = + kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT; + const auto kBitsInFirstByteToRead = + kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits; + for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + if (kNumOfBits < CHAR_BIT - kOffsetInBits) + break; + const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits + ? 
kNumOfBits - (CHAR_BIT - kOffsetInBits) + : kOffsetInBits; + for (size_t j = 0; j < kBitsLeft; ++j) + { + if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + return tree_and_data; + } + + EncodedTreeAndData unpackArrayToEncodedTreeAndData(const std::vector &packed_vec) + { + constexpr auto kTreeSizeBytesN = sizeof(size_t); + constexpr auto kDataSizeBytesN = sizeof(size_t); + const uint8_t *pack_ptr = packed_vec.data(); + const std::bitset tree_size_bitset( + *static_cast(static_cast(pack_ptr))); + const std::bitset data_size_bitset( + *static_cast(static_cast(pack_ptr + kTreeSizeBytesN))); + + const size_t kTreeSizeInBits = static_cast(tree_size_bitset.to_ullong()); + const size_t kDataSizeInBits = static_cast(data_size_bitset.to_ullong()); + + auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; + EncodedTreeAndData tree_and_data; + + const auto kTreeSizeInBytes = + kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT; + + for (size_t i = 0; i < kTreeSizeInBytes; ++i) + { + const auto kNumOfBits = + kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT; + for (size_t j = 0; j < kNumOfBits; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + const auto kDataSizeInBytes = + kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT; + const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; + start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; + + for (size_t i = 0; i < kDataSizeInBytes; ++i) + { + const auto kNumOfBits = + kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT; + const auto kBitsInFirstByteToRead = + kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits; + for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j) + { + if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + if (kNumOfBits < CHAR_BIT - kOffsetInBits) + break; + const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits + ? 
kNumOfBits - (CHAR_BIT - kOffsetInBits) + : kOffsetInBits; + for (size_t j = 0; j < kBitsLeft; ++j) + { + if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j)) + tree_and_data.data_vec.push_back(true); + else + tree_and_data.data_vec.push_back(false); + } + } + return tree_and_data; + } + +public: + void decode(Node *node, std::vector &vec, T *dst_ptr) + { + if (node == nullptr) + { + return; + } + + if (!node->p_left && !node->p_right) + { + *dst_ptr = node->data; + return; + } + + if (vec.size() == _decode_idx) + return; + if (vec[_decode_idx] == false) + { + ++_decode_idx; + decode(node->p_left, vec, dst_ptr); + } + else + { + ++_decode_idx; + decode(node->p_right, vec, dst_ptr); + } + } + +private: + size_t _decode_idx = 0; + EncodedTreeAndData _encoded_tree_and_data; + +public: + void init_decoder(const uint8_t *input) + { + size_t index = 0; + _encoded_tree_and_data = unpackArrayToEncodedTreeAndData(input); + root = importHuffmanTreeFromBoolVec(_encoded_tree_and_data.tree_vec, index); + } + + void reset_decode_idx(void) { _decode_idx = 0; } + + int decode_n(uint8_t *dst_ptr, size_t num) + { + size_t bytes_decoded = 0; + for (int i = 0; i < num && _decode_idx < _encoded_tree_and_data.data_vec.size(); ++i) + { + decode(root, _encoded_tree_and_data.data_vec, dst_ptr + bytes_decoded); + bytes_decoded++; + } + return bytes_decoded; + } + + HuffmanDecoder() = default; +}; + +} // namespace huffman +} // namespace luci + +#endif // __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__ diff --git a/compiler/luci/pass/src/helpers/HuffmanEncoder.h b/compiler/luci/pass/src/helpers/HuffmanEncoder.h new file mode 100644 index 00000000000..26e8d3e9c54 --- /dev/null +++ b/compiler/luci/pass/src/helpers/HuffmanEncoder.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__ +#define __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__ + +#include +#include +#include +#include +#include +#include +#include + +namespace luci +{ +namespace huffman +{ + +// Node of prefix tree +template struct Node +{ + std::shared_ptr> p_left; + std::shared_ptr> p_right; + T data; + unsigned int freq; +}; + +// Compare functor for priority queue +template struct CompareNodes +{ + bool operator()(std::shared_ptr> l, std::shared_ptr> r) + { + return l->freq > r->freq; + } +}; + +template class HuffmanEncoder +{ +private: + std::unordered_map _huffman_table; + +private: + std::shared_ptr> allocateNode(T data, unsigned int freq, std::shared_ptr> p_left, + std::shared_ptr> p_right) + { + std::shared_ptr> node = std::make_unique>(); + node->data = data; + node->freq = freq; + node->p_left = p_left; + node->p_right = p_right; + return node; + } + + std::unordered_map calculateFrequencyMap(const std::vector &input) + { + std::unordered_map out_map; + for (auto &item : input) + out_map[item] = out_map.find(item) != out_map.end() ? 
out_map[item] + 1 : 1; + + return out_map; + } + + std::string exportHuffmanTreeToString(std::shared_ptr> node) + { + if (node == nullptr) + return ""; + + if (!node->p_left && !node->p_right) + { + return "0" + std::bitset(node->data).to_string(); + } + + std::string tmp = "1"; + tmp += exportHuffmanTreeToString(node->p_left); + tmp += exportHuffmanTreeToString(node->p_right); + return tmp; + } + + void buildHuffmanTable(std::shared_ptr> node, const std::string str = "") + { + if (node == nullptr) + return; + + if (!node->p_left && !node->p_right) + { + _huffman_table[node->data] = str; + } + + buildHuffmanTable(node->p_left, str + "0"); + buildHuffmanTable(node->p_right, str + "1"); + } + + std::shared_ptr> buildHuffmanTree(const std::vector &input) + { + auto freq_map = calculateFrequencyMap(input); + + std::priority_queue>, std::vector>>, + CompareNodes> + pq; + + for (auto &item : freq_map) + { + pq.push(allocateNode(item.first, item.second, nullptr, nullptr)); + } + + while (pq.size() != 1) + { + std::shared_ptr> left = pq.top(); + pq.pop(); + std::shared_ptr> right = pq.top(); + pq.pop(); + + unsigned int sum = left->freq + right->freq; + pq.push(allocateNode(0, sum, left, right)); + } + + return pq.top(); + } + + struct EncodedTreeAndData + { + std::vector tree_vec{}; + std::vector data_vec{}; + }; + + std::vector packEncodedDataToArray(const std::string &tree_str, + const std::string &encoded_data) + { + std::vector arr; + const size_t kTreeSizeInBits = tree_str.size(); + const size_t kDataSizeInBits = encoded_data.size(); + + for (size_t i = 0; i < sizeof(size_t); ++i) + { + arr.push_back( + *(static_cast(static_cast(&kTreeSizeInBits)) + i)); + } + + for (size_t i = 0; i < sizeof(size_t); ++i) + { + arr.push_back( + *(static_cast(static_cast(&kDataSizeInBits)) + i)); + } + + const auto merged_str = tree_str + encoded_data; + const size_t kMergedSizeInBits = merged_str.size(); + + const auto kMergedSizeInBytes = kMergedSizeInBits % CHAR_BIT ? kMergedSizeInBits / CHAR_BIT + 1 + : kMergedSizeInBits / CHAR_BIT; + for (size_t i = 0; i < kMergedSizeInBytes; ++i) + { + const auto kNumOfBits = + kMergedSizeInBits - i * CHAR_BIT < CHAR_BIT ? kMergedSizeInBits - i * CHAR_BIT : CHAR_BIT; + + std::string tmp_str = merged_str.substr(i * CHAR_BIT, kNumOfBits); + + for (size_t i = 0; i < CHAR_BIT - kNumOfBits; ++i) + tmp_str += "0"; + + const std::bitset tmp_bitset(tmp_str); + + arr.push_back(static_cast(tmp_bitset.to_ullong())); + } + return arr; + } + +public: + // Encodes input vector of values of type T and returns encoded vector of uint8_t + std::vector encode(const std::vector &input) + { + std::shared_ptr> root = buildHuffmanTree(input); + buildHuffmanTable(root); + + std::string exported_tree = exportHuffmanTreeToString(root); + std::string str = ""; + + for (auto &item : input) + { + str += _huffman_table[item]; + } + + std::vector raw_arr = packEncodedDataToArray(exported_tree, str); + return raw_arr; + } + +public: + HuffmanEncoder() = default; +}; + +} // namespace huffman +} // namespace luci + +#endif // __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__
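
Note (reviewer sketch, not part of the patch): HuffmanEncoder and HuffmanDecoder share one packed layout, [tree size in bits : size_t][data size in bits : size_t][tree bits][data bits], so a buffer returned by encode() can be handed directly to init_decoder(). A minimal round-trip sketch under that assumption; the main() harness, include paths, and sample values below are illustrative only:

#include <cassert>
#include <cstdint>
#include <vector>

#include "HuffmanDecoder.h" // compiler/luci/pass/src/helpers
#include "HuffmanEncoder.h"

int main()
{
  // Quantized weights with a skewed value distribution compress well.
  std::vector<uint8_t> weights = {0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 3};

  luci::huffman::HuffmanEncoder<uint8_t> encoder;
  // packed = [tree size][data size][tree bits][data bits]
  std::vector<uint8_t> packed = encoder.encode(weights);

  luci::huffman::HuffmanDecoder<uint8_t> decoder;
  decoder.init_decoder(packed.data()); // reads both size headers, rebuilds the tree
  decoder.reset_decode_idx();

  std::vector<uint8_t> decoded(weights.size());
  decoder.decode_n(decoded.data(), decoded.size());

  assert(decoded == weights); // lossless round trip

  return 0;
}

This is the same invariant CompressWeightsPass relies on: it stores encode() output in the CircleConst, and ConvPerChannelHuffman later feeds those bytes to init_decoder() and then decodes one output channel's filter at a time into the scratchpad tensor.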