From 99f03218682c56e826a8597a02cdb8381d9fb05d Mon Sep 17 00:00:00 2001
From: daquexian
Date: Thu, 13 Jun 2019 11:35:28 +0800
Subject: [PATCH] Add more docs

---
 dabnn/bconv.h    | 30 ++++++++++++++++++++++++++++++
 dabnn/bitpack.h  | 27 +++++++++++++++++++++++++++
 docs/bconv.md    |  4 ++--
 docs/bconv_CN.md |  4 ++--
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/dabnn/bconv.h b/dabnn/bconv.h
index fad4800..258c43a 100644
--- a/dabnn/bconv.h
+++ b/dabnn/bconv.h
@@ -62,6 +62,9 @@ inline void bnn::bconv_3x3_64(const Mat &bottom_blob, const Mat &weight,
 inline void bnn::bconv_3x3_64_opt3(const Mat &bottom_blob, const Mat &weight,
                                    Mat &top_blob, const int pad,
                                    const int stride) {
+    /**
+     * See bconv_3x3_64_opt4
+     */
     static uint64_t col_buf[999999];

     const size_t col_h = weight.h * weight.w;
@@ -217,6 +220,9 @@ inline void bnn::bconv_3x3_64_opt3(const Mat &bottom_blob, const Mat &weight,
 inline void bnn::bconv_3x3_64_opt2(const Mat &bottom_blob, const Mat &weight,
                                    Mat &top_blob, const int pad,
                                    const int stride) {
+    /**
+     * See bconv_3x3_64_opt4
+     */
     static uint64_t col_buf[999999];

     const size_t col_h = weight.h * weight.w;
@@ -282,6 +288,20 @@ inline void bnn::bconv_3x3_64_opt2(const Mat &bottom_blob, const Mat &weight,
 inline void bnn::bconv_3x3_64_opt4(const Mat &bottom_blob, const Mat &weight,
                                    Mat &top_blob, const int pad,
                                    const int stride) {
+    /**
+     * This method performs a 64-input-channel 3x3 binary convolution
+     * by im2col + BGEMM.
+     *
+     * It outperforms the other variants when channel == 64 because
+     * the 128-bit vector registers cannot be fully filled by Binary
+     * Direct Convolution with the NC1HWC2 memory layout when there
+     * are only 64 channels.
+     *
+     * By contrast, BGEMM can fully utilize the 128-bit registers
+     * after im2col and amortize the memory accesses.
+     *
+     */
+    // TODO: A more elegant way
     static uint64_t col_buf[999999];

     const size_t col_h = weight.h * weight.w;
@@ -404,6 +424,9 @@ inline void bnn::bconv_3x3_64_opt4(const Mat &bottom_blob, const Mat &weight,

 inline void bnn::bconv_3x3_64_opt(const Mat &bottom_blob, const Mat &weight,
                                   Mat &top_blob) {
+    /**
+     * See bconv_3x3_64_opt4
+     */
     BNN_ASSERT(weight.n % 2 == 0, weight.n);
     FORZ(th, top_blob.h) {
         FORZ(tw, top_blob.w) {
@@ -831,6 +854,13 @@ inline void unpack_output(float *b, float *a, int width, int height,

 inline void bnn::bconv_3x3(const Mat &bottom_blob, const Mat &weight,
                            Mat &top_blob, const int stride) {
+    /**
+     * This method shows our NC1HWC2 memory layout and Binary
+     * Direct Convolution. The input tensor and the weight are
+     * packed into the NC1HWC2 layout (in `pack_weight_3x3` and
+     * `pack_input_3x3`), and the spatial redundancy is then
+     * exploited in `bconv_3x3_128_internal_s1`.
+     */
 #ifdef __aarch64__
     // TODO: more elegant way
     static uint64_t packed_weight[999999];
diff --git a/dabnn/bitpack.h b/dabnn/bitpack.h
index b90532c..c1cd84c 100644
--- a/dabnn/bitpack.h
+++ b/dabnn/bitpack.h
@@ -1,4 +1,13 @@
 // Copyright 2019 JD.com Inc. JD AI
+//
+// Bit-packing packs N 32-bit floats/integers into an N-bit operand
+// according to their signs. For example, performing bit-packing on
+// 128 float numbers produces one 128-bit operand. xnor/xor can only
+// be performed on these packed operands.
+//
+// The functions in this file are mainly used to pack the input. The
+// packing of the weight has already been performed offline in the
+// onnx2bnn step.

 #ifndef BITPACK_H
 #define BITPACK_H
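For reference, both the BGEMM path above (bconv_3x3_64_opt4) and the Binary Direct Convolution path (bconv_3x3) reduce, after bit-packing, to an xnor + popcount dot product over packed 64-bit words. The scalar sketch below illustrates that core operation under the common ±1 encoding in which a set bit stands for a negative value (as produced by the sign-based packing above); the function name binary_dot is made up for illustration and is not part of dabnn's API.

#include <cstddef>
#include <cstdint>

// Dot product of two {+1, -1} vectors of length 64 * n_words, each packed so
// that a set bit encodes -1 (i.e. the sign bit of the original float).
inline int binary_dot(const std::uint64_t *a, const std::uint64_t *b,
                      std::size_t n_words) {
    int matching = 0;
    for (std::size_t i = 0; i < n_words; i++) {
        // Bits where a and b agree contribute +1, disagreeing bits contribute -1.
        matching += __builtin_popcountll(~(a[i] ^ b[i]));
    }
    const int total = static_cast<int>(n_words) * 64;
    return 2 * matching - total;  // (#agreements) - (#disagreements)
}

In the optimized kernels the same reduction is carried out with 128-bit NEON instructions rather than scalar builtins; the sketch only shows the arithmetic they implement.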
@@ -18,6 +27,21 @@

 #ifdef __aarch64__
 inline void pack_128_2(const float *float_ptr, void *binary_ptr, size_t size) {
+    /**
+     * This is the optimized bit-packing.
+     *
+     * `sri` is the "shift right and insert" instruction.
+     * With this instruction, we directly leverage the existing
+     * sign bits of the 32-bit operands (both IEEE 754 floats and
+     * 32-bit integers).
+     * Note that the order of the bits in the output operand is not
+     * consistent with the order of the input operands. Fortunately,
+     * this consistency is not indispensable -- the result of
+     * xnor/xor is still correct as long as the bits of both the
+     * input and the weight are re-arranged in the same way.
+     * Therefore, we re-arrange the packed weight accordingly in
+     * dabnn/net.cpp.
+     */
     size_t nn_size = size >> 7;

     asm volatile(
@@ -202,6 +226,9 @@ inline void pack_mat_128(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
 #endif // __aarch64__

 inline void pack_mat_64(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
+    /**
+     * This is the bit-packing for tensors with fewer than 128 channels.
+     */
     BNN_ASSERT(
         float_mat.w * float_mat.c > 0 && float_mat.w * float_mat.c % 64 == 0,
         float_mat.w * float_mat.c);
diff --git a/docs/bconv.md b/docs/bconv.md
index 4ecaae7..df4114f 100644
--- a/docs/bconv.md
+++ b/docs/bconv.md
@@ -4,8 +4,8 @@ Bit-packing is performed in `Binarize` layers. It pack N 32-bit float/integer to

 The details of bit-packing are in

-* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (optimized, for tensors of 128 and more channels)
-* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (normal, for tensors of less than 128 channels)
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L29 (optimized, for tensors of 128 and more channels)
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L228 (normal, for tensors of less than 128 channels)

 The optimized version is 4X faster than the normal version. The bit-packing algorithm directly leverages the sign bits of int32 and IEEE 754 float numbers, eliminating the comparison with zero. SIMD instructions are also used to speed this process up. Note that after the SIMD instructions are performed, the N bits in the result are re-arranged so that they are not in the same order as the N 32-bit inputs. Fortunately, the output of xnor/xor is not affected as long as the input and the weight are re-arranged in the same way. Given this observation, we re-arrange the weights of the binary convolutions whose inputs are bit-packed in the optimized way. The details are in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82.

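The claim above (in both the pack_128_2 comment and the bconv.md paragraph) that the bit re-ordering introduced by the SIMD packing is harmless can be checked in a few lines: applying the same bit permutation to the input and the weight leaves the xnor + popcount result unchanged. This is a standalone sketch, not dabnn code; permute_bits, xnor_popcount and the chosen permutation are arbitrary illustrations.

#include <cassert>
#include <cstdint>

inline int xnor_popcount(std::uint64_t a, std::uint64_t b) {
    return __builtin_popcountll(~(a ^ b));  // number of matching bits
}

// Move bit i of x to position perm[i].
inline std::uint64_t permute_bits(std::uint64_t x, const int (&perm)[64]) {
    std::uint64_t y = 0;
    for (int i = 0; i < 64; i++) {
        y |= ((x >> i) & 1u) << perm[i];
    }
    return y;
}

int main() {
    const std::uint64_t input = 0x0123456789abcdefULL;
    const std::uint64_t weight = 0xfedcba9876543210ULL;

    int perm[64];
    for (int i = 0; i < 64; i++) perm[i] = (i * 37 + 5) % 64;  // an arbitrary permutation

    // Re-arranging both operands in the same way does not change the result.
    assert(xnor_popcount(input, weight) ==
           xnor_popcount(permute_bits(input, perm), permute_bits(weight, perm)));
    return 0;
}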
diff --git a/docs/bconv_CN.md b/docs/bconv_CN.md
index fe50e09..059f2a9 100644
--- a/docs/bconv_CN.md
+++ b/docs/bconv_CN.md
@@ -3,8 +3,8 @@ Bit-packing is performed in the `Binarize` layer; it packs N 32-bit float/integer

 The concrete implementation of bit-packing is in

-* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (highly optimized version, for tensors with 128 or more channels)
-* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (less optimized version, for tensors with fewer than 128 channels)
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L29 (highly optimized version, for tensors with 128 or more channels)
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L228 (less optimized version, for tensors with fewer than 128 channels)

 The performance gap between the highly optimized and the less optimized version is about 4x. In the highly optimized version, the bit-packing algorithm directly uses the sign bits of IEEE 754 floats and int32 values, so there is no need to compare every number with zero, and SIMD instructions are used to accelerate the algorithm. It is worth noting that after bit-packing with SIMD instructions, the N bits of the output operand no longer correspond to the N inputs in order; however, as long as every bit of the two operands of xnor/xor corresponds one-to-one, the computation is not affected at all. Therefore, wherever the highly optimized bit-packing applies, we re-arrange the weight so that each of its bits corresponds one-to-one to each bit of the input. The code for this step is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82.

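As a concrete illustration of the sign-bit trick described in both language versions of the doc above, the scalar sketch below packs 64 floats into a single 64-bit operand by reading the IEEE 754 sign bit directly, so no comparison against zero is needed. It is a naive reference version, not dabnn's implementation; the function name pack_64_naive is hypothetical.

#include <cstdint>
#include <cstring>

// Pack the signs of 64 floats into one 64-bit operand: bit i is 1 iff
// float_ptr[i] has its sign bit set (i.e. is negative).
inline std::uint64_t pack_64_naive(const float *float_ptr) {
    std::uint64_t result = 0;
    for (int i = 0; i < 64; i++) {
        std::uint32_t bits;
        std::memcpy(&bits, float_ptr + i, sizeof(bits));  // reinterpret the IEEE 754 bits
        const std::uint64_t sign = bits >> 31;            // 1 if negative, 0 otherwise
        result |= sign << i;
    }
    return result;
}

Packing 128 channels would simply produce two such 64-bit words; the optimized pack_128_2 above produces the same kind of packed operand with sri and other NEON instructions, at the cost of the bit re-ordering discussed above.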