Skip to content

Commit

Permalink
1. Apply new bitpack
Browse files Browse the repository at this point in the history
2. Fix the conflict between the sign-bit packing convention (bit is 1 if the value is negative) and the padding convention (padding bits are 0) by applying a bitwise "not" to the packed element
3. Update binrep
4. misc changes
  • Loading branch information
daquexian committed May 17, 2019
1 parent f65fd3e commit 2618327
Show file tree
Hide file tree
Showing 12 changed files with 121 additions and 126 deletions.
16 changes: 16 additions & 0 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,22 @@
#include <dabnn/mat.h>
#include <dabnn/net.h>

// Benchmarks the 64-wide bit-packing kernel on a small (1, 32, 32, 128) Mat.
static void BM_pack_mat_64_small(benchmark::State &state) {
    const bnn::Mat float_mat(1, 32, 32, 128, bnn::DataType::Float, 0);
    bnn::Mat binary_mat(1, 32, 32, 128, bnn::DataType::Bit, 0);
    for (auto _ : state) {
        pack_mat_64(float_mat, binary_mat);
    }
}

// Benchmarks the 128-wide bit-packing kernel on a small (1, 32, 32, 128) Mat.
static void BM_pack_mat_128_small(benchmark::State &state) {
    const bnn::Mat float_mat(1, 32, 32, 128, bnn::DataType::Float, 0);
    bnn::Mat binary_mat(1, 32, 32, 128, bnn::DataType::Bit, 0);
    for (auto _ : state) {
        pack_mat_128(float_mat, binary_mat);
    }
}

static void BM_pack_mat_64(benchmark::State &state) {
const bnn::Mat a(1, 64, 64, 128, bnn::DataType::Float);
bnn::Mat b(1, 64, 64, 128, bnn::DataType::Bit);
Expand Down
9 changes: 4 additions & 5 deletions binaries/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
#include <algorithm>
#include <chrono>

#include <common/argh.h>
#include <common/flatbuffers_helper.h>
#include <dabnn/net.h>

int main(int argc, char **argv) {
(void)argc;
argh::parser cmdl(argc, argv);
google::InitGoogleLogging(argv[0]);
FLAGS_v = 1;
cmdl("v", 1) >> FLAGS_v;
FLAGS_alsologtostderr = true;
// FLAGS_logbuflevel = -1;

float *input = new float[3 * 224 * 224];
FORZ(i, 3 * 224 * 224) { input[i] = 1; }

// const std::string blob_name = "125";
auto net1 = bnn::Net::create();
net1->optimize = true;
net1->run_fconv = true;
Expand All @@ -31,7 +31,6 @@ int main(int argc, char **argv) {
FORZ(i, N) {
LOG(INFO) << "------";
net1->run(input);
// LOG(INFO) << "hh";
}
const auto t2 = Clock::now();
css blob_name = argv[2];
Expand All @@ -45,7 +44,7 @@ int main(int argc, char **argv) {
if (blob1->data_type == bnn::DataType::Float) {
LOG(INFO) << static_cast<float *>(blob1->data)[i];
} else {
LOG(INFO) << binrep(static_cast<uint64_t *>(blob1->data)[i]);
LOG(INFO) << binrep(static_cast<uint64_t *>(blob1->data) + i, 64, false);
}
}
LOG(INFO) << "Time: "
Expand Down
2 changes: 1 addition & 1 deletion common/baseline.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <bitset>

#include <common/helper.h>
#include <dabnn/bitpack.h>
#include <common/common_bitpack.h>
#include <dabnn/mat.h>

inline int bitcount(uint64_t x) {
Expand Down
20 changes: 20 additions & 0 deletions common/common_bitpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <cstdint>

#include <common/helper.h>
#include <dabnn/mat.h>

inline void pack_128_fallback(const float *float_ptr, void *binary_ptr,
size_t size) {
Expand Down Expand Up @@ -183,4 +184,23 @@ inline void pack_64_bitfield(const float *fptr, uint64_t *buf) {
*buf = u.u64;
}

// Packs a float Mat into a binary (DataType::Bit) Mat, consuming 64 floats
// per output uint64_t via pack_64_bitfield.
//
// float_mat:  input Mat holding float data.
// binary_mat: output Mat; must have binary_mat.c == float_mat.c / 64.
inline void pack_mat_64(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
    // Each (n, h) row is packed as one contiguous run of w * c floats, so
    // that run must be a positive multiple of 64.
    BNN_ASSERT(
        float_mat.w * float_mat.c > 0 && float_mat.w * float_mat.c % 64 == 0,
        float_mat.w * float_mat.c);
    BNN_ASSERT(float_mat.c / 64 == binary_mat.c && float_mat.c % 64 == 0, "");

    FORZ(n, float_mat.n) {
        FORZ(h, float_mat.h) {
            // Walk the row in 64-float groups; each group fills one word.
            auto *fptr = float_mat.point<float>(n, h, 0);
            auto *bptr = binary_mat.point<uint64_t>(n, h, 0);
            FORZ(i, float_mat.w * float_mat.c / 64) {
                pack_64_bitfield(fptr, bptr);
                fptr += 64;
                bptr++;
            }
        }
    }
}

#endif /* COMMON_BITPACK_H */
30 changes: 13 additions & 17 deletions common/helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ inline float random_float() {
static std::normal_distribution<float> distr;

float rand_float = distr(eng) / 10;
if (rand_float == 0) {
return random_float();
}
// LOG(INFO) << "Random float: " << rand_float;

return rand_float;
Expand Down Expand Up @@ -113,27 +116,20 @@ inline void fill_rand_uint64(uint64_t *data, size_t num) {
FORZ(i, num) { *(data + i) = random_uint64(); }
}

/// Renders the raw bytes of `a` (in memory order) as space-separated
/// binary octets, followed by a newline.
template <typename T>
std::string binrep(const T &a) {
    const char *bytes = reinterpret_cast<const char *>(&a);
    std::stringstream out;
    for (size_t i = 0; i < sizeof(a); ++i) {
        out << std::bitset<CHAR_BIT>(bytes[i]) << ' ';
    }
    out << '\n';
    return out.str();
}

template <typename T>
std::string binrep(const T &a, const size_t size) {
const char *beg = reinterpret_cast<const char *>(&a);
/**
 * Renders `size` bytes starting at `a` as space-separated binary octets.
 *
 * @param a       start of the byte range to render
 * @param size    number of bytes to render
 * @param reverse when true, emit bytes last-to-first, which makes the output
 *                human-readable (most significant byte first) on
 *                little-endian machines
 * @return the rendered string; each octet is followed by one space
 */
inline std::string binrep(const void *a, const size_t size, bool reverse) {
    const char *beg = static_cast<const char *>(a);
    const char *end = beg + size;

    std::stringstream ss;

    if (reverse) {
        // *--end reads the last byte not yet emitted, walking backwards.
        while (beg != end) ss << std::bitset<CHAR_BIT>(*--end) << ' ';
    } else {
        while (beg != end) ss << std::bitset<CHAR_BIT>(*beg++) << ' ';
    }
    return ss.str();
}

Expand Down
113 changes: 22 additions & 91 deletions dabnn/bitpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,94 +16,6 @@
#include <glog/logging.h>
#include "mat.h"

// Packs a float Mat into a binary (DataType::Bit) Mat, consuming 64 floats
// per output uint64_t via pack_64_bitfield.
//
// float_mat:  input Mat holding float data.
// binary_mat: output Mat; must have binary_mat.c == float_mat.c / 64.
inline void pack_mat_64(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
    // Each (n, h) row is packed as one contiguous run of w * c floats, so
    // that run must be a positive multiple of 64.
    BNN_ASSERT(
        float_mat.w * float_mat.c > 0 && float_mat.w * float_mat.c % 64 == 0,
        float_mat.w * float_mat.c);
    BNN_ASSERT(float_mat.c / 64 == binary_mat.c && float_mat.c % 64 == 0, "");

    FORZ(n, float_mat.n) {
        FORZ(h, float_mat.h) {
            // Walk the row in 64-float groups; each group fills one word.
            auto *fptr = float_mat.point<float>(n, h, 0);
            auto *bptr = binary_mat.point<uint64_t>(n, h, 0);
            FORZ(i, float_mat.w * float_mat.c / 64) {
                pack_64_bitfield(fptr, bptr);
                fptr += 64;
                bptr++;
            }
        }
    }
}

// Packs `size` floats (size must be a multiple of 128) into bits using
// AArch64 NEON SRI (shift-right-and-insert) to merge sign bits lane-wise.
//
// NOTE(review): v24-v31 are consumed by the sri instructions below but are
// never loaded anywhere in this function -- they hold whatever the caller
// left in them, so the output looks wrong. Confirm before using this kernel.
// NOTE(review): the clobber list only declares v0-v19, yet v20-v23 (and
// v24-v31) are read/written; missing clobbers can silently corrupt caller
// state. Also note there is no final "not" here, unlike pack_128_2, so the
// bit convention differs -- TODO confirm which convention callers expect.
inline void pack_128_3(const float *float_ptr, void *binary_ptr, size_t size) {
    size_t nn_size = size >> 7;

    asm volatile(
        "0: \n"
        "prfm pldl1keep, [%0] \n"
        "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
        // Fold the sign bits of v1-v4 into v0, one bit position at a time.
        "sri v0.4s, v1.4s, #1 \n"
        "sri v0.4s, v2.4s, #1 \n"
        "sri v0.4s, v3.4s, #1 \n"
        "sri v0.4s, v4.4s, #1 \n"

        "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
        "prfm pldl1keep, [%0, #64] \n"
        "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
        "sri v0.4s, v5.4s, #1 \n"
        "sri v0.4s, v6.4s, #1 \n"
        "sri v0.4s, v7.4s, #1 \n"
        "sri v0.4s, v8.4s, #1 \n"

        // Loop counter decrement; the bne at the end branches on this.
        "subs %2, %2, #1 \n"

        "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
        "prfm pldl1keep, [%0, #64] \n"
        "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

        "sri v0.4s, v9.4s, #2 \n"
        "sri v0.4s, v10.4s, #2 \n"
        "sri v0.4s, v11.4s, #2 \n"
        "sri v0.4s, v12.4s, #2 \n"

        "sri v0.4s, v13.4s, #1 \n"
        "sri v0.4s, v14.4s, #1 \n"
        "sri v0.4s, v15.4s, #1 \n"
        "sri v0.4s, v16.4s, #1 \n"

        // NOTE(review): this reload overwrites v8-v15, whose sign bits were
        // already merged above -- presumably intentional register reuse.
        "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
        "prfm pldl1keep, [%0, #64] \n"
        "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
        "sri v0.4s, v17.4s, #1 \n"
        "sri v0.4s, v18.4s, #1 \n"
        "sri v0.4s, v19.4s, #1 \n"
        "sri v0.4s, v20.4s, #1 \n"

        "sri v0.4s, v21.4s, #2 \n"
        "sri v0.4s, v22.4s, #2 \n"
        "sri v0.4s, v23.4s, #2 \n"
        // NOTE(review): v24-v31 below were never loaded in this function.
        "sri v0.4s, v24.4s, #2 \n"

        "sri v0.4s, v25.4s, #4 \n"
        "sri v0.4s, v26.4s, #4 \n"
        "sri v0.4s, v27.4s, #4 \n"
        "sri v0.4s, v28.4s, #4 \n"

        "sri v0.4s, v29.4s, #8 \n"
        "sri v0.4s, v31.4s, #8 \n"
        "sri v0.4s, v30.4s, #16 \n"

        "st1 {v0.4s}, [%1], #16 \n"
        "bne 0b \n"
        : "+r"(float_ptr),   // %0
          "+r"(binary_ptr),  // %1
          "+r"(nn_size)      // %2
        :
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
          "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
          "v19", "x0");
}
inline void pack_128_2(const float *float_ptr, void *binary_ptr, size_t size) {
size_t nn_size = size >> 7;

Expand Down Expand Up @@ -163,6 +75,17 @@ inline void pack_128_2(const float *float_ptr, void *binary_ptr, size_t size) {
"sri v2.4s, v3.4s, #8 \n"
"sri v0.4s, v2.4s, #16 \n"

// Sign-bit-based bit-packing was introduced after the first version
// of dabnn was published. The sign bit is 1 when x < 0 and 0 when x > 0,
// which is the opposite of the convention used before --- bit set to 1 if
// x > 0 and 0 if x < 0.
// So, for compatibility, we add a "not" instruction here.
// We could save this instruction by introducing a "version" field in the
// dabnn model format and forcing users to upgrade.
// Note: if this instruction is removed, the padding value of binary
// convolution must also be changed from 0 (-1 in xnor) to -1 (1 in xnor).
"not v0.16b, v0.16b \n"

"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
: "+r"(float_ptr), // %0
Expand All @@ -171,8 +94,9 @@ inline void pack_128_2(const float *float_ptr, void *binary_ptr, size_t size) {
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
"v19", "x0");
"v19", "v20", "v21", "v22", "v23", "x0");
}

inline void pack_128(const float *float_ptr, void *binary_ptr, size_t size) {
size_t nn_size = size >> 7;

Expand Down Expand Up @@ -261,17 +185,24 @@ inline void pack_128(const float *float_ptr, void *binary_ptr, size_t size) {
"x0");
}

inline void pack_mat_128(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
// Packs a float Mat into a binary Mat with the 128-wide NEON kernel
// pack_128_2 (one output bit per input float).
inline void pack_mat_128_2(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
    // BNN_ASSERT for consistency with the other pack_* helpers; a plain
    // assert() is compiled out in NDEBUG builds.
    BNN_ASSERT(!float_mat.empty(), "");
    BNN_ASSERT(!binary_mat.empty(), "");
    // pack_128_2 consumes the input in whole 128-float groups (size >> 7),
    // so a non-multiple would silently drop the trailing elements.
    BNN_ASSERT(float_mat.total() % 128 == 0, float_mat.total());

    pack_128_2(static_cast<float *>(float_mat.data), binary_mat.data,
               float_mat.total());
}

// Packs a float Mat into a binary Mat with the 128-wide NEON kernel
// pack_128 (one output bit per input float).
inline void pack_mat_128(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
    // BNN_ASSERT for consistency with the other pack_* helpers; a plain
    // assert() is compiled out in NDEBUG builds.
    BNN_ASSERT(!float_mat.empty(), "");
    BNN_ASSERT(!binary_mat.empty(), "");
    // pack_128 consumes the input in whole 128-float groups (size >> 7),
    // so a non-multiple would silently drop the trailing elements.
    BNN_ASSERT(float_mat.total() % 128 == 0, float_mat.total());

    pack_128(static_cast<float *>(float_mat.data), binary_mat.data,
             float_mat.total());
}

inline void pack_mat(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
BNN_ASSERT(float_mat.c % 64 == 0, float_mat.c);
if (float_mat.c % 128 == 0) {
pack_mat_128(float_mat, binary_mat);
pack_mat_128_2(float_mat, binary_mat);
} else {
pack_mat_64(float_mat, binary_mat);
}
Expand Down
2 changes: 1 addition & 1 deletion dabnn/layers/BinConv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ std::string BinConv::to_str() const {
std::stringstream ss;
ss << type_ << ", ";
PNT_TO(ss, input_mat->h, input_mat->w, input_mat->elem_c, weight_mat->h,
weight_mat->w, weight_mat->n);
weight_mat->w, weight_mat->n, pad_h, pad_w);

return ss.str();
}
Expand Down
9 changes: 8 additions & 1 deletion dabnn/layers/Binarize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,15 @@
#include "Binarize.h"

#include <dabnn/bitpack.h>
#include <dabnn/net.h>

namespace bnn {
void Binarize::forward_impl() const { ::pack_mat_64(*input_mat, *output_mat); }
void Binarize::forward_impl() const {
if (net_.lock()->new_bitpack) {
::pack_mat(*input_mat, *output_mat);
} else {
::pack_mat_64(*input_mat, *output_mat);
}
}

} // namespace bnn
5 changes: 3 additions & 2 deletions dabnn/mat.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,9 @@ inline std::ostream &operator<<(std::ostream &os, const Mat &mat) {
os << "n: " << mat.n << ", width: " << mat.w << ", height: " << mat.h
<< ", channels: " << mat.c << std::endl;
if (mat.data_type == DataType::Bit) {
return os << binrep(*static_cast<char *>(mat.data),
std::min(mat.total(), size_t{10}) * mat.elemsize);
return os << binrep(static_cast<char *>(mat.data),
std::min(mat.total(), size_t{10}) * mat.elemsize,
true);
} else {
for (size_t i = 0;
i < std::min(static_cast<decltype(mat.total())>(10), mat.total());
Expand Down
Loading

0 comments on commit 2618327

Please sign in to comment.