Skip to content

Commit

Permalink
Fix bug of armv7 bgemm, add armv7 implementation of maxpool and avepoo…
Browse files Browse the repository at this point in the history
…l (test not passed)
  • Loading branch information
daquexian committed May 29, 2019
1 parent 670b2ec commit 48cde00
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 286 deletions.
2 changes: 1 addition & 1 deletion .daq_pm/configs/bgemm_test_v7
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ name binary-nn
type cpp
build_dir build_test_v7
target bgemm_test
cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-28 -DANDROID_ABI=armeabi-v7a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -GNinja
cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-25 -DANDROID_ABI=armeabi-v7a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -GNinja
binary ~/adb_push_and_run.sh tests/bgemm_test
7 changes: 7 additions & 0 deletions .daq_pm/configs/net_test_v7
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# This is a configuration file for [project_manager.vim](https://github.com/daquexian/project_manager.vim)
name binary-nn
type cpp
build_dir build_test_v7
target net_test
cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-21 -DANDROID_ABI=armeabi-v7a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -GNinja
binary ~/adb_push_and_run.sh tests/net_test
282 changes: 5 additions & 277 deletions dabnn/bgemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -562,12 +562,13 @@ inline void micro_kernel(int64_t kc, float *c, const uint64_t *a,
"vzip.u32 q13, q15 \n"
"vzip.u32 q12, q13 \n"
"vadd.u32 q3, q3, q12 \n"

"bne 0b \n"

"vcvt.u32.f32 q0, q0 \n"
"vcvt.u32.f32 q1, q1 \n"
"vcvt.u32.f32 q2, q2 \n"
"vcvt.u32.f32 q3, q3 \n"
"vcvt.f32.u32 q0, q0 \n"
"vcvt.f32.u32 q1, q1 \n"
"vcvt.f32.u32 q2, q2 \n"
"vcvt.f32.u32 q3, q3 \n"
"vst1.32 q0, [%1]! \n"
"vst1.32 q1, [%1]! \n"
"vst1.32 q2, [%1]! \n"
Expand All @@ -579,279 +580,6 @@ inline void micro_kernel(int64_t kc, float *c, const uint64_t *a,
:
: "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
/*
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 \n"
"ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] \n"
"0: \n"
"ld1 {v18.2d, v19.2d, v20.2d, v21.2d}, [%3], #64 \n"
"ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [%2], #64 \n"
"eor v26.16b, v12.16b, v18.16b \n"
"eor v27.16b, v12.16b, v19.16b \n"
"eor v28.16b, v12.16b, v20.16b \n"
"eor v29.16b, v12.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [%3], #64 \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"ld1 {v16.2d, v17.2d}, [%2], #32 \n"
"eor v27.16b, v12.16b, v22.16b \n"
"eor v28.16b, v12.16b, v23.16b \n"
"eor v29.16b, v12.16b, v24.16b \n"
"eor v30.16b, v12.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v0.4s, v0.4s, v26.4s \n" // delay
"add v1.4s, v1.4s, v27.4s \n"
// The No.1 col of C is finished
"eor v26.16b, v13.16b, v18.16b \n"
"eor v27.16b, v13.16b, v19.16b \n"
"eor v28.16b, v13.16b, v20.16b \n"
"eor v29.16b, v13.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"eor v27.16b, v13.16b, v22.16b \n"
"eor v28.16b, v13.16b, v23.16b \n"
"eor v29.16b, v13.16b, v24.16b \n"
"eor v30.16b, v13.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v2.4s, v2.4s, v26.4s \n" // delay
"add v3.4s, v3.4s, v27.4s \n"
// The No.2 col of C is finished
"eor v26.16b, v14.16b, v18.16b \n"
"eor v27.16b, v14.16b, v19.16b \n"
"eor v28.16b, v14.16b, v20.16b \n"
"eor v29.16b, v14.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"eor v27.16b, v14.16b, v22.16b \n"
"eor v28.16b, v14.16b, v23.16b \n"
"eor v29.16b, v14.16b, v24.16b \n"
"eor v30.16b, v14.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v4.4s, v4.4s, v26.4s \n" // delay
"add v5.4s, v5.4s, v27.4s \n"
// The No.3 col of C is finished
"prfm pldl1keep, [%3, #128] \n"
"eor v26.16b, v15.16b, v18.16b \n"
"eor v27.16b, v15.16b, v19.16b \n"
"eor v28.16b, v15.16b, v20.16b \n"
"eor v29.16b, v15.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"prfm pldl1keep, [%2, #128] \n"
"eor v27.16b, v15.16b, v22.16b \n"
"eor v28.16b, v15.16b, v23.16b \n"
"eor v29.16b, v15.16b, v24.16b \n"
"eor v30.16b, v15.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v6.4s, v6.4s, v26.4s \n" // delay
"add v7.4s, v7.4s, v27.4s \n"
// The No.4 col of C is finished
"eor v26.16b, v16.16b, v18.16b \n"
"eor v27.16b, v16.16b, v19.16b \n"
"eor v28.16b, v16.16b, v20.16b \n"
"eor v29.16b, v16.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"eor v27.16b, v16.16b, v22.16b \n"
"eor v28.16b, v16.16b, v23.16b \n"
"eor v29.16b, v16.16b, v24.16b \n"
"eor v30.16b, v16.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v8.4s, v8.4s, v26.4s \n" // delay
"add v9.4s, v9.4s, v27.4s \n"
// The No.5 col of C is finished
"eor v26.16b, v17.16b, v18.16b \n"
"eor v27.16b, v17.16b, v19.16b \n"
"eor v28.16b, v17.16b, v20.16b \n"
"eor v29.16b, v17.16b, v21.16b \n"
"cnt v26.16b, v26.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"addv b26, v26.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"ins v26.s[1], v27.s[0] \n"
"ins v26.s[2], v28.s[0] \n"
"ins v26.s[3], v29.s[0] \n"
"subs %0, %0, #1 \n"
"eor v27.16b, v17.16b, v22.16b \n"
"eor v28.16b, v17.16b, v23.16b \n"
"eor v29.16b, v17.16b, v24.16b \n"
"eor v30.16b, v17.16b, v25.16b \n"
"cnt v27.16b, v27.16b \n"
"cnt v28.16b, v28.16b \n"
"cnt v29.16b, v29.16b \n"
"cnt v30.16b, v30.16b \n"
"addv b27, v27.16b \n"
"addv b28, v28.16b \n"
"addv b29, v29.16b \n"
"addv b30, v30.16b \n"
"ins v27.s[1], v28.s[0] \n"
"ins v27.s[2], v29.s[0] \n"
"ins v27.s[3], v30.s[0] \n"
"add v10.4s, v10.4s, v26.4s \n" // delay
"add v11.4s, v11.4s, v27.4s \n"
// The No.6 col of C is finished
"bne 0b \n"
"ucvtf v0.4s, v0.4s \n"
"ucvtf v1.4s, v1.4s \n"
"ucvtf v2.4s, v2.4s \n"
"ucvtf v3.4s, v3.4s \n"
"ucvtf v4.4s, v4.4s \n"
"ucvtf v5.4s, v5.4s \n"
"st1 {v0.4s}, [%1], #16 \n"
"ucvtf v6.4s, v6.4s \n"
"st1 {v1.4s}, [%1], #16 \n"
"ucvtf v7.4s, v7.4s \n"
"st1 {v2.4s}, [%1], #16 \n"
"ucvtf v8.4s, v8.4s \n"
"st1 {v3.4s}, [%1], #16 \n"
"ucvtf v9.4s, v9.4s \n"
"st1 {v4.4s}, [%1], #16 \n"
"ucvtf v10.4s, v10.4s \n"
"st1 {v5.4s}, [%1], #16 \n"
"ucvtf v11.4s, v11.4s \n"
"st1 {v6.4s}, [%1], #16 \n"
"st1 {v7.4s}, [%1], #16 \n"
"st1 {v8.4s}, [%1], #16 \n"
"st1 {v9.4s}, [%1], #16 \n"
"st1 {v10.4s}, [%1], #16 \n"
"st1 {v11.4s}, [%1], #16 \n"
: "+r"(kc), // %0
"+r"(c), // %1
"+r"(b), // %2
"+r"(a) // %3
:
: "cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28", "v29", "v30");
*/

#endif // __aarch64__
}
#endif // __ARM_NEON
Expand Down
38 changes: 34 additions & 4 deletions dabnn/layers/AvePool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

namespace bnn {

#ifdef __aarch64__
#ifdef __ARM_NEON
void ave_pool_2x2_s2(const bnn::Mat &input, bnn::Mat &output) {
FORZ(h, output.h) {
FORZ(w, output.w) {
Expand All @@ -17,6 +17,7 @@ void ave_pool_2x2_s2(const bnn::Mat &input, bnn::Mat &output) {
const float *ptr3 = input.point<float>(h * 2 + 1, w * 2 + 1);
float *output_ptr = output.point<float>(h, w);
size_t nn = input.c >> 2;
#ifdef __aarch64__
asm volatile(
"fmov s30, #4.0 \n"
"dup v30.4s, v30.s[0] \n"
Expand Down Expand Up @@ -46,10 +47,39 @@ void ave_pool_2x2_s2(const bnn::Mat &input, bnn::Mat &output) {
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v30");
#else // __aarch64__
asm volatile(
"vmov.f32 q13, #0.25 \n"
"0: \n"
"vld1.32 q0, [%0]! \n"
"pld [%0, #128] \n"
"vld1.32 q1, [%1]! \n"
"pld [%1, #128] \n"
"vld1.32 q2, [%2]! \n"
"pld [%2, #128] \n"
"vld1.32 q3, [%3]! \n"
"pld [%3, #128] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q2, q2, q3 \n"
"vadd.f32 q0, q0, q2 \n"
"vmul.f32 q0, q0, q13 \n"
"subs %5, %5, #1 \n"
"vst1.32 q0, [%4]! \n"
"bne 0b \n"

: "+r"(ptr0), // %0
"+r"(ptr1), // %1
"+r"(ptr2), // %2
"+r"(ptr3), // %3
"+r"(output_ptr), // %4
"+r"(nn) // %5
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q13");
#endif // __aarch64__
}
}
}
#endif // __aarch64__
#endif // __ARM_NEON

void ave_pool_fallback(const bnn::Mat &input, const size_t pad_h,
const size_t pad_w, const size_t stride_h,
Expand Down Expand Up @@ -116,7 +146,7 @@ AvePool::AvePool(NetCP net, const std::string &name, css input, css output,
}

void AvePool::forward_impl() const {
#ifdef __aarch64__
#ifdef __ARM_NEON
if (stride_h == 2 && stride_w == 2 && kernel_h == 2 && kernel_w == 2 &&
input_mat->c % 4 == 0) {
pad(*input_mat, pad_h, pad_w, *padded_mat);
Expand All @@ -128,7 +158,7 @@ void AvePool::forward_impl() const {
#else
ave_pool_fallback(*input_mat, pad_h, pad_w, stride_h, stride_w,
kernel_h, kernel_w, *output_mat);
#endif // __aarch64__
#endif // __ARM_NEON
}

} // namespace bnn
Loading

0 comments on commit 48cde00

Please sign in to comment.