Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Leverage experimental Load-Lane Wasm SIMD instructions #1199

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,25 +58,25 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(

const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0);
const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)));
const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01));
v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23));
vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1);
vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1);
const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5);
const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0);
const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1);
const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45));
const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)));
const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67));
const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)));
const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7);
v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45));
v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67));
vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1);
vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1);
const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5);
const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0);
const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1);
const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89));
const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)));
const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB));
const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)));
const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB);
v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89));
v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB));
vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1);
vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1);
const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5);

const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123);
const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567);
Expand Down Expand Up @@ -139,11 +139,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(
const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)));
const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5);

const v128_t vs = wasm_i32x4_add(vl, ve);
vn = wasm_f32x4_sub(vn, vmagic_bias);
Expand Down Expand Up @@ -175,11 +175,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(
const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)));
const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5);

const v128_t vs = wasm_i32x4_add(vl, ve);
vn = wasm_f32x4_sub(vn, vmagic_bias);
Expand Down
60 changes: 30 additions & 30 deletions src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,32 +63,32 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(

const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0);
const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)));
const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01));
v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23));
vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1);
vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1);
const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5);
const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0);
const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1);
const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45));
const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)));
const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67));
const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)));
const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7);
v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45));
v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67));
vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1);
vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1);
const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5);
const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0);
const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1);
const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89));
const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)));
const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB));
const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)));
const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB);
v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89));
v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB));
vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1);
vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1);
const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5);
const uint64_t vidxCD = wasm_i64x2_extract_lane(vidxCDEF, 0);
const uint64_t vidxEF = wasm_i64x2_extract_lane(vidxCDEF, 1);
const float vlC = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD));
const float vlD = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32)));
const float vlE = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF));
const float vlF = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32)));
const v128_t vlCDEF = wasm_f32x4_make(vlC, vlD, vlE, vlF);
v128_t vlCD = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD));
v128_t vlEF = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF));
vlCD = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32)), vlCD, 1);
vlEF = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32)), vlEF, 1);
const v128_t vlCDEF = wasm_v32x4_shuffle(vlCD, vlEF, 0, 1, 4, 5);

const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123);
const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567);
Expand Down Expand Up @@ -163,11 +163,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(
const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)));
const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5);

const v128_t vs = wasm_i32x4_add(vl, ve);
vn = wasm_f32x4_sub(vn, vmagic_bias);
Expand Down Expand Up @@ -199,11 +199,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(
const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)));
const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)));
const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo));
v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi));
vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5);

const v128_t vs = wasm_i32x4_add(vl, ve);
vn = wasm_f32x4_sub(vn, vmagic_bias);
Expand Down
Loading