diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c index 24ebde05e70..93feb8f6ef1 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c @@ -58,25 +58,25 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12( const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0); const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))); - const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); + v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); + vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1); + vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1); + const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5); const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0); const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1); - const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); - const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))); - const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); - const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))); - const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7); + v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); + v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); + vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1); + vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1); + const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5); const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0); const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1); - const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); - const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32))); - const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); - const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32))); - const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB); + v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); + v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); + vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1); + vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1); + const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5); const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123); const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567); @@ -139,11 +139,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -175,11 +175,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c index 0560ddc6637..1b6d1d1f343 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c @@ -63,32 +63,32 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16( const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0); const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))); - const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); + v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); + vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1); + vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1); + const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5); const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0); const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1); - const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); - const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))); - const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); - const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))); - const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7); + v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); + v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); + vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1); + vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1); + const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5); const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0); const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1); - const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); - const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32))); - const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); - const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32))); - const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB); + v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); + v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); + vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1); + vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1); + const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5); const uint64_t vidxCD = wasm_i64x2_extract_lane(vidxCDEF, 0); const uint64_t vidxEF = wasm_i64x2_extract_lane(vidxCDEF, 1); - const float vlC = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); - const float vlD = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32))); - const float vlE = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); - const float vlF = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32))); - const v128_t vlCDEF = wasm_f32x4_make(vlC, vlD, vlE, vlF); + v128_t vlCD = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); + v128_t vlEF = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); + vlCD = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32)), vlCD, 1); + vlEF = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32)), vlEF, 1); + const v128_t vlCDEF = wasm_v32x4_shuffle(vlCD, vlEF, 0, 1, 4, 5); const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123); const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567); @@ -163,11 +163,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -199,11 +199,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c index 5002b95581d..ab1ca48211c 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c @@ -68,39 +68,39 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20( const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0); const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))); - const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); + v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); + vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1); + vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1); + const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5); const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0); const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1); - const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); - const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))); - const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); - const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))); - const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7); + v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); + v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); + vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1); + vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1); + const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5); const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0); const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1); - const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); - const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32))); - const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); - const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32))); - const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB); + v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); + v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); + vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1); + vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1); + const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5); const uint64_t vidxCD = wasm_i64x2_extract_lane(vidxCDEF, 0); const uint64_t vidxEF = wasm_i64x2_extract_lane(vidxCDEF, 1); - const float vlC = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); - const float vlD = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32))); - const float vlE = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); - const float vlF = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32))); - const v128_t vlCDEF = wasm_f32x4_make(vlC, vlD, vlE, vlF); + v128_t vlCD = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); + v128_t vlEF = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); + vlCD = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32)), vlCD, 1); + vlEF = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32)), vlEF, 1); + const v128_t vlCDEF = wasm_v32x4_shuffle(vlCD, vlEF, 0, 1, 4, 5); const uint64_t vidxGH = wasm_i64x2_extract_lane(vidxGHIJ, 0); const uint64_t vidxIJ = wasm_i64x2_extract_lane(vidxGHIJ, 1); - const float vlG = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxGH)); - const float vlH = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxGH >> 32))); - const float vlI = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxIJ)); - const float vlJ = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxIJ >> 32))); - const v128_t vlGHIJ = wasm_f32x4_make(vlG, vlH, vlI, vlJ); + v128_t vlGH = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxGH)); + v128_t vlIJ = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxIJ)); + vlGH = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxGH >> 32)), vlGH, 1); + vlIJ = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxIJ >> 32)), vlIJ, 1); + const v128_t vlGHIJ = wasm_v32x4_shuffle(vlGH, vlIJ, 0, 1, 4, 5); const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123); const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567); @@ -187,11 +187,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -223,11 +223,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c index f6f0bc16123..d39afc8fa43 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c @@ -73,46 +73,46 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24( const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0); const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))); - const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); + v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); + vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1); + vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1); + const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5); const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0); const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1); - const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); - const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))); - const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); - const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))); - const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7); + v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); + v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); + vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1); + vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1); + const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5); const uint64_t vidx89 = wasm_i64x2_extract_lane(vidx89AB, 0); const uint64_t vidxAB = wasm_i64x2_extract_lane(vidx89AB, 1); - const float vl8 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); - const float vl9 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32))); - const float vlA = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); - const float vlB = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32))); - const v128_t vl89AB = wasm_f32x4_make(vl8, vl9, vlA, vlB); + v128_t vl89 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx89)); + v128_t vlAB = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxAB)); + vl89 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx89 >> 32)), vl89, 1); + vlAB = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxAB >> 32)), vlAB, 1); + const v128_t vl89AB = wasm_v32x4_shuffle(vl89, vlAB, 0, 1, 4, 5); const uint64_t vidxCD = wasm_i64x2_extract_lane(vidxCDEF, 0); const uint64_t vidxEF = wasm_i64x2_extract_lane(vidxCDEF, 1); - const float vlC = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); - const float vlD = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32))); - const float vlE = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); - const float vlF = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32))); - const v128_t vlCDEF = wasm_f32x4_make(vlC, vlD, vlE, vlF); + v128_t vlCD = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxCD)); + v128_t vlEF = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxEF)); + vlCD = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxCD >> 32)), vlCD, 1); + vlEF = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxEF >> 32)), vlEF, 1); + const v128_t vlCDEF = wasm_v32x4_shuffle(vlCD, vlEF, 0, 1, 4, 5); const uint64_t vidxGH = wasm_i64x2_extract_lane(vidxGHIJ, 0); const uint64_t vidxIJ = wasm_i64x2_extract_lane(vidxGHIJ, 1); - const float vlG = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxGH)); - const float vlH = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxGH >> 32))); - const float vlI = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxIJ)); - const float vlJ = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxIJ >> 32))); - const v128_t vlGHIJ = wasm_f32x4_make(vlG, vlH, vlI, vlJ); + v128_t vlGH = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxGH)); + v128_t vlIJ = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxIJ)); + vlGH = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxGH >> 32)), vlGH, 1); + vlIJ = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxIJ >> 32)), vlIJ, 1); + const v128_t vlGHIJ = wasm_v32x4_shuffle(vlGH, vlIJ, 0, 1, 4, 5); const uint64_t vidxKL = wasm_i64x2_extract_lane(vidxKLMN, 0); const uint64_t vidxMN = wasm_i64x2_extract_lane(vidxKLMN, 1); - const float vlK = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxKL)); - const float vlL = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxKL >> 32))); - const float vlM = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxMN)); - const float vlN = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxMN >> 32))); - const v128_t vlKLMN = wasm_f32x4_make(vlK, vlL, vlM, vlN); + v128_t vlKL = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxKL)); + v128_t vlMN = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidxMN)); + vlKL = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxKL >> 32)), vlKL, 1); + vlMN = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidxMN >> 32)), vlMN, 1); + const v128_t vlKLMN = wasm_v32x4_shuffle(vlKL, vlMN, 0, 1, 4, 5); const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123); const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567); @@ -211,11 +211,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -247,11 +247,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c index 3651e8474ee..22371dc1522 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c @@ -46,11 +46,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x4( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -82,11 +82,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x4( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c index 912d2495d77..89389510003 100644 --- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c +++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c @@ -53,18 +53,18 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8( const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0); const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))); - const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl01 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)); + v128_t vl23 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)); + vl01 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32)), vl01, 1); + vl23 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32)), vl23, 1); + const v128_t vl0123 = wasm_v32x4_shuffle(vl01, vl23, 0, 1, 4, 5); const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0); const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1); - const float vl4 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); - const float vl5 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))); - const float vl6 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); - const float vl7 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))); - const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7); + v128_t vl45 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)); + v128_t vl67 = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)); + vl45 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32)), vl45, 1); + vl67 = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32)), vl67, 1); + const v128_t vl4567 = wasm_v32x4_shuffle(vl45, vl67, 0, 1, 4, 5); const v128_t vs0123 = wasm_i32x4_add(vl0123, ve0123); const v128_t vs4567 = wasm_i32x4_add(vl4567, ve4567); @@ -115,11 +115,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -151,11 +151,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); diff --git a/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in b/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in index cf6db17adfd..0042f7d72ea 100644 --- a/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in +++ b/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in @@ -55,11 +55,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}( $for N in range(0, BATCH_TILE, 4): const uint64_t vidx${ABC[N:N+2]} = wasm_i64x2_extract_lane(vidx${ABC[N:N+4]}, 0); const uint64_t vidx${ABC[N+2:N+4]} = wasm_i64x2_extract_lane(vidx${ABC[N:N+4]}, 1); - const float vl${ABC[N]} = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx${ABC[N:N+2]})); - const float vl${ABC[N+1]} = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx${ABC[N:N+2]} >> 32))); - const float vl${ABC[N+2]} = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx${ABC[N+2:N+4]})); - const float vl${ABC[N+3]} = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx${ABC[N+2:N+4]} >> 32))); - const v128_t vl${ABC[N:N+4]} = wasm_f32x4_make(vl${ABC[N]}, vl${ABC[N+1]}, vl${ABC[N+2]}, vl${ABC[N+3]}); + v128_t vl${ABC[N:N+2]} = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx${ABC[N:N+2]})); + v128_t vl${ABC[N+2:N+4]} = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx${ABC[N+2:N+4]})); + vl${ABC[N:N+2]} = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx${ABC[N:N+2]} >> 32)), vl${ABC[N:N+2]}, 1); + vl${ABC[N+2:N+4]} = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx${ABC[N+2:N+4]} >> 32)), vl${ABC[N+2:N+4]}, 1); + const v128_t vl${ABC[N:N+4]} = wasm_v32x4_shuffle(vl${ABC[N:N+2]}, vl${ABC[N+2:N+4]}, 0, 1, 4, 5); $for N in range(0, BATCH_TILE, 4): const v128_t vs${ABC[N:N+4]} = wasm_i32x4_add(vl${ABC[N:N+4]}, ve${ABC[N:N+4]}); @@ -111,11 +111,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias); @@ -147,11 +147,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}( const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2); const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0); const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1); - const float vl0 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); - const float vl1 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))); - const float vl2 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); - const float vl3 = *((const float*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))); - const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3); + v128_t vl_lo = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)); + v128_t vl_hi = __builtin_wasm_load32_zero((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)); + vl_lo = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); + vl_hi = __builtin_wasm_load32_lane((int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); + const v128_t vl = wasm_v32x4_shuffle(vl_lo, vl_hi, 0, 1, 4, 5); const v128_t vs = wasm_i32x4_add(vl, ve); vn = wasm_f32x4_sub(vn, vmagic_bias);