From b409943809f7a19d3d95e07848525d9a97c2666f Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Wed, 27 Sep 2023 16:53:54 +0200 Subject: [PATCH 1/6] Fuse extract and insert into arm64 ins. --- src/mono/mono/mini/mini-arm64.c | 8 +++--- src/mono/mono/mini/simd-intrinsics.c | 37 ++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 11789395ab1d6..9c3eead0a4177 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -4065,18 +4065,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } + int idx_to = GTMREG_TO_UINT32 (ins->inst_c0) & 0xff; + int idx_from = GTMREG_TO_UINT32 (ins->inst_c0) >> 8; if (dreg != sreg1) { if (dreg != sreg2) { arm_neon_mov (code, dreg, sreg1); - arm_neon_ins_e(code, t, dreg, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0); + arm_neon_ins_e(code, t, dreg, sreg2, idx_to, idx_from); } else { arm_neon_mov (code, NEON_TMP_REG, sreg1); - arm_neon_ins_e(code, t, NEON_TMP_REG, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0); + arm_neon_ins_e(code, t, NEON_TMP_REG, sreg2, idx_to, idx_from); arm_neon_mov (code, dreg, NEON_TMP_REG); } } else { g_assert (dreg != sreg2); - arm_neon_ins_e(code, t, dreg, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0); + arm_neon_ins_e(code, t, dreg, sreg2, idx_to, idx_from); } break; } diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index c73fc05650335..9dc87666c2de1 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1077,7 +1077,18 @@ emit_vector_create_elementwise ( MonoClass *vklass = mono_class_from_mono_type_internal (vtype); MonoInst *ins = emit_xzero (cfg, vklass); for (int i = 0; i < fsig->param_count; ++i) { - if (!is_zero_const (args [i])) { + if (is_zero_const (args [i])) { + // element already set to zero +#ifdef TARGET_ARM64 + } else if (!COMPILE_LLVM (cfg) && args [i]->opcode == 
type_to_extract_op (type) && + (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { + // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. + int srcidx = args [i]->inst_c0; + ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->sreg1); + ins->inst_c0 = i | (srcidx << 8); + ins->inst_c1 = type; +#endif + } else { ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->dreg); ins->inst_c0 = i; ins->inst_c1 = type; @@ -1086,6 +1097,7 @@ emit_vector_create_elementwise ( return ins; } + #if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM) static int @@ -2287,10 +2299,25 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } int insert_op = type_to_insert_op (arg0_type); - MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); - ins->inst_c0 = index; - ins->inst_c1 = arg0_type; - return ins; + +#ifdef TARGET_ARM64 + if (!COMPILE_LLVM (cfg) && args [2]->opcode == type_to_extract_op (arg0_type) && (arg0_type == MONO_TYPE_R4 || arg0_type == MONO_TYPE_R8)) { + // Optimize WithElement(GetElement(x, const_1), const_2) into one ins instruction on arm64 + // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. + int srcidx = args [2]->inst_c0; + MonoInst* ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->sreg1); + ins->inst_c0 = index | (srcidx << 8); + ins->inst_c1 = arg0_type; + return ins; + } + else +#endif + { + MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); + ins->inst_c0 = index; + ins->inst_c1 = arg0_type; + return ins; + } } if (!COMPILE_LLVM (cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) From c8c340ed700cf76f88c75cce9c7b8bcc89046587 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Wed, 27 Sep 2023 17:36:29 +0200 Subject: [PATCH 2/6] Extending to vector4. 
--- src/mono/mono/mini/simd-intrinsics.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 9dc87666c2de1..a873c2941eefb 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2863,10 +2863,24 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); } - ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->dreg); - ins->inst_c0 = index; - ins->inst_c1 = MONO_TYPE_R4; - ins->dreg = dreg; +#ifdef TARGET_ARM64 + if (!COMPILE_LLVM (cfg) && args [2]->opcode == OP_EXTRACT_R4) { + // Optimize x[const_1] = y[const_2] into one ins instruction on arm64 + // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. + int srcidx = args [2]->inst_c0; + ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->sreg1); + ins->inst_c0 = index | (srcidx << 8); + ins->inst_c1 = MONO_TYPE_R4; + return ins; + } + else +#endif + { + ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->dreg); + ins->inst_c0 = index; + ins->inst_c1 = MONO_TYPE_R4; + ins->dreg = dreg; + } if (indirect) { EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, args [0]->dreg, 0, dreg); From 0b73b3f64d8089df6c719d8652c8e114f4d6a7e0 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Fri, 29 Sep 2023 11:47:10 +0200 Subject: [PATCH 3/6] Ins index is now a function. Removed newline. 
--- src/mono/mono/mini/simd-intrinsics.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index a873c2941eefb..08e7a515d92b2 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1068,6 +1068,16 @@ emit_hardware_intrinsics ( return custom_emit (cfg, fsig, args, klass, intrin_group, info, id, arg0_type, is_64bit); } +#ifdef TARGET_ARM64 +static int +arm64_make_ins_index (int destidx, int srcidx, int numelems) +{ + g_assert (destidx < numelems); + g_assert (srcidx < numelems); + return destidx | (srcidx << 8); +} +#endif + static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, @@ -1083,9 +1093,8 @@ emit_vector_create_elementwise ( } else if (!COMPILE_LLVM (cfg) && args [i]->opcode == type_to_extract_op (type) && (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. - int srcidx = args [i]->inst_c0; ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->sreg1); - ins->inst_c0 = i | (srcidx << 8); + ins->inst_c0 = arm64_make_ins_index (i, args [i]->inst_c0, fsig->param_count); ins->inst_c1 = type; #endif } else { @@ -1097,7 +1106,6 @@ emit_vector_create_elementwise ( return ins; } - #if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM) static int @@ -2304,9 +2312,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!COMPILE_LLVM (cfg) && args [2]->opcode == type_to_extract_op (arg0_type) && (arg0_type == MONO_TYPE_R4 || arg0_type == MONO_TYPE_R8)) { // Optimize WithElement(GetElement(x, const_1), const_2) into one ins instruction on arm64 // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. 
- int srcidx = args [2]->inst_c0; MonoInst* ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->sreg1); - ins->inst_c0 = index | (srcidx << 8); + ins->inst_c0 = arm64_make_ins_index (index, args [2]->inst_c0, elems); ins->inst_c1 = arg0_type; return ins; } @@ -2867,10 +2874,10 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f if (!COMPILE_LLVM (cfg) && args [2]->opcode == OP_EXTRACT_R4) { // Optimize x[const_1] = y[const_2] into one ins instruction on arm64 // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. - int srcidx = args [2]->inst_c0; ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->sreg1); - ins->inst_c0 = index | (srcidx << 8); + ins->inst_c0 = arm64_make_ins_index (index, args [2]->inst_c0, fsig->param_count); ins->inst_c1 = MONO_TYPE_R4; + ins->dreg = dreg; return ins; } else From 5a0bb3bfe668b3c20f78f6432fdc97684211565e Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Fri, 29 Sep 2023 13:30:45 +0200 Subject: [PATCH 4/6] Refactoring. 
--- src/mono/mono/mini/simd-intrinsics.c | 96 ++++++++++------------------ 1 file changed, 33 insertions(+), 63 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 08e7a515d92b2..385ae242e93ca 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1068,41 +1068,42 @@ emit_hardware_intrinsics ( return custom_emit (cfg, fsig, args, klass, intrin_group, info, id, arg0_type, is_64bit); } -#ifdef TARGET_ARM64 -static int -arm64_make_ins_index (int destidx, int srcidx, int numelems) +static MonoInst* +emit_vector_insert_element ( + MonoCompile* cfg, MonoClass* vklass, MonoInst* ins, MonoTypeEnum type, MonoInst* element, + int index, gboolean is_zero_inited) { - g_assert (destidx < numelems); - g_assert (srcidx < numelems); - return destidx | (srcidx << 8); -} + int op = type_to_insert_op (type); + + if (is_zero_inited && is_zero_const (element)) { + // element already set to zero +#ifdef TARGET_ARM64 + } else if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && + (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { + // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. 
+ ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->sreg1); + ins->inst_c0 = index | ((element->inst_c0) << 8); + ins->inst_c1 = type; #endif + } else { + ins = emit_simd_ins (cfg, vklass, op, ins->dreg, element->dreg); + ins->inst_c0 = index; + ins->inst_c1 = type; + } + + return ins; +} static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, MonoTypeEnum type, MonoInst **args) { - int op = type_to_insert_op (type); MonoClass *vklass = mono_class_from_mono_type_internal (vtype); MonoInst *ins = emit_xzero (cfg, vklass); - for (int i = 0; i < fsig->param_count; ++i) { - if (is_zero_const (args [i])) { - // element already set to zero -#ifdef TARGET_ARM64 - } else if (!COMPILE_LLVM (cfg) && args [i]->opcode == type_to_extract_op (type) && - (type == MONO_TYPE_R4 || type == MONO_TYPE_R8)) { - // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. - ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->sreg1); - ins->inst_c0 = arm64_make_ins_index (i, args [i]->inst_c0, fsig->param_count); - ins->inst_c1 = type; -#endif - } else { - ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->dreg); - ins->inst_c0 = i; - ins->inst_c1 = type; - } - } + for (int i = 0; i < fsig->param_count; ++i) + emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE); + return ins; } @@ -2300,31 +2301,12 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (args [1]->opcode == OP_ICONST) { // If the index is provably a constant, we can generate vastly better code. 
int index = GTMREG_TO_INT (args[1]->inst_c0); - if (index < 0 || index >= elems) { MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); } - int insert_op = type_to_insert_op (arg0_type); - -#ifdef TARGET_ARM64 - if (!COMPILE_LLVM (cfg) && args [2]->opcode == type_to_extract_op (arg0_type) && (arg0_type == MONO_TYPE_R4 || arg0_type == MONO_TYPE_R8)) { - // Optimize WithElement(GetElement(x, const_1), const_2) into one ins instruction on arm64 - // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. - MonoInst* ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->sreg1); - ins->inst_c0 = arm64_make_ins_index (index, args [2]->inst_c0, elems); - ins->inst_c1 = arg0_type; - return ins; - } - else -#endif - { - MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); - ins->inst_c0 = index; - ins->inst_c1 = arg0_type; - return ins; - } + return emit_vector_insert_element (cfg, klass, args [0], arg0_type, args [2], index, FALSE); } if (!COMPILE_LLVM (cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) @@ -2725,11 +2707,9 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f ins->dreg = dreg; ins->inst_c1 = MONO_TYPE_R4; - for (int i = 1; i < fsig->param_count; ++i) { - ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, ins->dreg, args [i + 1]->dreg); - ins->inst_c0 = i; - ins->inst_c1 = MONO_TYPE_R4; - } + for (int i = 1; i < fsig->param_count; ++i) + ins = emit_vector_insert_element (cfg, klass, ins, MONO_TYPE_R4, args [i + 1], i, FALSE); + ins->dreg = dreg; if (indirect) { @@ -2870,19 +2850,9 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); } -#ifdef TARGET_ARM64 - if (!COMPILE_LLVM (cfg) && args [2]->opcode == OP_EXTRACT_R4) { - // Optimize x[const_1] = y[const_2] into one ins 
instruction on arm64 - // OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types. - ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->sreg1); - ins->inst_c0 = arm64_make_ins_index (index, args [2]->inst_c0, fsig->param_count); - ins->inst_c1 = MONO_TYPE_R4; - ins->dreg = dreg; - return ins; - } - else -#endif - { + if (args [0]->dreg == dreg) { + ins = emit_vector_insert_element (cfg, klass, args [0], MONO_TYPE_R4, args [2], index, FALSE); + } else { ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->dreg); ins->inst_c0 = index; ins->inst_c1 = MONO_TYPE_R4; From 828103af7c6d90cc504eef7b38cf50691e080e75 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Fri, 29 Sep 2023 15:01:39 +0200 Subject: [PATCH 5/6] SIMD extract ops have no side effects (to facilitate their elimination). Fixed bug. --- src/mono/mono/mini/method-to-ir.c | 6 ++++++ src/mono/mono/mini/simd-intrinsics.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/method-to-ir.c b/src/mono/mono/mini/method-to-ir.c index 17f2366846381..ef08bc149faa3 100644 --- a/src/mono/mono/mini/method-to-ir.c +++ b/src/mono/mono/mini/method-to-ir.c @@ -12643,6 +12643,12 @@ mono_op_no_side_effects (int opcode) case OP_NOT_NULL: case OP_IL_SEQ_POINT: case OP_RTTYPE: + case OP_EXTRACT_I1: + case OP_EXTRACT_I2: + case OP_EXTRACT_I4: + case OP_EXTRACT_I8: + case OP_EXTRACT_R4: + case OP_EXTRACT_R8: return TRUE; default: return FALSE; diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 385ae242e93ca..ceebcf784eedc 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1102,7 +1102,7 @@ emit_vector_create_elementwise ( MonoClass *vklass = mono_class_from_mono_type_internal (vtype); MonoInst *ins = emit_xzero (cfg, vklass); for (int i = 0; i < fsig->param_count; ++i) - emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE); + ins = emit_vector_insert_element 
(cfg, vklass, ins, type, args[i], i, TRUE); return ins; } From b388c462f698e2bb307e286d3f05f6ffb2a1aab3 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Fri, 29 Sep 2023 15:36:09 +0200 Subject: [PATCH 6/6] Fixed arm32 build. --- src/mono/mono/mini/method-to-ir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mono/mono/mini/method-to-ir.c b/src/mono/mono/mini/method-to-ir.c index ef08bc149faa3..9ed49621efb31 100644 --- a/src/mono/mono/mini/method-to-ir.c +++ b/src/mono/mono/mini/method-to-ir.c @@ -12643,12 +12643,14 @@ mono_op_no_side_effects (int opcode) case OP_NOT_NULL: case OP_IL_SEQ_POINT: case OP_RTTYPE: +#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) || defined(TARGET_ARM64) case OP_EXTRACT_I1: case OP_EXTRACT_I2: case OP_EXTRACT_I4: case OP_EXTRACT_I8: case OP_EXTRACT_R4: case OP_EXTRACT_R8: +#endif return TRUE; default: return FALSE;