Skip to content

Commit

Permalink
[mono][jit] Fuse SIMD extract and insert on arm64 (#92714)
Browse files Browse the repository at this point in the history
* Fuse extract and insert into arm64 ins.

* Extending to vector4.

* Ins index is now a function. Removed newline.

* Refactoring.

* SIMD extract ops have no side effects (to facilitate their elimination). Fixed bug.

* Fixed arm32 build.
  • Loading branch information
jandupej committed Oct 4, 2023
1 parent 55945c6 commit 6486250
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 26 deletions.
8 changes: 8 additions & 0 deletions src/mono/mono/mini/method-to-ir.c
Original file line number Diff line number Diff line change
Expand Up @@ -12746,6 +12746,14 @@ mono_op_no_side_effects (int opcode)
case OP_NOT_NULL:
case OP_IL_SEQ_POINT:
case OP_RTTYPE:
#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) || defined(TARGET_ARM64)
case OP_EXTRACT_I1:
case OP_EXTRACT_I2:
case OP_EXTRACT_I4:
case OP_EXTRACT_I8:
case OP_EXTRACT_R4:
case OP_EXTRACT_R8:
#endif
return TRUE;
default:
return FALSE;
Expand Down
8 changes: 5 additions & 3 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -4061,18 +4061,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
break;
}

int idx_to = GTMREG_TO_UINT32 (ins->inst_c0) & 0xff;
int idx_from = GTMREG_TO_UINT32 (ins->inst_c0) >> 8;
if (dreg != sreg1) {
if (dreg != sreg2) {
arm_neon_mov (code, dreg, sreg1);
arm_neon_ins_e(code, t, dreg, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0);
arm_neon_ins_e(code, t, dreg, sreg2, idx_to, idx_from);
} else {
arm_neon_mov (code, NEON_TMP_REG, sreg1);
arm_neon_ins_e(code, t, NEON_TMP_REG, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0);
arm_neon_ins_e(code, t, NEON_TMP_REG, sreg2, idx_to, idx_from);
arm_neon_mov (code, dreg, NEON_TMP_REG);
}
} else {
g_assert (dreg != sreg2);
arm_neon_ins_e(code, t, dreg, sreg2, GTMREG_TO_UINT32 (ins->inst_c0), 0);
arm_neon_ins_e(code, t, dreg, sreg2, idx_to, idx_from);
}
break;
}
Expand Down
64 changes: 41 additions & 23 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1068,21 +1068,42 @@ emit_hardware_intrinsics (
return custom_emit (cfg, fsig, args, klass, intrin_group, info, id, arg0_type, is_64bit);
}

/*
 * emit_vector_insert_element:
 *
 *   Emit an insert of ELEMENT into lane INDEX of the vector produced by INS
 * and return the instruction that now yields the vector.
 *
 *   - If IS_ZERO_INITED is set and ELEMENT is a zero constant, nothing is
 *     emitted and INS is returned unchanged.
 *   - On arm64 (non-LLVM), when ELEMENT is itself the matching floating point
 *     extract op, the insert reads directly from the extract's source
 *     (element->sreg1). The destination lane goes in the low 8 bits of
 *     inst_c0 and the extract's inst_c0 in the bits above.
 *   - Otherwise a plain insert from element->dreg is emitted with
 *     inst_c0 = INDEX.
 */
static MonoInst*
emit_vector_insert_element (
	MonoCompile* cfg, MonoClass* vklass, MonoInst* ins, MonoTypeEnum type, MonoInst* element,
	int index, gboolean is_zero_inited)
{
	int insert_op = type_to_insert_op (type);

	// The lane already holds zero; no instruction is needed.
	if (is_zero_inited && is_zero_const (element))
		return ins;

#ifdef TARGET_ARM64
	// OP_INSERT_Ix inserts from GP reg, not SIMD. Cannot optimize for int types.
	gboolean is_fp_type = (type == MONO_TYPE_R4 || type == MONO_TYPE_R8);
	if (!COMPILE_LLVM (cfg) && element->opcode == type_to_extract_op (type) && is_fp_type) {
		MonoInst *fused = emit_simd_ins (cfg, vklass, insert_op, ins->dreg, element->sreg1);
		// Pack both lanes: destination in bits 0-7, source from bit 8 up.
		fused->inst_c0 = index | ((element->inst_c0) << 8);
		fused->inst_c1 = type;
		return fused;
	}
#endif

	MonoInst *inserted = emit_simd_ins (cfg, vklass, insert_op, ins->dreg, element->dreg);
	inserted->inst_c0 = index;
	inserted->inst_c1 = type;
	return inserted;
}

/*
 * emit_vector_create_elementwise:
 *
 *   Build a vector of class VTYPE one element at a time: start from the
 * result of emit_xzero () and insert each of the fsig->param_count values in
 * ARGS at the lane equal to its argument position. Returns the last
 * instruction in the chain (the emit_xzero () result if nothing was inserted).
 *
 * NOTE(review): this text comes from a diff view without +/- markers; the
 * first loop (and the local `op`) appear to be the pre-change lines that the
 * second loop replaces -- confirm against the actual file.
 */
static MonoInst *
emit_vector_create_elementwise (
MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype,
MonoTypeEnum type, MonoInst **args)
{
int op = type_to_insert_op (type);
MonoClass *vklass = mono_class_from_mono_type_internal (vtype);
// Base value for the insert chain; presumably an all-zero vector, which is
// why zero-constant arguments are skipped below -- TODO confirm.
MonoInst *ins = emit_xzero (cfg, vklass);
// Inline form: emit an insert for every argument that is not a zero constant.
for (int i = 0; i < fsig->param_count; ++i) {
if (!is_zero_const (args [i])) {
ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->dreg);
ins->inst_c0 = i;
ins->inst_c1 = type;
}
}
// Helper form: is_zero_inited = TRUE lets the helper skip zero constants.
for (int i = 0; i < fsig->param_count; ++i)
ins = emit_vector_insert_element (cfg, vklass, ins, type, args[i], i, TRUE);

return ins;
}

Expand Down Expand Up @@ -2282,17 +2303,12 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
if (args [1]->opcode == OP_ICONST) {
// If the index is provably a constant, we can generate vastly better code.
int index = GTMREG_TO_INT (args[1]->inst_c0);

if (index < 0 || index >= elems) {
MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems);
MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException");
}

int insert_op = type_to_insert_op (arg0_type);
MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg);
ins->inst_c0 = index;
ins->inst_c1 = arg0_type;
return ins;
return emit_vector_insert_element (cfg, klass, args [0], arg0_type, args [2], index, FALSE);
}

if (!COMPILE_LLVM (cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST)
Expand Down Expand Up @@ -2690,11 +2706,9 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
ins->dreg = dreg;
ins->inst_c1 = MONO_TYPE_R4;

for (int i = 1; i < fsig->param_count; ++i) {
ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, ins->dreg, args [i + 1]->dreg);
ins->inst_c0 = i;
ins->inst_c1 = MONO_TYPE_R4;
}
for (int i = 1; i < fsig->param_count; ++i)
ins = emit_vector_insert_element (cfg, klass, ins, MONO_TYPE_R4, args [i + 1], i, FALSE);

ins->dreg = dreg;

if (indirect) {
Expand Down Expand Up @@ -2835,10 +2849,14 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException");
}

ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->dreg);
ins->inst_c0 = index;
ins->inst_c1 = MONO_TYPE_R4;
ins->dreg = dreg;
if (args [0]->dreg == dreg) {
ins = emit_vector_insert_element (cfg, klass, args [0], MONO_TYPE_R4, args [2], index, FALSE);
} else {
ins = emit_simd_ins (cfg, klass, OP_INSERT_R4, dreg, args [2]->dreg);
ins->inst_c0 = index;
ins->inst_c1 = MONO_TYPE_R4;
ins->dreg = dreg;
}

if (indirect) {
EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, args [0]->dreg, 0, dreg);
Expand Down

0 comments on commit 6486250

Please sign in to comment.