Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono][jit] Add Vector128.WithElement as intrinsic on arm64. #85158

Merged
merged 3 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -521,12 +521,12 @@ expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4
insert_i1: dest:x src1:i len:4
insert_i2: dest:x src1:i len:4
insert_i4: dest:x src1:i len:4
insert_i8: dest:x src1:i len:4
insert_r4: dest:x src1:f len:4
insert_r8: dest:x src1:f len:4
insert_i1: dest:x src1:x src2:i len:8
insert_i2: dest:x src1:x src2:i len:8
insert_i4: dest:x src1:x src2:i len:8
insert_i8: dest:x src1:x src2:i len:8
insert_r4: dest:x src1:x src2:f len:8
insert_r8: dest:x src1:x src2:f len:8
create_scalar_int: dest:x src1:i len:8
create_scalar_float: dest:x src1:f len:12
create_scalar_unsafe_int: dest:x src1:i len:4
Expand All @@ -542,6 +542,8 @@ arm64_fcvtn2: dest:x src1:x src2:x len:4 clob:1
xunop: dest:x src1:x len:4
arm64_ushl: dest:x src1:x src2:x len:4
arm64_ext_imm: dest:x src1:x src2:x len:4
xinsert_i8: dest:x src1:x src2:i src3:i len:20
xinsert_r8: dest:x src1:x src2:f src3:i len:20

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
173 changes: 110 additions & 63 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,65 @@ mono_arch_finish_init (void)
{
}

/*
 * is_type_float_macro:
 *
 *   Return a non-zero value when TYPE is one of the two floating point
 * element types checked here (MONO_TYPE_R4 or MONO_TYPE_R8), zero otherwise.
 */
static gboolean
is_type_float_macro (MonoTypeEnum type)
{
	const gboolean is_r4 = (type == MONO_TYPE_R4);
	const gboolean is_r8 = (type == MONO_TYPE_R8);

	return is_r4 || is_r8;
}

/*
 * is_type_unsigned_macro:
 *
 *   Return a non-zero value when TYPE is one of the fixed-width unsigned
 * integer types (MONO_TYPE_U1, MONO_TYPE_U2, MONO_TYPE_U4, MONO_TYPE_U8),
 * zero otherwise.
 *
 * NOTE(review): MONO_TYPE_U (native unsigned int) is not part of the set and
 * is therefore reported as "not unsigned" - confirm that callers normalize
 * native-sized types before calling this.
 */
static gboolean
is_type_unsigned_macro (MonoTypeEnum type)
{
	const gboolean is_small_unsigned = (type == MONO_TYPE_U1 || type == MONO_TYPE_U2);
	const gboolean is_large_unsigned = (type == MONO_TYPE_U4 || type == MONO_TYPE_U8);

	return is_small_unsigned || is_large_unsigned;
}

/*
 * get_vector_size_macro:
 *
 *   Pick the vector width constant for INS from the value size of INS->klass
 * as reported by mono_class_value_size (): a size of 16 yields VREG_FULL and
 * a size of 8 yields VREG_LOW.
 *
 * INS->klass must be set. Any other size hits g_assert_not_reached ().
 */
static int
get_vector_size_macro (MonoInst *ins)
{
	g_assert (ins->klass);

	const int value_size = mono_class_value_size (ins->klass, NULL);

	if (value_size == 16)
		return VREG_FULL;
	if (value_size == 8)
		return VREG_LOW;

	g_assert_not_reached ();
}

/*
 * get_type_size_macro:
 *
 *   Translate the element type TYPE into the matching TYPE_* element size
 * constant. Signed and unsigned integers of the same width share a constant,
 * MONO_TYPE_I/MONO_TYPE_U follow TARGET_SIZEOF_VOID_P, and R4/R8 map to
 * TYPE_F32/TYPE_F64.
 *
 * Any other type hits g_assert_not_reached ().
 */
static int
get_type_size_macro (MonoTypeEnum type)
{
	/* Native-sized integers take the width of a target pointer. */
#if TARGET_SIZEOF_VOID_P == 8
	const int native_int_size = TYPE_I64;
#else
	const int native_int_size = TYPE_I32;
#endif

	switch (type) {
	case MONO_TYPE_R4:
		return TYPE_F32;
	case MONO_TYPE_R8:
		return TYPE_F64;
	case MONO_TYPE_I:
	case MONO_TYPE_U:
		return native_int_size;
	case MONO_TYPE_I8:
	case MONO_TYPE_U8:
		return TYPE_I64;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
		return TYPE_I32;
	case MONO_TYPE_I2:
	case MONO_TYPE_U2:
		return TYPE_I16;
	case MONO_TYPE_I1:
	case MONO_TYPE_U1:
		return TYPE_I8;
	default:
		g_assert_not_reached ();
	}
}

/* The maximum length is 2 instructions */
static guint8*
emit_imm (guint8 *code, int dreg, int imm)
Expand Down Expand Up @@ -962,6 +1021,33 @@ emit_xextract_r8 (guint8* code, int dreg, int sreg1, int sreg2)
return ret;
}

/*
 * emit_xinsert_i8_r8:
 *
 *   Emit code which copies the vector in SRC_REG to DREG (when they differ)
 * and then overwrites one 64 bit lane of DREG with the value in REPL_REG.
 * The lane is chosen at run time from INDEX_REG: lane 0 when it is zero,
 * lane 1 for any non-zero value, so range checking is left to the caller.
 *
 * TYPE selects the insert form: floating point types use arm_neon_ins_e
 * (REPL_REG is read as a vector element), everything else uses
 * arm_neon_ins_g. Both forms are emitted with TYPE_I64.
 *
 * Layout of the emitted code, relative to CODE:
 *   [mov  dreg, src_reg]      only if dreg != src_reg
 *    cbnz index_reg, lane1
 *    ins  dreg[0], repl_reg
 *    b    done
 *   lane1:
 *    ins  dreg[1], repl_reg
 *   done:
 *
 * Returns the address following the last emitted instruction. At most
 * 5 instructions (20 bytes) are generated.
 *
 * NOTE(review): if DREG is the same register as REPL_REG but differs from
 * SRC_REG, the initial move overwrites the replacement value before it is
 * read - confirm the register allocator cannot produce that assignment.
 */
static guint8*
emit_xinsert_i8_r8 (guint8* code, MonoTypeEnum type, int dreg, int src_reg, int repl_reg, int index_reg)
{
	guint8 *ret = code;
	const gboolean use_element_insert = is_type_float_macro (type);
	/* Size in bytes of the optional leading vector move. */
	const int mov_size = (dreg != src_reg) ? 4 : 0;
	/* Branch targets, see the layout in the function comment. */
	guint8 *lane1_target = code + 12 + mov_size;
	guint8 *done_target = code + 16 + mov_size;

	if (mov_size != 0) {
		arm_neon_mov (ret, dreg, src_reg);
	}

	arm_cbnzw (ret, index_reg, lane1_target);

	/* index == 0: replace lane 0, then skip the lane 1 insert */
	if (use_element_insert) {
		arm_neon_ins_e (ret, TYPE_I64, dreg, repl_reg, 0, 0);
	} else {
		arm_neon_ins_g (ret, TYPE_I64, dreg, repl_reg, 0);
	}
	arm_b (ret, done_target);

	/* index != 0: replace lane 1 */
	if (use_element_insert) {
		arm_neon_ins_e (ret, TYPE_I64, dreg, repl_reg, 1, 0);
	} else {
		arm_neon_ins_g (ret, TYPE_I64, dreg, repl_reg, 1);
	}

	return ret;
}

static guint8*
emit_call (MonoCompile *cfg, guint8* code, MonoJumpInfoType patch_type, gconstpointer data)
{
Expand Down Expand Up @@ -3496,72 +3582,13 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
return code;
}

/*
 * is_type_float_macro:
 *
 *   Return a non-zero value when TYPE is one of the two floating point
 * element types checked here (MONO_TYPE_R4 or MONO_TYPE_R8), zero otherwise.
 */
static gboolean
is_type_float_macro (MonoTypeEnum type)
{
	const gboolean is_r4 = (type == MONO_TYPE_R4);
	const gboolean is_r8 = (type == MONO_TYPE_R8);

	return is_r4 || is_r8;
}

/*
 * is_type_unsigned_macro:
 *
 *   Return a non-zero value when TYPE is one of the fixed-width unsigned
 * integer types (MONO_TYPE_U1, MONO_TYPE_U2, MONO_TYPE_U4, MONO_TYPE_U8),
 * zero otherwise.
 *
 * NOTE(review): MONO_TYPE_U (native unsigned int) is not part of the set and
 * is therefore reported as "not unsigned" - confirm that callers normalize
 * native-sized types before calling this.
 */
static gboolean
is_type_unsigned_macro (MonoTypeEnum type)
{
	const gboolean is_small_unsigned = (type == MONO_TYPE_U1 || type == MONO_TYPE_U2);
	const gboolean is_large_unsigned = (type == MONO_TYPE_U4 || type == MONO_TYPE_U8);

	return is_small_unsigned || is_large_unsigned;
}

/*
 * get_vector_size_macro:
 *
 *   Pick the vector width constant for INS from the value size of INS->klass
 * as reported by mono_class_value_size (): a size of 16 yields VREG_FULL and
 * a size of 8 yields VREG_LOW.
 *
 * INS->klass must be set. Any other size hits g_assert_not_reached ().
 */
static int
get_vector_size_macro (MonoInst *ins)
{
	g_assert (ins->klass);

	const int value_size = mono_class_value_size (ins->klass, NULL);

	if (value_size == 16)
		return VREG_FULL;
	if (value_size == 8)
		return VREG_LOW;

	g_assert_not_reached ();
}

/*
 * get_type_size_macro:
 *
 *   Translate the element type TYPE into the matching TYPE_* element size
 * constant. Signed and unsigned integers of the same width share a constant,
 * MONO_TYPE_I/MONO_TYPE_U follow TARGET_SIZEOF_VOID_P, and R4/R8 map to
 * TYPE_F32/TYPE_F64.
 *
 * Any other type hits g_assert_not_reached ().
 */
static int
get_type_size_macro (MonoTypeEnum type)
{
	/* Native-sized integers take the width of a target pointer. */
#if TARGET_SIZEOF_VOID_P == 8
	const int native_int_size = TYPE_I64;
#else
	const int native_int_size = TYPE_I32;
#endif

	switch (type) {
	case MONO_TYPE_R4:
		return TYPE_F32;
	case MONO_TYPE_R8:
		return TYPE_F64;
	case MONO_TYPE_I:
	case MONO_TYPE_U:
		return native_int_size;
	case MONO_TYPE_I8:
	case MONO_TYPE_U8:
		return TYPE_I64;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
		return TYPE_I32;
	case MONO_TYPE_I2:
	case MONO_TYPE_U2:
		return TYPE_I16;
	case MONO_TYPE_I1:
	case MONO_TYPE_U1:
		return TYPE_I8;
	default:
		g_assert_not_reached ();
	}
}

void
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
{
MonoInst *ins;
MonoCallInst *call;
guint8 *code = cfg->native_code + cfg->code_len;
int start_offset, max_len, dreg, sreg1, sreg2;
int start_offset, max_len, dreg, sreg1, sreg2, sreg3;
target_mgreg_t imm;

if (cfg->verbose_level > 2)
Expand All @@ -3588,6 +3615,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
dreg = ins->dreg;
sreg1 = ins->sreg1;
sreg2 = ins->sreg2;
sreg3 = ins->sreg3;
imm = ins->inst_imm;

if (opcode_simd_status [ins->opcode - OP_START] == OPCODE_SIMD)
Expand Down Expand Up @@ -3869,7 +3897,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_XEXTRACT_I8:
code = emit_xextract_i8 (code, dreg, sreg1, sreg2);
break;

case OP_XEXTRACT_R8:
code = emit_xextract_r8 (code, dreg, sreg1, sreg2);
break;
Expand All @@ -3881,12 +3908,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
g_assert_not_reached ();
break;

case OP_XINSERT_I1:
case OP_XINSERT_I2:
case OP_XINSERT_I4:
case OP_XINSERT_R4:
g_assert_not_reached();
break;
case OP_XINSERT_I8:
code = emit_xinsert_i8_r8 (code, MONO_TYPE_I8, dreg, sreg1, sreg2, sreg3);
break;
case OP_XINSERT_R8:
code = emit_xinsert_i8_r8 (code, MONO_TYPE_R8, dreg, sreg1, sreg2, sreg3);
break;

case OP_INSERT_I1:
case OP_INSERT_I2:
case OP_INSERT_I4:
case OP_INSERT_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
if (dreg != sreg1)
arm_neon_mov (code, dreg, sreg1);

arm_neon_ins_g(code, t, dreg, sreg2, ins->inst_c0);
break;
}
case OP_INSERT_R4:
Expand All @@ -3900,7 +3943,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
t = SIZE_8;
break;
}
arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);

if (dreg != sreg1)
arm_neon_mov (code, dreg, sreg1);

arm_neon_ins_e(code, t, dreg, sreg2, ins->inst_c0, 0);
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
break;
}
case OP_ARM64_XTN:
Expand Down
59 changes: 53 additions & 6 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1361,7 +1361,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
case SN_Shuffle:
case SN_ToVector128:
case SN_ToVector128Unsafe:
case SN_WithElement:
return NULL;
default:
break;
Expand Down Expand Up @@ -2043,13 +2042,61 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
elems = 4;
}

if (args [1]->opcode == OP_ICONST) {
// If the index is provably a constant, we can generate vastly better code.
int index = args[1]->inst_c0;

if (index < 0 || index >= elems) {
MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems);
MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException");
}
Comment on lines +2049 to +2052
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the comparison needed here since we know the exception is deterministically going to happen?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. I'll make this an unconditional throw.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we do not have an opcode to unconditionally throw an exception. IMO adding it would be out of this PR's scope. Having the comparison in place should not hurt the normal execution path. So I propose adding an issue for the new opcode and deferring this optimization until it is in place. Are you OK with that? @tannergooding @vargaz


int insert_op = type_to_insert_op (arg0_type);
MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg);
ins->inst_c0 = index;
ins->inst_c1 = arg0_type;
return ins;
}

if (!COMPILE_LLVM(cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) {
return NULL;
}

MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems);
MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException");
int insert_op = type_to_xinsert_op (arg0_type);
MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg);
ins->sreg3 = args [1]->dreg;
ins->inst_c1 = arg0_type;
return ins;

if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) {
int insert_op = type_to_xinsert_op (arg0_type);
MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg);
ins->sreg3 = args [1]->dreg;
ins->inst_c1 = arg0_type;
return ins;
} else {
// Create a blank reg and spill it.
// Overwrite memory with original value.
// Overwrite [spilled + index << elem_size_log2] with replacement value
// Read back.
// TODO: on x86, use a LEA
MonoInst* scratch = emit_xzero (cfg, args [0]->klass);
MonoInst* scratcha;
NEW_VARLOADA_VREG (cfg, scratcha, scratch->dreg, fsig->params [0]);
MONO_ADD_INS (cfg->cbb, scratcha);
MONO_EMIT_NEW_STORE_MEMBASE (cfg, mono_type_to_store_membase (cfg, fsig->params [0]), scratcha->dreg, 0, args [0]->dreg);

int offset_reg = alloc_lreg (cfg);
MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SHL_IMM, offset_reg, args [1]->dreg, type_to_width_log2 (arg0_type));
int addr_reg = alloc_preg (cfg);
MONO_EMIT_NEW_BIALU(cfg, OP_PADD, addr_reg, scratcha->dreg, offset_reg);

MONO_EMIT_NEW_STORE_MEMBASE (cfg, mono_type_to_store_membase (cfg, fsig->params [2]), addr_reg, 0, args [2]->dreg);

MonoInst* ret;
NEW_LOAD_MEMBASE (cfg, ret, mono_type_to_load_membase (cfg, fsig->ret), scratch->dreg, scratcha->dreg, 0);
MONO_ADD_INS (cfg->cbb, ret);

return ret;
}
break;
}
case SN_WidenLower:
case SN_WidenUpper: {
Expand Down