/* Patch overview: three files change. cpu-arm64.mdesc gives the insert_* opcodes a vector src1 plus a scalar src2 and adds xinsert_i8 and xinsert_r8. mini-arm64.c adds emit_xinsert_i8_r8 and switches the OP_INSERT_* cases to the new operand layout. simd-intrinsics.c drops SN_WithElement from a case list that returns NULL and adds a constant-index path and a memory round-trip path. */ diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index cfb6dc74e4efd..1480b38aa72b6 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -521,12 +521,12 @@ expand_i4: dest:x src1:i len:4 expand_i8: dest:x src1:i len:4 expand_r4: dest:x src1:f len:4 expand_r8: dest:x src1:f len:4 -insert_i1: dest:x src1:i len:4 -insert_i2: dest:x src1:i len:4 -insert_i4: dest:x src1:i len:4 -insert_i8: dest:x src1:i len:4 -insert_r4: dest:x src1:f len:4 -insert_r8: dest:x src1:f len:4 +insert_i1: dest:x src1:x src2:i len:8 +insert_i2: dest:x src1:x src2:i len:8 +insert_i4: dest:x src1:x src2:i len:8 +insert_i8: dest:x src1:x src2:i len:8 +insert_r4: dest:x src1:x src2:f len:8 +insert_r8: dest:x src1:x src2:f len:8 create_scalar_int: dest:x src1:i len:8 create_scalar_float: dest:x src1:f len:12 create_scalar_unsafe_int: dest:x src1:i len:4 @@ -542,6 +542,8 @@ arm64_fcvtn2: dest:x src1:x src2:x len:4 clob:1 xunop: dest:x src1:x len:4 arm64_ushl: dest:x src1:x src2:x len:4 arm64_ext_imm: dest:x src1:x src2:x len:4 +xinsert_i8: dest:x src1:x src2:i src3:i len:20 +xinsert_r8: dest:x src1:x src2:f src3:i len:20 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 938b351d3f067..17b5c14995e84 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -408,6 +408,65 @@ mono_arch_finish_init (void) { } /* Helper definitions added here; a later hunk in this file deletes textually identical definitions further down. emit_xinsert_i8_r8, added by the next hunk, calls is_type_float_macro. */ +static gboolean +is_type_float_macro (MonoTypeEnum type) +{ + return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8); +} + +static gboolean +is_type_unsigned_macro (MonoTypeEnum type) +{ + return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8); +} + +static int +get_vector_size_macro (MonoInst *ins) +{ + g_assert (ins->klass); + int size = mono_class_value_size (ins->klass, NULL); + switch (size) { + case 16: + return VREG_FULL; + case 8: + 
return VREG_LOW; + default: + g_assert_not_reached (); + } +} + +static int +get_type_size_macro (MonoTypeEnum type) +{ + switch (type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return TYPE_I8; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return TYPE_I16; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return TYPE_I32; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return TYPE_I64; + case MONO_TYPE_I: + case MONO_TYPE_U: +#if TARGET_SIZEOF_VOID_P == 8 + return TYPE_I64; +#else + return TYPE_I32; +#endif + case MONO_TYPE_R4: + return TYPE_F32; + case MONO_TYPE_R8: + return TYPE_F64; + default: + g_assert_not_reached (); + } +} + /* The maximum length is 2 instructions */ static guint8* emit_imm (guint8 *code, int dreg, int imm) @@ -962,6 +1021,33 @@ emit_xextract_r8 (guint8* code, int dreg, int sreg1, int sreg2) return ret; } /* emit_xinsert_i8_r8: when dreg differs from src_reg the vector is first copied into dreg and extra_code becomes 4. arm_cbnzw then branches on index_reg to code + 12 + extra_code, and arm_b branches to code + 16 + extra_code. Assuming every macro emits one 4-byte instruction (the size note at the end implies this), a zero index runs the lane 0 insert and skips the lane 1 insert, and a nonzero index runs only the lane 1 insert - TODO confirm against the macro definitions. The float variant uses arm_neon_ins_e and the integer variant arm_neon_ins_g, both with TYPE_I64. Returns the advanced code pointer. */ +static guint8* +emit_xinsert_i8_r8 (guint8* code, MonoTypeEnum type, int dreg, int src_reg, int repl_reg, int index_reg) +{ + guint8* ret = code; + gboolean is_float = is_type_float_macro (type); + int extra_code = 0; + + if (dreg != src_reg) { + arm_neon_mov (ret, dreg, src_reg); + extra_code = 4; + } + + arm_cbnzw (ret, index_reg, code + 12 + extra_code); + + if (is_float) { + arm_neon_ins_e (ret, TYPE_I64, dreg, repl_reg, 0, 0); + arm_b (ret, code + 16 + extra_code); + arm_neon_ins_e (ret, TYPE_I64, dreg, repl_reg, 1, 0); + } else { + arm_neon_ins_g (ret, TYPE_I64, dreg, repl_reg, 0); + arm_b (ret, code + 16 + extra_code); + arm_neon_ins_g (ret, TYPE_I64, dreg, repl_reg, 1); + } + + return ret; // max. 
5 instructions generated = 20 Bytes +} + static guint8* emit_call (MonoCompile *cfg, guint8* code, MonoJumpInfoType patch_type, gconstpointer data) { @@ -3496,72 +3582,13 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset) return code; } /* This hunk deletes the old copies of the four helpers and adds sreg3 to the locals of mono_arch_output_basic_block. */ -static gboolean -is_type_float_macro (MonoTypeEnum type) -{ - return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8); -} - -static gboolean -is_type_unsigned_macro (MonoTypeEnum type) -{ - return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8); -} - -static int -get_vector_size_macro (MonoInst *ins) -{ - g_assert (ins->klass); - int size = mono_class_value_size (ins->klass, NULL); - switch (size) { - case 16: - return VREG_FULL; - case 8: - return VREG_LOW; - default: - g_assert_not_reached (); - } -} - -static int -get_type_size_macro (MonoTypeEnum type) -{ - switch (type) { - case MONO_TYPE_I1: - case MONO_TYPE_U1: - return TYPE_I8; - case MONO_TYPE_I2: - case MONO_TYPE_U2: - return TYPE_I16; - case MONO_TYPE_I4: - case MONO_TYPE_U4: - return TYPE_I32; - case MONO_TYPE_I8: - case MONO_TYPE_U8: - return TYPE_I64; - case MONO_TYPE_I: - case MONO_TYPE_U: -#if TARGET_SIZEOF_VOID_P == 8 - return TYPE_I64; -#else - return TYPE_I32; -#endif - case MONO_TYPE_R4: - return TYPE_F32; - case MONO_TYPE_R8: - return TYPE_F64; - default: - g_assert_not_reached (); - } -} - void mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) { MonoInst *ins; MonoCallInst *call; guint8 *code = cfg->native_code + cfg->code_len; - int start_offset, max_len, dreg, sreg1, sreg2; + int start_offset, max_len, dreg, sreg1, sreg2, sreg3; target_mgreg_t imm; if (cfg->verbose_level > 2) @@ -3588,6 +3615,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) dreg = ins->dreg; sreg1 = ins->sreg1; sreg2 = ins->sreg2; + sreg3 = ins->sreg3; imm = ins->inst_imm; if (opcode_simd_status [ins->opcode - OP_START] == OPCODE_SIMD) @@ -3869,7 +3897,6 @@ 
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XEXTRACT_I8: code = emit_xextract_i8 (code, dreg, sreg1, sreg2); break; - case OP_XEXTRACT_R8: code = emit_xextract_r8 (code, dreg, sreg1, sreg2); break; @@ -3881,12 +3908,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) g_assert_not_reached (); break; /* Register-index insert: only the I8 and R8 opcodes are emitted; I1, I2, I4 and R4 hit g_assert_not_reached. */ + case OP_XINSERT_I1: + case OP_XINSERT_I2: + case OP_XINSERT_I4: + case OP_XINSERT_R4: + g_assert_not_reached(); + break; + case OP_XINSERT_I8: + code = emit_xinsert_i8_r8 (code, MONO_TYPE_I8, dreg, sreg1, sreg2, sreg3); + break; + case OP_XINSERT_R8: + code = emit_xinsert_i8_r8 (code, MONO_TYPE_R8, dreg, sreg1, sreg2, sreg3); + break; + case OP_INSERT_I1: case OP_INSERT_I2: case OP_INSERT_I4: case OP_INSERT_I8: { const int t = get_type_size_macro (ins->inst_c1); - arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0); /* With the new operand layout sreg1 is the source vector and sreg2 the value to insert, so the vector is copied into dreg first when the two registers differ. The same change is applied to the R4 and R8 case below. */ + if (dreg != sreg1) + arm_neon_mov (code, dreg, sreg1); + + arm_neon_ins_g(code, t, dreg, sreg2, ins->inst_c0); break; } case OP_INSERT_R4: @@ -3900,7 +3943,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) t = SIZE_8; break; } - arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0); + + if (dreg != sreg1) + arm_neon_mov (code, dreg, sreg1); + + arm_neon_ins_e(code, t, dreg, sreg2, ins->inst_c0, 0); break; } case OP_ARM64_XTN: diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 6e6afa17eb34e..c3bb151d5ceb2 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1361,7 +1361,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_Shuffle: case SN_ToVector128: case SN_ToVector128Unsafe: - case SN_WithElement: return NULL; default: break; @@ -2043,13 +2042,61 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi elems = 4; } + if (args [1]->opcode == OP_ICONST) { + // If the index is provably a constant, we can generate vastly better 
code. + int index = args[1]->inst_c0; + + if (index < 0 || index >= elems) { + MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); + MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); + } /* NOTE(review): when the constant index is out of range the range check is emitted, and the insert op below is still created with that same index in inst_c0 - confirm the backend tolerates an out-of-range lane number here. */ + + int insert_op = type_to_insert_op (arg0_type); + MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); + ins->inst_c0 = index; + ins->inst_c1 = arg0_type; + return ins; + } + + if (!COMPILE_LLVM(cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) { + return NULL; + } + MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - int insert_op = type_to_xinsert_op (arg0_type); - MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); - ins->sreg3 = args [1]->dreg; - ins->inst_c1 = arg0_type; - return ins; /* Variable index: the xinsert op, with the index register in sreg3, is used under LLVM or when type_to_width_log2 yields 3 (presumably 8-byte elements - confirm); every other case takes the else branch. */ + + if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) { + int insert_op = type_to_xinsert_op (arg0_type); + MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); + ins->sreg3 = args [1]->dreg; + ins->inst_c1 = arg0_type; + return ins; + } else { + // Create a blank reg and spill it. + // Overwrite memory with original value. + // Overwrite [spilled + index << elem_size_log2] with replacement value + // Read back. 
+ // TODO: on x86, use a LEA + MonoInst* scratch = emit_xzero (cfg, args [0]->klass); + MonoInst* scratcha; + NEW_VARLOADA_VREG (cfg, scratcha, scratch->dreg, fsig->params [0]); + MONO_ADD_INS (cfg->cbb, scratcha); + MONO_EMIT_NEW_STORE_MEMBASE (cfg, mono_type_to_store_membase (cfg, fsig->params [0]), scratcha->dreg, 0, args [0]->dreg); + + int offset_reg = alloc_lreg (cfg); + MONO_EMIT_NEW_BIALU_IMM (cfg, OP_SHL_IMM, offset_reg, args [1]->dreg, type_to_width_log2 (arg0_type)); + int addr_reg = alloc_preg (cfg); + MONO_EMIT_NEW_BIALU(cfg, OP_PADD, addr_reg, scratcha->dreg, offset_reg); + + MONO_EMIT_NEW_STORE_MEMBASE (cfg, mono_type_to_store_membase (cfg, fsig->params [2]), addr_reg, 0, args [2]->dreg); + + MonoInst* ret; + NEW_LOAD_MEMBASE (cfg, ret, mono_type_to_load_membase (cfg, fsig->ret), scratch->dreg, scratcha->dreg, 0); + MONO_ADD_INS (cfg->cbb, ret); + + return ret; + } + break; } case SN_WidenLower: case SN_WidenUpper: {