From 7575d26bbb7de01143d610d8944830211ceab126 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Wed, 15 Mar 2023 14:14:08 +0100 Subject: [PATCH 01/10] xxAny, xAll comparisons in progress. --- src/mono/mono/mini/mini-arm64.c | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 97b2d1dd6f4d5..82cdf0fb01119 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -36,6 +36,8 @@ #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__)) #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2 #define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1 +#define OPFMT_WTDSI _w, _t, _dreg, sreg1, _i +#define OPFMT_TDSI _t, _dreg, sreg1, _i #define _UNDEF(...) g_assert_not_reached () #define SIMD_OP_CODE(reg_w, op, c) ((reg_w << 31) | (op) << 16 | (c)) #define VREG_64 VREG_LOW @@ -3349,6 +3351,27 @@ emit_move_return_value (MonoCompile *cfg, guint8 * code, MonoInst *ins) return code; } +static guint8* +emit_idiom_xextrmask_i8 (guint8* code, int mode, int dreg, int sreg1) +{ + switch (mode) { + case SIMD_EXTRMASKL_FAST16: + arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG, sreg1, 0); + arm_neon_umov_d (code, dreg, FP_TEMP_REG, 0); + break; + case SIMD_EXTRMASKL_FAST8: + arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG, sreg1, 8); + arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG2, sreg1, 0); + arm_neon_sli (code, VREG_LOW, TYPE_I8, FP_TEMP_REG2, FP_TEMP_REG, 4); + arm_neon_umov_d (code, dreg, FP_TEMP_REG, 0); + break; + default; + g_assert_not_reached (); + } + + return code; +} + /* * emit_branch_island: * @@ -3485,6 +3508,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) const int _t = get_type_size_macro (ins->inst_c1); const gboolean _f = is_type_float_macro (ins->inst_c1); const int _w = get_vector_size_macro (ins); + const int _i = ins->backend.shift_amount; #undef SIMD_OP #define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) 
\ @@ -3502,6 +3526,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) goto after_instruction_emit; } + here: switch (ins->opcode) { case OP_ICONST: code = emit_imm (code, dreg, ins->inst_c0); @@ -3754,6 +3779,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XZERO: arm_neon_eor_16b (code, dreg, dreg, dreg); break; + case OP_EXTRACT_I1: + arm_neon_umov_b (code, dreg, sreg1, ins->inst_c0); + break; + case OP_EXTRACT_I2: + arm_neon_umov_h (code, dreg, sreg1, ins->inst_c0); + break; + case OP_EXTRACT_I4: + arm_neon_umov_s (code, dreg, sreg1, ins->inst_c0); + break; + case OP_EXTRACT_I8: + arm_neon_umov_d (code, dreg, sreg1, ins->inst_c0); + break; + case OP_XETRMASK_I8: + code = emit_idiom_xextrmask_i8 (code, ins->inst_c0, dreg, sreg1); + break; /* ALU */ case OP_IADD: From f317b0af848e27ec686df1ec2bea6db3a2243068 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Wed, 15 Mar 2023 14:14:50 +0100 Subject: [PATCH 02/10] xxAny, xxAll comparisons, part 2. --- src/mono/mono/arch/arm64/arm64-codegen.h | 46 ++++++++++-------------- src/mono/mono/mini/mini-ops.h | 5 +++ src/mono/mono/mini/simd-intrinsics.c | 11 +++--- src/mono/sample/HelloWorld/Program.cs | 12 +++++++ 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index 706b8e3397207..02f3b5e8b07c3 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1139,19 +1139,16 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_dup_g_4s(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) #define arm_neon_dup_g_2d(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) -// the opcode is smov, but we define variants smovs and smovd by whether they fill a 32 or 64-bit reg. 
-#define arm_neon_smovs_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) -#define arm_neon_smovs_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smovd_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) -#define arm_neon_smovd_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smovd_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) - -// the opcode is umov, but we define variants smovs and smovd by whether they fill a 32 or 64-bit reg. -#define arm_neon_umovs_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) -#define arm_neon_umovs_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) -#define arm_neon_umovd_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) -#define arm_neon_umovd_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) -#define arm_neon_umovd_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) +#define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) +#define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) +#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) +#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) + +#define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) +#define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 
| ((index) << 2), 0b0111, (rd), (rn)) +#define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) +#define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b01000 | ((index) << 4), 0b0111, (rd), (rn)) + /* NEON :: 3-register same FP16 */ // TODO @@ -2316,6 +2313,15 @@ arm_encode_arith_imm (int imm, guint32 *shift) arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \ } while (0) +#define arm_neon_shimm_shl_immh_immb(size, shift) (((shift) + (8 << (size))) & 0b01111111) +#define arm_neon_shimm_shl_opcode(p, q, u, size, opcode, rd, rn, shift) do { \ + int32_t ___temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \ + arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \ +} while (0) + +#define arm_neon_sli(p, width, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn), (shift)) +#define arm_neon_shrn(p, type, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, (type), 0b10000, (rd), (rn), (shift)) + #define arm_neon_sshr_8b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift)) #define arm_neon_sshr_16b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift)) #define arm_neon_sshr_4h(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b00000, (rd), (rn), (shift)) @@ -2348,12 +2354,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_srsra_4s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b00110, (rd), (rn), (shift)) #define arm_neon_srsra_2d(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b00110, (rd), (rn), (shift)) -#define arm_neon_shimm_shl_immh_immb(size, shift) (((shift) + (8 << (size))) & 0b01111111) -#define 
arm_neon_shimm_shl_opcode(p, q, u, size, opcode, rd, rn, shift) do { \ - int32_t ___temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \ - arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \ -} while (0) - #define arm_neon_shl_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b01010, (rd), (rn), (shift)) #define arm_neon_shl_16b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b01010, (rd), (rn), (shift)) #define arm_neon_shl_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b01010, (rd), (rn), (shift)) @@ -2457,14 +2457,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_sri_4s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b01000, (rd), (rn), (shift)) #define arm_neon_sri_2d(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_8, 0b01000, (rd), (rn), (shift)) -#define arm_neon_sli_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_16b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_8h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_4s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b01010, (rd), (rn), (shift)) -#define arm_neon_sli_2d(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_8, 0b01010, (rd), (rn), (shift)) - #define arm_neon_sqshlu_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b01100, 
(rd), (rn), (shift)) #define arm_neon_sqshlu_16b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b01100, (rd), (rn), (shift)) #define arm_neon_sqshlu_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b01100, (rd), (rn), (shift)) diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 020e446a4bf78..50f18948276b1 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1479,6 +1479,10 @@ MINI_OP(OP_XCOMPARE_SCALAR, "xcompare_scalar", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP_SCALAR, "xcompare_fp_scalar", XREG, XREG, XREG) +/* Extract mask from XREG into LREG. + * inst_c0 - specific instruction, one of SIMD_EXTRMASKL_... */ +MINI_OP(OP_XEXTRMASK_I8, "xextrmask_i8", LREG, XREG, NONE) + /* * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation. */ @@ -1486,6 +1490,7 @@ MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG) MINI_OP(OP_XBINOP_FORCEINT, "xbinop_forceint", XREG, XREG, XREG) MINI_OP(OP_XBINOP_SCALAR, "xbinop_scalar", XREG, XREG, XREG) MINI_OP(OP_XBINOP_BYSCALAR, "xbinop_byscalar", XREG, XREG, XREG) + /* inst_c0 contains an INTRINS_ enum, inst_c1 might contain additional data */ MINI_OP(OP_XOP, "xop", NONE, NONE, NONE) MINI_OP(OP_XOP_X_I, "xop_x_i", XREG, IREG, NONE) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 5054570abc438..bc6432655ba2f 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -19,7 +19,7 @@ #include #include -#if defined (MONO_ARCH_SIMD_INTRINSICS) +#if TRUE//defined (MONO_ARCH_SIMD_INTRINSICS) #if defined(DISABLE_JIT) @@ -1190,10 +1190,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 -#ifdef TARGET_ARM64 - if 
(!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) - return NULL; -#endif +//#ifdef TARGET_ARM64 +// if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) +// return NULL; +//#endif int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); if (id == -1) { @@ -1216,6 +1216,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_LessThanOrEqual: case SN_Negate: case SN_OnesComplement: + case SN_GreaterThanAny: break; default: return NULL; diff --git a/src/mono/sample/HelloWorld/Program.cs b/src/mono/sample/HelloWorld/Program.cs index 0a65da4203a6d..76bd808a13cfd 100644 --- a/src/mono/sample/HelloWorld/Program.cs +++ b/src/mono/sample/HelloWorld/Program.cs @@ -2,11 +2,21 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; namespace HelloWorld { internal class Program { + [MethodImpl(MethodImplOptions.NoInlining)] + private static int X() + { + Vector128 a = Vector128.Create(2,2,3,4); + Vector128 b = Vector128.Create(1,2,3,4); + return Vector128.GreaterThanAny(a,b) ? 1 : 0; + } + private static void Main(string[] args) { bool isMono = typeof(object).Assembly.GetType("Mono.RuntimeStructs") != null; @@ -14,6 +24,8 @@ private static void Main(string[] args) Console.WriteLine(typeof(object).Assembly.FullName); Console.WriteLine(System.Reflection.Assembly.GetEntryAssembly ()); Console.WriteLine(System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription); + + Console.WriteLine(X().ToString()); } } } From 3354214dfdcf0d16f704b7e40e1f5feb02a0ca1f Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Thu, 16 Mar 2023 15:13:05 +0100 Subject: [PATCH 03/10] [mono][jit] Adding compare all/any operations. Fixed umov,smov macros. 
--- src/mono/mono/arch/arm64/arm64-codegen.h | 31 +++----- src/mono/mono/mini/cpu-arm64.mdesc | 1 + src/mono/mono/mini/mini-arm64.c | 26 +++---- src/mono/mono/mini/mini-ops.h | 6 +- src/mono/mono/mini/mini.h | 5 ++ src/mono/mono/mini/simd-intrinsics.c | 93 +++++++++++++++--------- 6 files changed, 89 insertions(+), 73 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index 02f3b5e8b07c3..e51448c58a70a 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1139,15 +1139,15 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_dup_g_4s(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) #define arm_neon_dup_g_2d(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) -#define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) -#define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) -#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) +#define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) +#define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) +#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) +#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) -#define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) -#define arm_neon_umov_h(p, rd, rn, index) 
arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) -#define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) -#define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b01000 | ((index) << 4), 0b0111, (rd), (rn)) +#define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) +#define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) +#define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) +#define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0111, (rd), (rn)) /* NEON :: 3-register same FP16 */ @@ -1573,6 +1573,9 @@ arm_encode_arith_imm (int imm, guint32 *shift) /* NEON :: across lanes */ #define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn)) +#define arm_neon_umaxv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn)) +#define arm_neon_uminv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn)) + // contrary to most other opcodes, the suffix is the type of source #define arm_neon_saddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00011, (rd), (rn)) #define arm_neon_saddlv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00011, (rd), (rn)) @@ -1606,18 +1609,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_uaddlv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b00011, (rd), (rn)) #define arm_neon_uaddlv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b00011, (rd), (rn)) -#define arm_neon_umaxv_8b(p, rd, rn) arm_neon_xln_opcode 
((p), VREG_LOW, 0b1, SIZE_1, 0b01010, (rd), (rn)) -#define arm_neon_umaxv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b01010, (rd), (rn)) -#define arm_neon_umaxv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b01010, (rd), (rn)) -#define arm_neon_umaxv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b01010, (rd), (rn)) -#define arm_neon_umaxv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b01010, (rd), (rn)) - -#define arm_neon_uminv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn)) -#define arm_neon_uminv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn)) -#define arm_neon_uminv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b11010, (rd), (rn)) -#define arm_neon_uminv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn)) -#define arm_neon_uminv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b11010, (rd), (rn)) - #define arm_neon_fmaxnmv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b01100, (rd), (rn)) #define arm_neon_fmaxv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b01111, (rd), (rn)) #define arm_neon_fminnmv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_1, 0b01100, (rd), (rn)) diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index 0529aef1b605d..d031a32b7cfed 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -503,6 +503,7 @@ xcompare: dest:x src1:x src2:x len:4 xcompare_fp: dest:x src1:x src2:x len:4 negate: dest:x src1:x len:4 ones_complement: dest:x src1:x len:4 +xextract: dest:i src1:x len:8 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 82cdf0fb01119..0460c28771ccd 100644 --- 
a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3352,20 +3352,18 @@ emit_move_return_value (MonoCompile *cfg, guint8 * code, MonoInst *ins) } static guint8* -emit_idiom_xextrmask_i8 (guint8* code, int mode, int dreg, int sreg1) +emit_xextract (guint8* code, int mode, int dreg, int sreg1) { switch (mode) { - case SIMD_EXTRMASKL_FAST16: - arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG, sreg1, 0); - arm_neon_umov_d (code, dreg, FP_TEMP_REG, 0); + case SIMD_EXTR_MAX8: + arm_neon_umaxv (code, VREG_FULL, TYPE_I8, FP_TEMP_REG, sreg1); + arm_neon_umov_b (code, dreg, FP_TEMP_REG, 0); break; - case SIMD_EXTRMASKL_FAST8: - arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG, sreg1, 8); - arm_neon_shrn (code, TYPE_I8, FP_TEMP_REG2, sreg1, 0); - arm_neon_sli (code, VREG_LOW, TYPE_I8, FP_TEMP_REG2, FP_TEMP_REG, 4); - arm_neon_umov_d (code, dreg, FP_TEMP_REG, 0); + case SIMD_EXTR_MIN8: + arm_neon_uminv (code, VREG_FULL, TYPE_I8, FP_TEMP_REG, sreg1); + arm_neon_umov_b (code, dreg, FP_TEMP_REG, 0); break; - default; + default: g_assert_not_reached (); } @@ -3508,7 +3506,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) const int _t = get_type_size_macro (ins->inst_c1); const gboolean _f = is_type_float_macro (ins->inst_c1); const int _w = get_vector_size_macro (ins); - const int _i = ins->backend.shift_amount; #undef SIMD_OP #define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) \ @@ -3525,8 +3522,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) goto after_instruction_emit; } - - here: + switch (ins->opcode) { case OP_ICONST: code = emit_imm (code, dreg, ins->inst_c0); @@ -3791,8 +3787,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_EXTRACT_I8: arm_neon_umov_d (code, dreg, sreg1, ins->inst_c0); break; - case OP_XETRMASK_I8: - code = emit_idiom_xextrmask_i8 (code, ins->inst_c0, dreg, sreg1); + case OP_XEXTRACT: + code = emit_xextract (code, ins->inst_c0, dreg, 
sreg1); break; /* ALU */ diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 50f18948276b1..1303bfebf9ff8 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1479,9 +1479,9 @@ MINI_OP(OP_XCOMPARE_SCALAR, "xcompare_scalar", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP_SCALAR, "xcompare_fp_scalar", XREG, XREG, XREG) -/* Extract mask from XREG into LREG. - * inst_c0 - specific instruction, one of SIMD_EXTRMASKL_... */ -MINI_OP(OP_XEXTRMASK_I8, "xextrmask_i8", LREG, XREG, NONE) +/* Extract from XREG into IREG. + * inst_c0 - specific instruction, one of SIMD_EXTR_... */ +MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE) /* * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation. diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 6a4680768baf2..9228787bae15f 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2926,6 +2926,11 @@ enum { SIMD_PREFETCH_MODE_2, }; +enum { + SIMD_EXTR_MAX8, // extract unsigned maximum of all bytes in XREG into an IREG, good for "is any byte nonzero" + SIMD_EXTR_MIN8 // extract unsigned minumum of all bytes in XREG into an IREG, good for "are all bytes nonzero" +}; + int mini_primitive_type_size (MonoTypeEnum type); MonoTypeEnum mini_get_simd_type_info (MonoClass *klass, guint32 *nelems); diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index bc6432655ba2f..799797125f219 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -19,7 +19,7 @@ #include #include -#if TRUE//defined (MONO_ARCH_SIMD_INTRINSICS) +#if defined (MONO_ARCH_SIMD_INTRINSICS) #if defined(DISABLE_JIT) @@ -1201,7 +1201,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; } - if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256")) + if (!strcmp 
(m_class_get_name (cfg->method->klass), "Vector256") || !strcmp (m_class_get_name (cfg->method->klass), "Vector512")) return NULL; // TODO: Fix Vector256.WithUpper/WithLower // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 @@ -1216,7 +1216,16 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_LessThanOrEqual: case SN_Negate: case SN_OnesComplement: + case SN_EqualsAny: case SN_GreaterThanAny: + case SN_GreaterThanOrEqualAny: + case SN_LessThanAny: + case SN_LessThanOrEqualAny: + case SN_EqualsAll: + case SN_GreaterThanAll: + case SN_GreaterThanOrEqualAll: + case SN_LessThanAll: + case SN_LessThanOrEqualAll: break; default: return NULL; @@ -1472,18 +1481,26 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_element_type_primitive (fsig->params [0])) return NULL; MonoClass *arg_class = mono_class_from_mono_type_internal (fsig->params [0]); - switch (id) { - case SN_Equals: - return emit_xcompare (cfg, klass, arg0_type, args [0], args [1]); - case SN_EqualsAll: - return emit_xequal (cfg, arg_class, args [0], args [1]); - case SN_EqualsAny: { - MonoInst *cmp_eq = emit_xcompare (cfg, arg_class, arg0_type, args [0], args [1]); - MonoInst *zero = emit_xzero (cfg, arg_class); - return emit_not_xequal (cfg, arg_class, cmp_eq, zero); + if (id == SN_Equals) + return emit_xcompare (cfg, klass, arg0_type, args [0], args [1]); + + if (COMPILE_LLVM (cfg)) { + switch (id) { + case SN_EqualsAll: + return emit_xequal (cfg, arg_class, args [0], args [1]); + case SN_EqualsAny: { + MonoInst *cmp_eq = emit_xcompare (cfg, arg_class, arg0_type, args [0], args [1]); + MonoInst *zero = emit_xzero (cfg, arg_class); + return emit_not_xequal (cfg, arg_class, cmp_eq, zero); + } } - default: g_assert_not_reached (); + } else { + MonoInst* cmp = emit_xcompare (cfg, arg_class, arg0_type, args [0], args [1]); + MonoInst* ret = emit_simd_ins (cfg, 
mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); + ret->inst_c0 = (id == SN_EqualsAll) ? SIMD_EXTR_MIN8 : SIMD_EXTR_MAX8; + return ret; } + g_assert_not_reached (); } case SN_ExtractMostSignificantBits: { if (!is_element_type_primitive (fsig->params [0]) || type_enum_is_float (arg0_type)) @@ -1551,34 +1568,39 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi fsig->ret->type == MONO_TYPE_BOOLEAN && mono_metadata_type_equal (fsig->params [0], fsig->params [1])); - MonoInst *cmp = emit_xcompare_for_intrinsic (cfg, klass, id, arg0_type, args [0], args [1]); - MonoClass *arg_class = mono_class_from_mono_type_internal (fsig->params [0]); - + gboolean is_all = FALSE; switch (id) { case SN_GreaterThanAll: case SN_GreaterThanOrEqualAll: case SN_LessThanAll: - case SN_LessThanOrEqualAll: { - // for floating point numbers all ones is NaN and so - // they must be treated differently than integer types - if (type_enum_is_float (arg0_type)) { + case SN_LessThanOrEqualAll: + is_all = TRUE; + break; + } + + MonoClass *arg_class = mono_class_from_mono_type_internal (fsig->params [0]); + if (COMPILE_LLVM (cfg)) { + MonoInst *cmp = emit_xcompare_for_intrinsic (cfg, klass, id, arg0_type, args [0], args [1]); + if (is_all) { + // for floating point numbers all ones is NaN and so + // they must be treated differently than integer types + if (type_enum_is_float (arg0_type)) { + MonoInst *zero = emit_xzero (cfg, arg_class); + MonoInst *inverted_cmp = emit_xcompare (cfg, klass, arg0_type, cmp, zero); + return emit_xequal (cfg, arg_class, inverted_cmp, zero); + } + + MonoInst *ones = emit_xones (cfg, arg_class); + return emit_xequal (cfg, arg_class, cmp, ones); + } else { MonoInst *zero = emit_xzero (cfg, arg_class); - MonoInst *inverted_cmp = emit_xcompare (cfg, klass, arg0_type, cmp, zero); - return emit_xequal (cfg, arg_class, inverted_cmp, zero); + return emit_not_xequal (cfg, arg_class, cmp, zero); } - - MonoInst *ones = emit_xones (cfg, 
arg_class); - return emit_xequal (cfg, arg_class, cmp, ones); - } - case SN_GreaterThanAny: - case SN_GreaterThanOrEqualAny: - case SN_LessThanAny: - case SN_LessThanOrEqualAny: { - MonoInst *zero = emit_xzero (cfg, arg_class); - return emit_not_xequal (cfg, arg_class, cmp, zero); - } - default: - g_assert_not_reached (); + } else { + MonoInst *cmp = emit_xcompare_for_intrinsic (cfg, arg_class, id, arg0_type, args [0], args [1]); + MonoInst* ret = emit_simd_ins (cfg, mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); + ret->inst_c0 = is_all ? SIMD_EXTR_MIN8 : SIMD_EXTR_MAX8; + return ret; } } case SN_Narrow: { @@ -1873,7 +1895,8 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #if defined(TARGET_AMD64) || defined(TARGET_WASM) if (!COMPILE_LLVM (cfg)) - return NULL; + return NULL; + } #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 From c239967350ee2e9aff91c468d02e4e167d9aff9a Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Thu, 16 Mar 2023 15:37:34 +0100 Subject: [PATCH 04/10] Removed superfluous changes. --- src/mono/mono/mini/mini-arm64.c | 14 -------------- src/mono/sample/HelloWorld/Program.cs | 14 +------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 94c65386ef36e..0599760673cae 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -37,8 +37,6 @@ #define OPFMT_WDSS _w, dreg, sreg1, sreg2 #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2 #define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1 -#define OPFMT_WTDSI _w, _t, _dreg, sreg1, _i -#define OPFMT_TDSI _t, _dreg, sreg1, _i #define _UNDEF(...) 
g_assert_not_reached () #define SIMD_OP_CODE(reg_w, op, c) ((reg_w << 31) | (op) << 16 | (c)) #define VREG_64 VREG_LOW @@ -3776,18 +3774,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XZERO: arm_neon_eor_16b (code, dreg, dreg, dreg); break; - case OP_EXTRACT_I1: - arm_neon_umov_b (code, dreg, sreg1, ins->inst_c0); - break; - case OP_EXTRACT_I2: - arm_neon_umov_h (code, dreg, sreg1, ins->inst_c0); - break; - case OP_EXTRACT_I4: - arm_neon_umov_s (code, dreg, sreg1, ins->inst_c0); - break; - case OP_EXTRACT_I8: - arm_neon_umov_d (code, dreg, sreg1, ins->inst_c0); - break; case OP_XEXTRACT: code = emit_xextract (code, ins->inst_c0, dreg, sreg1); break; diff --git a/src/mono/sample/HelloWorld/Program.cs b/src/mono/sample/HelloWorld/Program.cs index 76bd808a13cfd..6d0b31d30dc6f 100644 --- a/src/mono/sample/HelloWorld/Program.cs +++ b/src/mono/sample/HelloWorld/Program.cs @@ -2,21 +2,11 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; namespace HelloWorld { internal class Program { - [MethodImpl(MethodImplOptions.NoInlining)] - private static int X() - { - Vector128 a = Vector128.Create(2,2,3,4); - Vector128 b = Vector128.Create(1,2,3,4); - return Vector128.GreaterThanAny(a,b) ? 
1 : 0; - } - private static void Main(string[] args) { bool isMono = typeof(object).Assembly.GetType("Mono.RuntimeStructs") != null; @@ -24,8 +14,6 @@ private static void Main(string[] args) Console.WriteLine(typeof(object).Assembly.FullName); Console.WriteLine(System.Reflection.Assembly.GetEntryAssembly ()); Console.WriteLine(System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription); - - Console.WriteLine(X().ToString()); } } -} +} \ No newline at end of file From 6234ae4bc9f5e5b852c2f1926754a4f2ae2f9e59 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Thu, 16 Mar 2023 15:40:00 +0100 Subject: [PATCH 05/10] Restored newline at the end of HelloWorld. --- src/mono/sample/HelloWorld/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/sample/HelloWorld/Program.cs b/src/mono/sample/HelloWorld/Program.cs index 6d0b31d30dc6f..0a65da4203a6d 100644 --- a/src/mono/sample/HelloWorld/Program.cs +++ b/src/mono/sample/HelloWorld/Program.cs @@ -16,4 +16,4 @@ private static void Main(string[] args) Console.WriteLine(System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription); } } -} \ No newline at end of file +} From 3b75c85f3bd57b7af54de6d0c2d7163c5fdaef1e Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Thu, 16 Mar 2023 15:41:30 +0100 Subject: [PATCH 06/10] Fixed unmatched brace. 
--- src/mono/mono/mini/simd-intrinsics.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index fad76160ff06b..c8b2ea327380c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1900,7 +1900,6 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #if defined(TARGET_AMD64) || defined(TARGET_WASM) if (!COMPILE_LLVM (cfg)) return NULL; - } #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 From cc0e0974aff185e59a460c2088419cf324fb0e00 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Thu, 16 Mar 2023 15:43:09 +0100 Subject: [PATCH 07/10] Indentation. --- src/mono/mono/mini/simd-intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index c8b2ea327380c..b9ac0f88b8170 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1899,7 +1899,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #if defined(TARGET_AMD64) || defined(TARGET_WASM) if (!COMPILE_LLVM (cfg)) - return NULL; + return NULL; #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 From 149a797d125ed776a099f4019884375ea37fd6ad Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Fri, 17 Mar 2023 14:01:34 +0100 Subject: [PATCH 08/10] Normalized boolean values to 0/1. SIMD_EXTR_ constants have friendlier names. Equality/Inequality are now also intrinsics. 
--- src/mono/mono/mini/cpu-arm64.mdesc | 2 +- src/mono/mono/mini/mini-arm64.c | 14 ++++++++------ src/mono/mono/mini/mini.h | 4 ++-- src/mono/mono/mini/simd-intrinsics.c | 26 ++++++++++++++++++-------- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index fba5341dbb1ab..5e0b36df24ff9 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -503,7 +503,7 @@ xcompare: dest:x src1:x src2:x len:4 xcompare_fp: dest:x src1:x src2:x len:4 negate: dest:x src1:x len:4 ones_complement: dest:x src1:x len:4 -xextract: dest:i src1:x len:8 +xextract: dest:i src1:x len:12 xbinop_forceint: dest:x src1:x src2:x len:4 generic_class_init: src1:a len:44 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 0599760673cae..9986f7dfbaeb1 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3351,16 +3351,18 @@ emit_move_return_value (MonoCompile *cfg, guint8 * code, MonoInst *ins) } static guint8* -emit_xextract (guint8* code, int mode, int dreg, int sreg1) +emit_xextract (guint8* code, int width, int mode, int dreg, int sreg1) { switch (mode) { - case SIMD_EXTR_MAX8: - arm_neon_umaxv (code, VREG_FULL, TYPE_I8, FP_TEMP_REG, sreg1); + case SIMD_EXTR_IS_ANY_SET: + arm_neon_umaxv (code, width, TYPE_I8, FP_TEMP_REG, sreg1); arm_neon_umov_b (code, dreg, FP_TEMP_REG, 0); + arm_lsrw(code, dreg, dreg, 7); // dreg contains 0xff for TRUE or 0x0 for FALSE, normalize to 0x1/0x0 break; - case SIMD_EXTR_MIN8: - arm_neon_uminv (code, VREG_FULL, TYPE_I8, FP_TEMP_REG, sreg1); + case SIMD_EXTR_ARE_ALL_SET: + arm_neon_uminv (code, width, TYPE_I8, FP_TEMP_REG, sreg1); arm_neon_umov_b (code, dreg, FP_TEMP_REG, 0); + arm_lsrw(code, dreg, dreg, 7); break; default: g_assert_not_reached (); @@ -3775,7 +3777,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) arm_neon_eor_16b (code, dreg, dreg, dreg); 
break; case OP_XEXTRACT: - code = emit_xextract (code, ins->inst_c0, dreg, sreg1); + code = emit_xextract (code, (ins->inst_c1 == 16) ? VREG_FULL : VREG_LOW, ins->inst_c0, dreg, sreg1); break; /* ALU */ diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 6f14f2ac855bb..943242eadb494 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2934,8 +2934,8 @@ enum { }; enum { - SIMD_EXTR_MAX8, // extract unsigned maximum of all bytes in XREG into an IREG, good for "is any byte nonzero" - SIMD_EXTR_MIN8 // extract unsigned minumum of all bytes in XREG into an IREG, good for "are all bytes nonzero" + SIMD_EXTR_IS_ANY_SET, + SIMD_EXTR_ARE_ALL_SET }; int mini_primitive_type_size (MonoTypeEnum type); diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index b9ac0f88b8170..f68ceac06dcf0 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -509,11 +509,17 @@ static MonoInst* emit_xequal (MonoCompile *cfg, MonoClass *klass, MonoInst *arg1, MonoInst *arg2) { #ifdef TARGET_ARM64 - int size = mono_class_value_size (klass, NULL); - if (size == 16) + if (!COMPILE_LLVM (cfg)) { + MonoInst* cmp = emit_xcompare (cfg, arg1->klass, arg1->type, arg1, arg2); + MonoInst* ret = emit_simd_ins (cfg, mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); + ret->inst_c0 = SIMD_EXTR_ARE_ALL_SET; + ret->inst_c1 = mono_class_value_size (klass, NULL); + return ret; + } else if (mono_class_value_size (klass, NULL) == 16) { return emit_simd_ins (cfg, klass, OP_XEQUAL_ARM64_V128_FAST, arg1->dreg, arg2->dreg); - else + } else { return emit_simd_ins (cfg, klass, OP_XEQUAL, arg1->dreg, arg2->dreg); + } #else MonoInst *ins = emit_simd_ins (cfg, klass, OP_XEQUAL, arg1->dreg, arg2->dreg); if (!COMPILE_LLVM (cfg)) @@ -1202,8 +1208,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256") || 
!strcmp (m_class_get_name (cfg->method->klass), "Vector512")) - return NULL; // TODO: Fix Vector256.WithUpper/WithLower - + return NULL; + // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg)) { @@ -1501,7 +1507,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } else { MonoInst* cmp = emit_xcompare (cfg, arg_class, arg0_type, args [0], args [1]); MonoInst* ret = emit_simd_ins (cfg, mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); - ret->inst_c0 = (id == SN_EqualsAll) ? SIMD_EXTR_MIN8 : SIMD_EXTR_MAX8; + ret->inst_c0 = (id == SN_EqualsAll) ? SIMD_EXTR_ARE_ALL_SET : SIMD_EXTR_IS_ANY_SET; + ret->inst_c1 = mono_class_value_size (klass, NULL); return ret; } g_assert_not_reached (); @@ -1601,9 +1608,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_not_xequal (cfg, arg_class, cmp, zero); } } else { - MonoInst *cmp = emit_xcompare_for_intrinsic (cfg, arg_class, id, arg0_type, args [0], args [1]); + MonoInst* cmp = emit_xcompare_for_intrinsic (cfg, arg_class, id, arg0_type, args [0], args [1]); MonoInst* ret = emit_simd_ins (cfg, mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); - ret->inst_c0 = is_all ? SIMD_EXTR_MIN8 : SIMD_EXTR_MAX8; + ret->inst_c0 = is_all ? SIMD_EXTR_ARE_ALL_SET : SIMD_EXTR_IS_ANY_SET; + ret->inst_c1 = mono_class_value_size (klass, NULL); return ret; } } @@ -1918,6 +1926,8 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign case SN_op_BitwiseAnd: case SN_op_BitwiseOr: case SN_op_ExclusiveOr: + case SN_op_Equality: + case SN_op_Inequality: break; default: return NULL; From 21a185083601c808c6b35755575bdfc8c556af22 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Mon, 20 Mar 2023 11:46:16 +0100 Subject: [PATCH 09/10] Fixed element type for comparisons. 
--- src/mono/mono/mini/mini-arm64.c | 2 +- src/mono/mono/mini/simd-intrinsics.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 9986f7dfbaeb1..54fa3ca088f22 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3777,7 +3777,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) arm_neon_eor_16b (code, dreg, dreg, dreg); break; case OP_XEXTRACT: - code = emit_xextract (code, (ins->inst_c1 == 16) ? VREG_FULL : VREG_LOW, ins->inst_c0, dreg, sreg1); + code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1); break; /* ALU */ diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index f68ceac06dcf0..3fe658d61d784 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -510,7 +510,8 @@ emit_xequal (MonoCompile *cfg, MonoClass *klass, MonoInst *arg1, MonoInst *arg2) { #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg)) { - MonoInst* cmp = emit_xcompare (cfg, arg1->klass, arg1->type, arg1, arg2); + MonoTypeEnum elemt = get_underlying_type (m_class_get_this_arg (arg1->klass)); + MonoInst* cmp = emit_xcompare (cfg, arg1->klass, elemt, arg1, arg2); MonoInst* ret = emit_simd_ins (cfg, mono_defaults.boolean_class, OP_XEXTRACT, cmp->dreg, -1); ret->inst_c0 = SIMD_EXTR_ARE_ALL_SET; ret->inst_c1 = mono_class_value_size (klass, NULL); From 56aa1ad395b4139956f1a9df6a0ba40501c8b7bf Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Tue, 21 Mar 2023 10:32:36 +0100 Subject: [PATCH 10/10] Temporarily disabled intrinsics. Will be permanently reenabled once all are implemented. 
--- src/mono/mono/mini/simd-intrinsics.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index bf0039faebc01..747e6b237b4b8 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1197,10 +1197,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 -//#ifdef TARGET_ARM64 -// if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) -// return NULL; -//#endif +#ifdef TARGET_ARM64 + if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) + return NULL; +#endif int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); if (id == -1) {