[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

echesakov · 2021-05-07T01:00:42Z

This PR also uses AdvSimd.Arm64.StorePair, along with the newly implemented AdvSimd.Arm64.LoadPairVector128, in the following library methods:

System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long

@@ -103,9 +106,7 @@ G_M41550_IG03:
             sub     x3, x3, #32
                                                ;; bbWeight=0.50 PerfScore 8.75
 G_M41550_IG04:
-            ld1     {v17.16b}, [x0]
-            add     x4, x0, #16
-            ld1     {v18.16b}, [x4]
+            ldp     q17, q18, [x0]
             sshr    v17.16b, v17.16b, #7
             and     v17.16b, v17.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
@@ -127,7 +128,7 @@ G_M41550_IG04:
             add     x0, x0, #32
             cmp     x0, x3
             bls     G_M41550_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M41550_IG05:
             mov     w4, w1
             tbz     w4, #4, G_M41550_IG07
@@ -245,9 +246,9 @@ G_M41550_IG19:
 RWD00          dq      1001100110011001h, 1001100110011001h

System.Collections.BitArray:CopyTo(Array,int):this

@@ -419,29 +418,22 @@ G_M19747_IG16:
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
             mov     v10.d[1], v13.d[0]
-            and     v16.16b, v10.16b, v16.16b
+            and     v17.16b, v10.16b, v16.16b
             mov     v8.d[1], v12.d[0]
+            umin    v17.16b, v17.16b, v8.16b
+            mov     v9.d[1], v11.d[0]
+            zip2    v18.16b, v9.16b, v9.16b
+            and     v16.16b, v18.16b, v16.16b
             umin    v16.16b, v16.16b, v8.16b
             mov     w0, w21
             add     x0, x25, x0
-            st1     {v16.16b}, [x0]
-            mov     v9.d[1], v11.d[0]
-            zip2    v16.16b, v9.16b, v9.16b
-            movz    x1, #0xd1ffab1e
-            movk    x1, #0xd1ffab1e LSL #16
-            movk    x1, #0xd1ffab1e LSL #32
-            ldr     x1, [x1]
-            ldr     q17, [x1,#8]
-            and     v16.16b, v16.16b, v17.16b
-            umin    v16.16b, v16.16b, v8.16b
-            add     x0, x0, #16
-            st1     {v16.16b}, [x0]
+            stp     q17, q16, [x0]
             add     w21, w21, #32
             add     w0, w21, #32
             ldr     w1, [x19,#16]
             cmp     w0, w1
             bls     G_M19747_IG16
-                                               ;; bbWeight=4    PerfScore 212.00
+                                               ;; bbWeight=4    PerfScore 176.00
 G_M19747_IG17:
             mov     x0, #0
             str     x0, [fp,#16]       // [V40 loc37]
@@ -648,10 +640,10 @@ RWD00     dd      G_M19747_IG14 - G_M19747_IG02
                dd      G_M19747_IG12 - G_M19747_IG02

System.Collections.BitArray:.ctor(ref):this

-; Lcl frame size = 16
+; Lcl frame size = 8

 G_M47086_IG01:
             stp     fp, lr, [sp,#-112]!
-            stp     d8, d9, [sp,#32]
-            stp     d10, d11, [sp,#48]
-            stp     x19, x20, [sp,#64]
-            stp     x21, x22, [sp,#80]
-            stp     x23, x24, [sp,#96]
+            stp     d8, d9, [sp,#24]
+            stp     d10, d11, [sp,#40]
+            stp     d12, d13, [sp,#56]
+            stp     x19, x20, [sp,#72]
+            stp     x21, x22, [sp,#88]
+            str     x23, [sp,#104]
             mov     fp, sp
-            str     xzr, [fp,#24]      // [V05 loc3]
+            str     xzr, [fp,#16]      // [V05 loc3]
             mov     x19, x0
             mov     x20, x1
-                                               ;; bbWeight=1    PerfScore 8.50
+                                               ;; bbWeight=1    PerfScore 9.50
 G_M47086_IG02:
             cbz     x20, G_M47086_IG14
             ldr     w21, [x20,#8]
@@ -745,49 +740,50 @@ G_M47086_IG02:
                                                ;; bbWeight=1    PerfScore 14.50
 G_M47086_IG03:
             movi    v8.4s, #0x00
-            str     x20, [fp,#24]      // [V05 loc3]
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            str     x20, [fp,#16]      // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cbnz    w0, G_M47086_IG04
             mov     x23, #0
             b       G_M47086_IG06
                                                ;; bbWeight=0.50 PerfScore 4.50
 G_M47086_IG04:
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cmp     w0, #0
             bls     G_M47086_IG15
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             add     x23, x0, #16
             cmp     w21, #32
             blo     G_M47086_IG07
                                                ;; bbWeight=0.50 PerfScore 5.25
 G_M47086_IG05:
             mov     w0, w22
-            add     x24, x23, x0
-            ld1     {v16.16b}, [x24]
-            cmeq    v9.16b, v16.16b, v8.16b
+            add     x0, x23, x0
+            ldp     q16, q17, [x0]
+            mov     v9.16b, v17.16b
+            cmeq    v10.16b, v16.16b, v8.16b
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             mov     w1, #7
-            mov     v10.d[0], v8.d[1]
-            mov     v11.d[0], v9.d[1]
+            mov     v11.d[0], v8.d[1]
+            mov     v12.d[0], v10.d[1]
+            mov     v13.d[0], v9.d[1]
             bl      CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
-            mov     v9.d[1], v11.d[0]
-            and     v17.16b, v9.16b, v16.16b
+            mov     v10.d[1], v12.d[0]
+            and     v17.16b, v10.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
-            add     x0, x24, #16
-            ld1     {v18.16b}, [x0]
-            mov     v8.d[1], v10.d[0]
-            cmeq    v18.16b, v18.16b, v8.16b
+            mov     v9.d[1], v13.d[0]
+            mov     v8.d[1], v11.d[0]
+            cmeq    v18.16b, v9.16b, v8.16b
             and     v16.16b, v18.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
@@ -805,7 +801,7 @@ G_M47086_IG05:
             mvn     w0, w0
             str     w0, [x1, x2]
             add     w22, w22, #32
-                                               ;; bbWeight=2    PerfScore 103.00
+                                               ;; bbWeight=2    PerfScore 101.00
 G_M47086_IG06:
             add     w0, w22, #32
             cmp     w21, w0
@@ -813,7 +809,7 @@ G_M47086_IG06:
                                                ;; bbWeight=4    PerfScore 8.00
 G_M47086_IG07:
             mov     x0, #0
-            str     x0, [fp,#24]       // [V05 loc3]
+            str     x0, [fp,#16]       // [V05 loc3]
             cmp     w21, w22
             bls     G_M47086_IG11
                                                ;; bbWeight=0.50 PerfScore 1.50
@@ -851,14 +847,15 @@ G_M47086_IG11:
             str     wzr, [x19,#20]
                                                ;; bbWeight=1    PerfScore 1.00
 G_M47086_IG12:
-            ldp     x23, x24, [sp,#96]
-            ldp     x21, x22, [sp,#80]
-            ldp     x19, x20, [sp,#64]
-            ldp     d10, d11, [sp,#48]
-            ldp     d8, d9, [sp,#32]
+            ldr     x23, [sp,#104]
+            ldp     x21, x22, [sp,#88]
+            ldp     x19, x20, [sp,#72]
+            ldp     d12, d13, [sp,#56]
+            ldp     d10, d11, [sp,#40]
+            ldp     d8, d9, [sp,#24]
             ldp     fp, lr, [sp],#112
             ret     lr
-                                               ;; bbWeight=1    PerfScore 7.00
+                                               ;; bbWeight=1    PerfScore 9.00

System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this

@@ -69,10 +73,8 @@ G_M26779_IG03:
 G_M26779_IG04:
             lsl     x6, x4, #1
             add     x6, x1, x6
-            ld1     {v20.8h}, [x6]
+            ldp     q20, q21, [x6]
             sqxtun  v20.8b, v20.8h
-            add     x6, x6, #16
-            ld1     {v21.8h}, [x6]
             sqxtun2 v20.16b, v21.8h
             and     v21.16b, v20.16b, v16.16b
             tbl     v21.16b, {v19.16b}, v21.16b
@@ -87,7 +89,7 @@ G_M26779_IG04:
             add     x4, x4, #16
             cmp     x4, x5
             blo     G_M26779_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M26779_IG05:
             mov     w5, w2
             tbz     w5, #3, G_M26779_IG06
@@ -179,8 +181,8 @@ RWD00       dq      8040201008040201h, 0000000000000000h
 RWD16          dq      F00FF00FF00FF00Fh, F00FF00FF00FF00Fh

Fixes #39243

dotnet-issue-labeler · 2021-05-07T01:00:48Z

Note regarding the new-api-needs-documentation label:

This serves as a reminder for when your PR is modifying a ref *.cs file and adding/modifying public APIs, to please make sure the API implementation in the src *.cs file is documented with triple slash comments, so the PR reviewers can sign off that change.

ghost · 2021-05-07T01:00:52Z

Tagging subscribers to this area: @tannergooding
See info in area-owners.md if you want to be subscribed.

Issue Details

This PR also uses AdvSimd.Arm64.StorePair, along with the newly implemented AdvSimd.Arm64.LoadPairVector128, in the following library methods:

System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long

@@ -103,9 +106,7 @@ G_M41550_IG03:
             sub     x3, x3, #32
                                                ;; bbWeight=0.50 PerfScore 8.75
 G_M41550_IG04:
-            ld1     {v17.16b}, [x0]
-            add     x4, x0, #16
-            ld1     {v18.16b}, [x4]
+            ldp     q17, q18, [x0]
             sshr    v17.16b, v17.16b, #7
             and     v17.16b, v17.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
@@ -127,7 +128,7 @@ G_M41550_IG04:
             add     x0, x0, #32
             cmp     x0, x3
             bls     G_M41550_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M41550_IG05:
             mov     w4, w1
             tbz     w4, #4, G_M41550_IG07
@@ -245,9 +246,9 @@ G_M41550_IG19:
 RWD00          dq      1001100110011001h, 1001100110011001h

System.Collections.BitArray:CopyTo(Array,int):this

@@ -419,29 +418,22 @@ G_M19747_IG16:
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
             mov     v10.d[1], v13.d[0]
-            and     v16.16b, v10.16b, v16.16b
+            and     v17.16b, v10.16b, v16.16b
             mov     v8.d[1], v12.d[0]
+            umin    v17.16b, v17.16b, v8.16b
+            mov     v9.d[1], v11.d[0]
+            zip2    v18.16b, v9.16b, v9.16b
+            and     v16.16b, v18.16b, v16.16b
             umin    v16.16b, v16.16b, v8.16b
             mov     w0, w21
             add     x0, x25, x0
-            st1     {v16.16b}, [x0]
-            mov     v9.d[1], v11.d[0]
-            zip2    v16.16b, v9.16b, v9.16b
-            movz    x1, #0xd1ffab1e
-            movk    x1, #0xd1ffab1e LSL #16
-            movk    x1, #0xd1ffab1e LSL #32
-            ldr     x1, [x1]
-            ldr     q17, [x1,#8]
-            and     v16.16b, v16.16b, v17.16b
-            umin    v16.16b, v16.16b, v8.16b
-            add     x0, x0, #16
-            st1     {v16.16b}, [x0]
+            stp     q17, q16, [x0]
             add     w21, w21, #32
             add     w0, w21, #32
             ldr     w1, [x19,#16]
             cmp     w0, w1
             bls     G_M19747_IG16
-                                               ;; bbWeight=4    PerfScore 212.00
+                                               ;; bbWeight=4    PerfScore 176.00
 G_M19747_IG17:
             mov     x0, #0
             str     x0, [fp,#16]       // [V40 loc37]
@@ -648,10 +640,10 @@ RWD00     dd      G_M19747_IG14 - G_M19747_IG02
                dd      G_M19747_IG12 - G_M19747_IG02

System.Collections.BitArray:.ctor(ref):this

-; Lcl frame size = 16
+; Lcl frame size = 8

 G_M47086_IG01:
             stp     fp, lr, [sp,#-112]!
-            stp     d8, d9, [sp,#32]
-            stp     d10, d11, [sp,#48]
-            stp     x19, x20, [sp,#64]
-            stp     x21, x22, [sp,#80]
-            stp     x23, x24, [sp,#96]
+            stp     d8, d9, [sp,#24]
+            stp     d10, d11, [sp,#40]
+            stp     d12, d13, [sp,#56]
+            stp     x19, x20, [sp,#72]
+            stp     x21, x22, [sp,#88]
+            str     x23, [sp,#104]
             mov     fp, sp
-            str     xzr, [fp,#24]      // [V05 loc3]
+            str     xzr, [fp,#16]      // [V05 loc3]
             mov     x19, x0
             mov     x20, x1
-                                               ;; bbWeight=1    PerfScore 8.50
+                                               ;; bbWeight=1    PerfScore 9.50
 G_M47086_IG02:
             cbz     x20, G_M47086_IG14
             ldr     w21, [x20,#8]
@@ -745,49 +740,50 @@ G_M47086_IG02:
                                                ;; bbWeight=1    PerfScore 14.50
 G_M47086_IG03:
             movi    v8.4s, #0x00
-            str     x20, [fp,#24]      // [V05 loc3]
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            str     x20, [fp,#16]      // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cbnz    w0, G_M47086_IG04
             mov     x23, #0
             b       G_M47086_IG06
                                                ;; bbWeight=0.50 PerfScore 4.50
 G_M47086_IG04:
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cmp     w0, #0
             bls     G_M47086_IG15
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             add     x23, x0, #16
             cmp     w21, #32
             blo     G_M47086_IG07
                                                ;; bbWeight=0.50 PerfScore 5.25
 G_M47086_IG05:
             mov     w0, w22
-            add     x24, x23, x0
-            ld1     {v16.16b}, [x24]
-            cmeq    v9.16b, v16.16b, v8.16b
+            add     x0, x23, x0
+            ldp     q16, q17, [x0]
+            mov     v9.16b, v17.16b
+            cmeq    v10.16b, v16.16b, v8.16b
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             mov     w1, #7
-            mov     v10.d[0], v8.d[1]
-            mov     v11.d[0], v9.d[1]
+            mov     v11.d[0], v8.d[1]
+            mov     v12.d[0], v10.d[1]
+            mov     v13.d[0], v9.d[1]
             bl      CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
-            mov     v9.d[1], v11.d[0]
-            and     v17.16b, v9.16b, v16.16b
+            mov     v10.d[1], v12.d[0]
+            and     v17.16b, v10.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
-            add     x0, x24, #16
-            ld1     {v18.16b}, [x0]
-            mov     v8.d[1], v10.d[0]
-            cmeq    v18.16b, v18.16b, v8.16b
+            mov     v9.d[1], v13.d[0]
+            mov     v8.d[1], v11.d[0]
+            cmeq    v18.16b, v9.16b, v8.16b
             and     v16.16b, v18.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
@@ -805,7 +801,7 @@ G_M47086_IG05:
             mvn     w0, w0
             str     w0, [x1, x2]
             add     w22, w22, #32
-                                               ;; bbWeight=2    PerfScore 103.00
+                                               ;; bbWeight=2    PerfScore 101.00
 G_M47086_IG06:
             add     w0, w22, #32
             cmp     w21, w0
@@ -813,7 +809,7 @@ G_M47086_IG06:
                                                ;; bbWeight=4    PerfScore 8.00
 G_M47086_IG07:
             mov     x0, #0
-            str     x0, [fp,#24]       // [V05 loc3]
+            str     x0, [fp,#16]       // [V05 loc3]
             cmp     w21, w22
             bls     G_M47086_IG11
                                                ;; bbWeight=0.50 PerfScore 1.50
@@ -851,14 +847,15 @@ G_M47086_IG11:
             str     wzr, [x19,#20]
                                                ;; bbWeight=1    PerfScore 1.00
 G_M47086_IG12:
-            ldp     x23, x24, [sp,#96]
-            ldp     x21, x22, [sp,#80]
-            ldp     x19, x20, [sp,#64]
-            ldp     d10, d11, [sp,#48]
-            ldp     d8, d9, [sp,#32]
+            ldr     x23, [sp,#104]
+            ldp     x21, x22, [sp,#88]
+            ldp     x19, x20, [sp,#72]
+            ldp     d12, d13, [sp,#56]
+            ldp     d10, d11, [sp,#40]
+            ldp     d8, d9, [sp,#24]
             ldp     fp, lr, [sp],#112
             ret     lr
-                                               ;; bbWeight=1    PerfScore 7.00
+                                               ;; bbWeight=1    PerfScore 9.00

System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this

@@ -69,10 +73,8 @@ G_M26779_IG03:
 G_M26779_IG04:
             lsl     x6, x4, #1
             add     x6, x1, x6
-            ld1     {v20.8h}, [x6]
+            ldp     q20, q21, [x6]
             sqxtun  v20.8b, v20.8h
-            add     x6, x6, #16
-            ld1     {v21.8h}, [x6]
             sqxtun2 v20.16b, v21.8h
             and     v21.16b, v20.16b, v16.16b
             tbl     v21.16b, {v19.16b}, v21.16b
@@ -87,7 +89,7 @@ G_M26779_IG04:
             add     x4, x4, #16
             cmp     x4, x5
             blo     G_M26779_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M26779_IG05:
             mov     w5, w2
             tbz     w5, #3, G_M26779_IG06
@@ -179,8 +181,8 @@ RWD00       dq      8040201008040201h, 0000000000000000h
 RWD16          dq      F00FF00FF00FF00Fh, F00FF00FF00FF00Fh

Author:	echesakovMSFT
Assignees:	-
Labels:	`area-System.Runtime.Intrinsics`, `new-api-needs-documentation`
Milestone:	-

am11 · 2021-05-07T22:10:03Z

System.Collections.BitArray:.ctor(ref):this

seems to have a slight regression. Is keeping that method to continue using LoadVector128 (no pair) better?

echesakov · 2021-05-10T16:53:14Z

> System.Collections.BitArray:.ctor(ref):this
>
> seems to have a slight regression. Is keeping that method to continue using LoadVector128 (no pair) better?

Agree, the regression is due to register movements around the call. I will update the changes after fixing the assertion that the CI testing has revealed.

ghost · 2021-06-09T22:00:16Z

Draft Pull Request was automatically closed for inactivity. Please let us know if you'd like to reopen it.

…formNotSupported.cs

…dvSimd.cs AdvSimd.PlatformNotSupported.cs

…otSupported.cs

…elpers.cs src/tests/JIT/HardwareIntrinsics/Arm/Shared/Helpers.tt

…in lsra.cpp

…alues in multiple registers in lsra.h lsraarm64.cpp lsraxarch.cpp

…ccodegenarm64.cpp

…ic and always use BlkOpKindUnroll for such cases in lowerarmarch.cpp

…odegenarmarch.cpp

… lsraarmarch.cpp

…imd64.cs

echesakov · 2021-06-25T18:03:10Z

@imhameed @fanyang-mono @vargaz @naricc Do you have any guidance on how to add support for Arm64 intrinsics returning (Vector64<T>, Vector64<T>)/(Vector128<T>, Vector128<T>) in Mono? Is it even feasible to complete in .NET 6?

echesakov · 2021-06-25T19:22:19Z

It looks like LLVM already folds two consecutive SIMD loads into one SIMD ldp instruction, so if Mono codegen emits IR as in https://godbolt.org/z/7b66oar6M, the output for LoadPairVector64/LoadPairVector128 would be the same as the one generated by the JIT.

imhameed · 2021-06-25T19:38:06Z

Several options for generating an ldp: https://godbolt.org/z/svjEqavPj

The annoying part would be fabricating a ValueTuple (needs an appropriately-sized alloca along with two stores, I guess), which I don't think we have any special handling for, and which I don't know how to do offhand. I'm in the middle of building an arm64 Linux copy of Mono to see what we currently do although this is taking me longer than I'd like because Parallels is mysteriously making my VM drop to 1GB of total memory from 10GB after running for an hour or so... maybe I'll try UTM

imhameed · 2021-06-26T19:14:49Z

@echesakovMSFT here's an implementation of LoadPairVector for Mono that passes the tests you've added: imhameed@ec909cb

The generated code is sometimes good and sometimes bad; we'll need to improve our calling convention and possibly improve our treatment of value types for more reliably good output. LLVM doesn't seem to have any support for lowering !nontemporal loads to ldnp in 11.0.1 or in main so there's not much we can do about that without emitting a small inline assembly blob.

echesakov · 2021-06-29T17:27:34Z

@imhameed Thank you for the follow-up and for implementing support on the Mono side!

> The generated code is sometimes good and sometimes bad; we'll need to improve our calling convention and possibly improve our treatment of value types for more reliably good output. LLVM doesn't seem to have any support for lowering !nontemporal loads to ldnp in 11.0.1 or in main so there's not much we can do about that without emitting a small inline assembly blob.

Do you think we can take your changes to .NET 6.0? If so, can you please push the commit to this PR? I found some issues with CoreCLR implementation of multi-reg nodes. I hope I will resolve them by .NET 6 deadline and, in that case, we can merge both changes to CoreCLR and Mono as one PR.

imhameed · 2021-06-29T17:31:31Z

Yeah I think the Mono changes should be fine for .NET 6. I'll push it to your branch, and merging it as part of this PR sounds perfect to me.

Fabricates a `ValueTuple<T, T>` for the result in a local alloca.

echesakov · 2021-07-08T18:00:31Z

I don't think this will make it into .NET 6. I found more issues with the multi-register nodes implementation, so I am moving this PR and the corresponding issue to the Future milestone.

echesakov · 2021-07-08T18:02:56Z

Will re-open after I fix the issues.

kunalspathak · 2021-09-23T19:20:53Z

Tagging myself

dotnet-issue-labeler bot added area-System.Runtime.Intrinsics new-api-needs-documentation labels May 7, 2021

echesakov added arch-arm64 and removed area-System.Runtime.Intrinsics new-api-needs-documentation labels May 7, 2021

echesakov added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label May 7, 2021

echesakov self-assigned this May 7, 2021

echesakov added this to the 6.0.0 milestone May 7, 2021

tannergooding mentioned this pull request May 7, 2021

Copy/paste error in GenTree::GetRegSpillFlagByIdx for xarch hwintrinsics #52473

Closed

echesakov mentioned this pull request May 10, 2021

[Arm64] Planned JIT work in .NET 6 #43629

Closed

29 tasks

runfoapp bot mentioned this pull request May 20, 2021

readytorun/determinism/crossgen2determinism/crossgen2determinism.sh [FAIL] #50466

Closed

ghost closed this Jun 9, 2021

echesakov mentioned this pull request Jun 23, 2021

[Arm64] LoadPairVector64 and LoadPairVector128 #39243

Closed

echesakov reopened this Jun 25, 2021

echesakov added 8 commits June 24, 2021 20:27

Add LoadPairVector64 and LoadPairVector128 in AdvSimd.cs AdvSimd.Plat…

a920b9b

…formNotSupported.cs

Add LoadPairScalarVector64 in AdvSimd.cs AdvSimd.PlatformNotSupported.cs

2900887

Add LoadPairVector64NonTemporal and LoadPairVector128NonTemporal in A…

197bd6f

…dvSimd.cs AdvSimd.PlatformNotSupported.cs

Add LoadPairScalarVector64NonTemporal in AdvSimd.cs AdvSimd.PlatformN…

5286ac9

…otSupported.cs

Update System.Runtime.Intrinsics.cs

ccda678

Add LoadPairScalar() in src/tests/JIT/HardwareIntrinsics/Arm/Shared/H…

dbb5111

…elpers.cs src/tests/JIT/HardwareIntrinsics/Arm/Shared/Helpers.tt

Add LoadPairVectorTest.template

969e5b2

Add LoadPairVector64 and LoadPairVector128 in GenerateTests.csx

6823274

echesakov added 10 commits June 24, 2021 20:28

Support multi-register HW intrinsics on arm64 in lsraAssignRegToTree …

39157b5

…in lsra.cpp

Extend LinearScan::BuildHWIntrinsic to support intrinsics returning v…

200a78a

…alues in multiple registers in lsra.h lsraarm64.cpp lsraxarch.cpp

Allow promotion of HFAs with CUSTOMLAYOUT flag in lclvars.cpp

ff79ad4

Support LoadPairVector128/64 in CodeGen::genHWIntrinsic in hwintrinsi…

a99b2bc

…ccodegenarm64.cpp

Transform GT_STORE_OBJ to GT_STORE_BLK when src is a multireg intrins…

25dcf77

…ic and always use BlkOpKindUnroll for such cases in lowerarmarch.cpp

Support multi-reg intrinsics in CodeGen::genCodeForCpBlkUnroll() in c…

f2f099d

…odegenarmarch.cpp

Consume multi-reg nodes registers in LinearScan::BuildBlockStore() in…

47fb926

… lsraarmarch.cpp

Use AdvSimd.Arm64.LoadPairVector128 in ASCIIUtility.cs

db22dcc

Use AdvSimd.Arm64.StorePair in BitArray.cs

0d45572

Use AdvSimd.Arm64.LoadPairVector128 in OptimizedInboxTextEncoder.AdvS…

fab6ef3

…imd64.cs

echesakov force-pushed the Arm64-ASIMD-LoadPairVector64-LoadPairVector128 branch from b78e415 to fab6ef3 Compare June 25, 2021 03:28

echesakov marked this pull request as ready for review June 25, 2021 03:31

runfoapp bot mentioned this pull request Jun 25, 2021

Test failure System.Threading.Tests.PeriodicTimerTests.WaitForNextTickAsync_CanceledWaitThenWaitAgain_Succeeds #54713

Closed

echesakov marked this pull request as draft June 25, 2021 21:34

[mono] Implement LoadPair{,Scalar}Vector{64,128}{,NonTemporal}

82bb175

Fabricates a `ValueTuple<T, T>` for the result in a local alloca.

echesakov modified the milestones: 6.0.0, Future Jul 8, 2021

echesakov closed this Jul 8, 2021

imhameed mentioned this pull request Jul 8, 2021

[mono] Tracking: Intrinsics implementation #43051

Open

58 tasks

ghost locked as resolved and limited conversation to collaborators Aug 7, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

echesakov commented May 7, 2021 •

edited

Loading

dotnet-issue-labeler bot commented May 7, 2021

ghost commented May 7, 2021

am11 commented May 7, 2021

echesakov commented May 10, 2021

ghost commented Jun 9, 2021

echesakov commented Jun 25, 2021

echesakov commented Jun 25, 2021

imhameed commented Jun 25, 2021

imhameed commented Jun 26, 2021

echesakov commented Jun 29, 2021

imhameed commented Jun 29, 2021

echesakov commented Jul 8, 2021

echesakov commented Jul 8, 2021

kunalspathak commented Sep 23, 2021

[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

Conversation

echesakov commented May 7, 2021 • edited Loading

dotnet-issue-labeler bot commented May 7, 2021

ghost commented May 7, 2021

am11 commented May 7, 2021

echesakov commented May 10, 2021

ghost commented Jun 9, 2021

echesakov commented Jun 25, 2021

echesakov commented Jun 25, 2021

imhameed commented Jun 25, 2021

imhameed commented Jun 26, 2021

echesakov commented Jun 29, 2021

imhameed commented Jun 29, 2021

echesakov commented Jul 8, 2021

echesakov commented Jul 8, 2021

kunalspathak commented Sep 23, 2021

echesakov commented May 7, 2021 •

edited

Loading