Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RyuJIT][LSRA] Let variables within a loop use register first #8846

Open
303248153 opened this issue Sep 1, 2017 · 4 comments
Open

[RyuJIT][LSRA] Let variables within a loop use register first #8846

303248153 opened this issue Sep 1, 2017 · 4 comments
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI enhancement Product code improvement that does NOT require public API changes/additions optimization tenet-performance Performance related issue
Milestone

Comments

@303248153
Copy link
Contributor

Consider this code:

[MethodImpl(MethodImplOptions.NoInlining)]
static int GetRandom() {
	return 0;
}

[MethodImpl(MethodImplOptions.NoInlining)]
static void Print(int x) {
	// Console.WriteLine(x);
}

static void SampleA() {
	var a = GetRandom();
	var b = GetRandom();
	var c = GetRandom();
	var d = GetRandom();
	var e = GetRandom();
	var f = GetRandom();
	var g = GetRandom();
	var h = GetRandom();
	for (var x = 0; x < 100; ++x) {
		var xa = GetRandom();
		var xb = GetRandom();
		var xc = GetRandom();
		var xd = GetRandom();
		var xe = GetRandom();
		var xf = GetRandom();
		Print(xa);
		Print(xb);
		Print(xc);
		Print(xd);
		Print(xe);
		Print(xf);
	}
	Print(a);
	Print(b);
	Print(c);
	Print(d);
	Print(e);
	Print(f);
	Print(g);
	Print(h);
}

it generated following assembly code on Windows x64 .Net Core 2.0

  push        r14  
00007FFC38150544 41 55                push        r13  
00007FFC38150546 41 54                push        r12  
00007FFC38150548 57                   push        rdi  
00007FFC38150549 56                   push        rsi  
00007FFC3815054A 55                   push        rbp  
00007FFC3815054B 53                   push        rbx  
00007FFC3815054C 48 83 EC 48          sub         rsp,48h  
00007FFC38150550 E8 63 FB FF FF       call        00007FFC381500B8  
00007FFC38150555 8B F0                mov         esi,eax  
   192: 			var b = GetRandom();
00007FFC38150557 E8 5C FB FF FF       call        00007FFC381500B8  
00007FFC3815055C 8B F8                mov         edi,eax  
   193: 			var c = GetRandom();
00007FFC3815055E E8 55 FB FF FF       call        00007FFC381500B8  
00007FFC38150563 8B D8                mov         ebx,eax  
   194: 			var d = GetRandom();
00007FFC38150565 E8 4E FB FF FF       call        00007FFC381500B8  
00007FFC3815056A 8B E8                mov         ebp,eax  
   195: 			var e = GetRandom();
00007FFC3815056C E8 47 FB FF FF       call        00007FFC381500B8  
   195: 			var e = GetRandom();
00007FFC38150571 44 8B F0             mov         r14d,eax  
   196: 			var f = GetRandom();
00007FFC38150574 E8 3F FB FF FF       call        00007FFC381500B8  
00007FFC38150579 44 8B F8             mov         r15d,eax  
   197: 			var g = GetRandom();
00007FFC3815057C E8 37 FB FF FF       call        00007FFC381500B8  
00007FFC38150581 44 8B E0             mov         r12d,eax  
   198: 			var h = GetRandom();
00007FFC38150584 E8 2F FB FF FF       call        00007FFC381500B8  
00007FFC38150589 44 8B E8             mov         r13d,eax  
   199: 			for (var x = 0; x < 100; ++x) {
00007FFC3815058C 33 C0                xor         eax,eax  
00007FFC3815058E 89 44 24 44          mov         dword ptr [rsp+44h],eax  
   200: 				var xa = GetRandom();
00007FFC38150592 E8 21 FB FF FF       call        00007FFC381500B8  
00007FFC38150597 89 44 24 40          mov         dword ptr [rsp+40h],eax  
   201: 				var xb = GetRandom();
00007FFC3815059B E8 18 FB FF FF       call        00007FFC381500B8  
00007FFC381505A0 89 44 24 3C          mov         dword ptr [rsp+3Ch],eax  
   202: 				var xc = GetRandom();
00007FFC381505A4 E8 0F FB FF FF       call        00007FFC381500B8  
00007FFC381505A9 89 44 24 38          mov         dword ptr [rsp+38h],eax  
   203: 				var xd = GetRandom();
00007FFC381505AD E8 06 FB FF FF       call        00007FFC381500B8  
00007FFC381505B2 89 44 24 34          mov         dword ptr [rsp+34h],eax  
   204: 				var xe = GetRandom();
00007FFC381505B6 E8 FD FA FF FF       call        00007FFC381500B8  
00007FFC381505BB 89 44 24 30          mov         dword ptr [rsp+30h],eax  
   205: 				var xf = GetRandom();
00007FFC381505BF E8 F4 FA FF FF       call        00007FFC381500B8  
00007FFC381505C4 89 44 24 2C          mov         dword ptr [rsp+2Ch],eax  
00007FFC381505C8 8B 4C 24 40          mov         ecx,dword ptr [rsp+40h]  
00007FFC381505CC E8 EF FA FF FF       call        00007FFC381500C0  
   207: 				Print(xb);
00007FFC381505D1 8B 4C 24 3C          mov         ecx,dword ptr [rsp+3Ch]  
00007FFC381505D5 E8 E6 FA FF FF       call        00007FFC381500C0  
   208: 				Print(xc);
00007FFC381505DA 8B 4C 24 38          mov         ecx,dword ptr [rsp+38h]  
00007FFC381505DE E8 DD FA FF FF       call        00007FFC381500C0  
   209: 				Print(xd);
00007FFC381505E3 8B 4C 24 34          mov         ecx,dword ptr [rsp+34h]  
00007FFC381505E7 E8 D4 FA FF FF       call        00007FFC381500C0  
   210: 				Print(xe);
00007FFC381505EC 8B 4C 24 30          mov         ecx,dword ptr [rsp+30h]  
   210: 				Print(xe);
00007FFC381505F0 E8 CB FA FF FF       call        00007FFC381500C0  
   211: 				Print(xf);
00007FFC381505F5 8B 4C 24 2C          mov         ecx,dword ptr [rsp+2Ch]  
00007FFC381505F9 E8 C2 FA FF FF       call        00007FFC381500C0  
   199: 			for (var x = 0; x < 100; ++x) {
00007FFC381505FE 8B 4C 24 44          mov         ecx,dword ptr [rsp+44h]  
00007FFC38150602 FF C1                inc         ecx  
00007FFC38150604 83 F9 64             cmp         ecx,64h  
00007FFC38150607 89 4C 24 44          mov         dword ptr [rsp+44h],ecx  
00007FFC3815060B 7C 85                jl          00007FFC38150592  
   212: 			}
   213: 			Print(a);
00007FFC3815060D 8B CE                mov         ecx,esi  
00007FFC3815060F E8 AC FA FF FF       call        00007FFC381500C0  
   214: 			Print(b);
00007FFC38150614 8B CF                mov         ecx,edi  
   214: 			Print(b);
00007FFC38150616 E8 A5 FA FF FF       call        00007FFC381500C0  
   215: 			Print(c);
00007FFC3815061B 8B CB                mov         ecx,ebx  
00007FFC3815061D E8 9E FA FF FF       call        00007FFC381500C0  
   216: 			Print(d);
00007FFC38150622 8B CD                mov         ecx,ebp  
00007FFC38150624 E8 97 FA FF FF       call        00007FFC381500C0  
   217: 			Print(e);
00007FFC38150629 41 8B CE             mov         ecx,r14d  
00007FFC3815062C E8 8F FA FF FF       call        00007FFC381500C0  
   218: 			Print(f);
00007FFC38150631 41 8B CF             mov         ecx,r15d  
00007FFC38150634 E8 87 FA FF FF       call        00007FFC381500C0  
   219: 			Print(g);
00007FFC38150639 41 8B CC             mov         ecx,r12d  
00007FFC3815063C E8 7F FA FF FF       call        00007FFC381500C0  
   220: 			Print(h);
00007FFC38150641 41 8B CD             mov         ecx,r13d  
00007FFC38150644 48 B8 C0 00 15 38 FC 7F 00 00 mov         rax,7FFC381500C0h  
00007FFC3815064E 48 83 C4 48          add         rsp,48h  
00007FFC38150652 5B                   pop         rbx  
00007FFC38150653 5D                   pop         rbp  
00007FFC38150654 5E                   pop         rsi  
00007FFC38150655 5F                   pop         rdi  
00007FFC38150656 41 5C                pop         r12  
00007FFC38150658 41 5D                pop         r13  
00007FFC3815065A 41 5E                pop         r14  
00007FFC3815065C 41 5F                pop         r15  
00007FFC3815065E 48 FF E0             jmp         rax 

As we can see a~h is holding register esi~r13 yet xa~xf always use stack.

Then I write another version:

static volatile int Dummy;

static void SampleB() {
	var xa = Dummy;
	var xb = Dummy;
	var xc = Dummy;
	var xd = Dummy;
	var xe = Dummy;
	var xf = Dummy;
	var a = GetRandom();
	var b = GetRandom();
	var c = GetRandom();
	var d = GetRandom();
	var e = GetRandom();
	var f = GetRandom();
	var g = GetRandom();
	var h = GetRandom();
	Dummy = xa;
	Dummy = xb;
	Dummy = xc;
	Dummy = xd;
	Dummy = xe;
	Dummy = xf;
	for (var x = 0; x < 100; ++x) {
		xa = GetRandom();
		xb = GetRandom();
		xc = GetRandom();
		xd = GetRandom();
		xe = GetRandom();
		xf = GetRandom();
		Print(xa);
		Print(xb);
		Print(xc);
		Print(xd);
		Print(xe);
		Print(xf);
	}
	Print(a);
	Print(b);
	Print(c);
	Print(d);
	Print(e);
	Print(f);
	Print(g);
	Print(h);
}

And run a benchmark, gives

1000000 loops SampleA: 2.2854432
1000000 loops SampleB: 1.6177438

SampleB is much faster because it let xa~xf use register, the following 100 cycles will also use these registers.

I dig a bit with lldb on Linux x64, here is what's happing during LSRA form SampleA:
Notice it's on Linux x64 so the registers doesn't match with the assembly code above.

  • tryAllocateFreeReg(interval of a, refposition of st.lclvar a) gives rbx
  • tryAllocateFreeReg(interval of b, refposition of st.lclvar b) gives r14
  • tryAllocateFreeReg(interval of c, refposition of st.lclvar c) gives r15
  • tryAllocateFreeReg(interval of d, refposition of st.lclvar d) gives r12
  • tryAllocateFreeReg(interval of e, refposition of st.lclvar e) gives r13
  • tryAllocateFreeReg(interval of f, refposition of st.lclvar f) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of f holding rax will spill
  • tryAllocateFreeReg(interval of g, refposition of st.lclvar g) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of g holding rax will spill
  • tryAllocateFreeReg(interval of h, refposition of st.lclvar h) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of h holding rax will spill
  • tryAllocateFreeReg(interval of xa, refposition of st.lclvar xa) gives rax
  • handle refposition of call, because it's isPhysRegRef, the interval of xa holding rax will spill
  • and so on...

During the process, allocateBusyReg is not used at all,
I can see getWeight(refPos of st.lclvar a) is 200 and getWeight(refPos of st.lclvar xa) is 800, but xa has no chance to get the register.

On the contrary, gcc and clang can handle this situation very well:

godblot

category:cq
theme:register-allocator
skill-level:expert
cost:large

@BruceForstall
Copy link
Member

@CarolEidt @dotnet/jit-contrib

@CarolEidt CarolEidt changed the title [RyuJIT] Let variables within a loop use register first [RyuJIT][LSRA] Let variables within a loop use register first Sep 1, 2017
@CarolEidt
Copy link
Contributor

This is related to #6825 and #8019, though I thought there was already an issue specifically for the loop case.

I believe that the right way to handle the loop issue, beyond (or in addition to) #8019, is to not blindly use the variable assignments of a predecessor when entering the loop. Instead, evict any lclVars unused within the loop as needed to ensure that as many loop variables as possible are in registers.

@msftgits msftgits transferred this issue from dotnet/coreclr Jan 31, 2020
@msftgits msftgits added this to the Future milestone Jan 31, 2020
@CarolEidt
Copy link
Contributor

Another case where we're spilling in a loop is in the inner loop of ConsoleMandel on arm64. Here is the inner loop for the FloatNoADT case where V16 is spilled instead of freeing up another register:

G_M3984_IG05:		;; bbWeight=64   
            fmul    v20.4s, v17.4s, v17.4s
            fmul    v21.4s, v18.4s, v18.4s
            fsub    v20.4s, v20.4s, v21.4s
            fmul    v21.4s, v17.4s, v18.4s
            fmul    v17.4s, v17.4s, v18.4s
            fadd    v18.4s, v21.4s, v17.4s
            str     q16, [fp,#32]	// [V16 loc10]
            fadd    v17.4s, v20.4s, v16.4s
            fadd    v18.4s, v18.4s, v8.4s
            add     v11.4s, v11.4s, v19.4s
            fmul    v20.4s, v17.4s, v17.4s
            fmul    v21.4s, v18.4s, v18.4s
            fadd    v20.4s, v20.4s, v21.4s
            fcmge   v20.4s, v12.4s, v20.4s
            cmge    v21.4s, v9.4s, v11.4s
            and     v20.4s, v20.4s, v21.4s
            and     v19.4s, v19.4s, v20.4s
            dup     v20.4s, wzr
            cmeq    v21.4s, v19.4s, v20.4s
            uminv   v21.16b, v21.16b
            mov     w0, v21.b[0]
            eor     w0, w0, #1
            and     w0, w0, #1
            ldr     q16, [fp,#32]	// [V16 loc10] 
            cbnz    w0, G_M3984_IG05

Once #31683 is fixed, there may be fewer registers used within the loop, so it may no longer manifest.

@CarolEidt CarolEidt modified the milestones: Future, 6.0.0 Oct 27, 2020
@JulieLeeMSFT JulieLeeMSFT added the needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration label Mar 23, 2021
@kunalspathak kunalspathak removed the needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration label Apr 9, 2021
@kunalspathak kunalspathak modified the milestones: 6.0.0, Future Apr 9, 2021
@kunalspathak
Copy link
Member

We want to tune our overall heuristics based on the experiments we do as part of #43318. While we will do the experiments in .NET 6, it is highly unlikely that we would bake that in produce during .NET 6. Moving to future.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI enhancement Product code improvement that does NOT require public API changes/additions optimization tenet-performance Performance related issue
Projects
None yet
Development

No branches or pull requests

6 participants