Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto merge of #48057 - scottmcm:less-match-more-compare, r=dtolnay
Simplify RangeInclusive::next[_back] `match`ing on an `Option<Ordering>` seems cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying. cc #45222 #28237 (comment) Example: ```rust #[no_mangle] pub fn coresum(x: std::ops::RangeInclusive<u64>) -> u64 { let mut sum = 0; for i in x { sum += i ^ (i-1); } sum } ``` Today: ```asm coresum: xor r8d, r8d mov r9, -1 xor eax, eax jmp .LBB0_1 .LBB0_4: lea rcx, [rdi - 1] xor rcx, rdi add rax, rcx mov rsi, rdx mov rdi, r10 .LBB0_1: cmp rdi, rsi mov ecx, 1 cmovb rcx, r9 cmove rcx, r8 test rcx, rcx mov edx, 0 mov r10d, 1 je .LBB0_4 // 1 cmp rcx, -1 jne .LBB0_5 // 2 lea r10, [rdi + 1] mov rdx, rsi jmp .LBB0_4 // 3 .LBB0_5: ret ``` With this PR: ```asm coresum: cmp rcx, rdx jbe .LBB0_2 xor eax, eax ret .LBB0_2: xor r8d, r8d mov r9d, 1 xor eax, eax .p2align 4, 0x90 .LBB0_3: lea r10, [rcx + 1] cmp rcx, rdx cmovae rdx, r8 cmovae r10, r9 lea r11, [rcx - 1] xor r11, rcx add rax, r11 mov rcx, r10 cmp r10, rdx jbe .LBB0_3 // Just this ret ``` <details><summary>Though using internal iteration (`.map(|i| i ^ (i-1)).sum()`) is still shorter to type, and lets the compiler unroll it</summary> ```asm coresum_inner: .Lcfi0: .seh_proc coresum_inner sub rsp, 168 .Lcfi1: .seh_stackalloc 168 vmovdqa xmmword ptr [rsp + 144], xmm15 .Lcfi2: .seh_savexmm 15, 144 vmovdqa xmmword ptr [rsp + 128], xmm14 .Lcfi3: .seh_savexmm 14, 128 vmovdqa xmmword ptr [rsp + 112], xmm13 .Lcfi4: .seh_savexmm 13, 112 vmovdqa xmmword ptr [rsp + 96], xmm12 .Lcfi5: .seh_savexmm 12, 96 vmovdqa xmmword ptr [rsp + 80], xmm11 .Lcfi6: .seh_savexmm 11, 80 vmovdqa xmmword ptr [rsp + 64], xmm10 .Lcfi7: .seh_savexmm 10, 64 vmovdqa xmmword ptr [rsp + 48], xmm9 .Lcfi8: .seh_savexmm 9, 48 vmovdqa xmmword ptr [rsp + 32], xmm8 .Lcfi9: .seh_savexmm 8, 32 vmovdqa xmmword ptr [rsp + 16], xmm7 .Lcfi10: .seh_savexmm 7, 16 vmovdqa xmmword ptr [rsp], xmm6 .Lcfi11: .seh_savexmm 6, 0 .Lcfi12: .seh_endprologue cmp rdx, rcx jae .LBB1_2 xor eax, eax jmp .LBB1_13 .LBB1_2: mov r8, rdx sub r8, rcx jbe .LBB1_3 cmp r8, 7 jbe .LBB1_5 mov rax, r8 and rax, -8 mov r9, r8 and r9, -8 je .LBB1_5 add rax, rcx vmovq xmm0, rcx vpshufd xmm0, xmm0, 68 mov ecx, 1 vmovq xmm1, rcx vpslldq xmm1, xmm1, 8 vpaddq xmm1, xmm0, xmm1 vpxor xmm0, xmm0, xmm0 vpcmpeqd xmm11, xmm11, xmm11 vmovdqa xmm12, xmmword ptr [rip + __xmm@00000000000000010000000000000001] vmovdqa xmm13, xmmword ptr [rip + __xmm@00000000000000030000000000000003] vmovdqa xmm14, xmmword ptr [rip + __xmm@00000000000000050000000000000005] vmovdqa xmm15, xmmword ptr [rip + __xmm@00000000000000080000000000000008] mov rcx, r9 vpxor xmm4, xmm4, xmm4 vpxor xmm5, xmm5, xmm5 vpxor xmm6, xmm6, xmm6 .p2align 4, 0x90 .LBB1_9: vpaddq xmm7, xmm1, xmmword ptr [rip + __xmm@00000000000000020000000000000002] vpaddq xmm9, xmm1, xmmword ptr [rip + __xmm@00000000000000040000000000000004] vpaddq xmm10, xmm1, xmmword ptr [rip + __xmm@00000000000000060000000000000006] vpaddq xmm8, xmm1, xmm12 vpxor xmm7, xmm8, xmm7 vpaddq xmm2, xmm1, xmm13 vpxor xmm8, xmm2, xmm9 vpaddq xmm3, xmm1, xmm14 vpxor xmm3, xmm3, xmm10 vpaddq xmm2, xmm1, xmm11 vpxor xmm2, xmm2, xmm1 vpaddq xmm0, xmm2, xmm0 vpaddq xmm4, xmm7, xmm4 vpaddq xmm5, xmm8, xmm5 vpaddq xmm6, xmm3, xmm6 vpaddq xmm1, xmm1, xmm15 add rcx, -8 jne .LBB1_9 vpaddq xmm0, xmm4, xmm0 vpaddq xmm0, xmm5, xmm0 vpaddq xmm0, xmm6, xmm0 vpshufd xmm1, xmm0, 78 vpaddq xmm0, xmm0, xmm1 vmovq r10, xmm0 cmp r8, r9 jne .LBB1_6 jmp .LBB1_11 .LBB1_3: xor r10d, r10d jmp .LBB1_12 .LBB1_5: xor r10d, r10d mov rax, rcx .p2align 4, 0x90 .LBB1_6: lea rcx, [rax - 1] xor rcx, rax inc rax add r10, rcx cmp rdx, rax jne .LBB1_6 .LBB1_11: mov rcx, rdx .LBB1_12: lea rax, [rcx - 1] xor rax, rcx add rax, r10 .LBB1_13: vmovaps xmm6, xmmword ptr [rsp] vmovaps xmm7, xmmword ptr [rsp + 16] vmovaps xmm8, xmmword ptr [rsp + 32] vmovaps xmm9, xmmword ptr [rsp + 48] vmovaps xmm10, xmmword ptr [rsp + 64] vmovaps xmm11, xmmword ptr [rsp + 80] vmovaps xmm12, xmmword ptr [rsp + 96] vmovaps xmm13, xmmword ptr [rsp + 112] vmovaps xmm14, xmmword ptr [rsp + 128] vmovaps xmm15, xmmword ptr [rsp + 144] add rsp, 168 ret .seh_handlerdata .section .text,"xr",one_only,coresum_inner .Lcfi13: .seh_endproc ``` </details>
- Loading branch information