Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bad performing code from #[deriving(Eq)] for enums #13623

Closed
dotdash opened this issue Apr 19, 2014 · 9 comments
Closed

Bad performing code from #[deriving(Eq)] for enums #13623

dotdash opened this issue Apr 19, 2014 · 9 comments
Labels
A-codegen Area: Code generation I-compiletime Issue: Problems and improvements with respect to compile times. I-slow Issue: Problems and improvements with respect to performance of generated code.

Comments

@dotdash
Copy link
Contributor

dotdash commented Apr 19, 2014

#[deriving(Eq)] generates nested matches for enums, like this:

#[deriving(Eq)]
pub enum Foo { A1, A2, A3, }
#[automatically_derived]
impl ::std::cmp::Eq for Foo {
    #[inline]
    fn eq(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => true, _ => false },
            A2 => match *__arg_0 { A2 => true, _ => false },
            A3 => match *__arg_0 { A3 => true, _ => false }
        }
    }
    #[inline]
    fn ne(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => false, _ => true },
            A2 => match *__arg_0 { A2 => false, _ => true },
            A3 => match *__arg_0 { A3 => false, _ => true }
        }
    }
}

Although we emit range constraints for the loads, this doesn't get optimized to
a simple comparison, but instead to this:

define i8 @_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  switch i8 %2, label %match_else [
    i8 0, label %match_case
    i8 1, label %match_case3
  ]

match_else:                                       ; preds = %entry-block
  %3 = load i8* %1, align 1, !range !0
  %cond20 = icmp eq i8 %3, 2
  %. = zext i1 %cond20 to i8
  br label %join18

match_case:                                       ; preds = %entry-block
  %4 = load i8* %1, align 1, !range !0
  %cond19 = icmp eq i8 %4, 0
  %.23 = zext i1 %cond19 to i8
  br label %join18

match_case3:                                      ; preds = %entry-block
  %5 = load i8* %1, align 1, !range !0
  %cond = icmp eq i8 %5, 1
  %.24 = zext i1 %cond to i8
  br label %join18

join18:                                           ; preds = %match_case3, %match_case, %match_else
  %__make_return_pointer.0 = phi i8 [ %., %match_else ], [ %.23, %match_case ], [ %.24, %match_case3 ]
  ret i8 %__make_return_pointer.0
}

Which gets a pretty straight-forward translation to assembly:

    .globl  _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .align  16, 0x90
    .type   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E,@function
_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movb    (%rdi), %al
    testb   %al, %al
    jne .LBB0_3
    cmpb    $0, (%rsi)
    sete    %al
    retq
.LBB0_3:
    movzbl  %al, %eax
    cmpl    $1, %eax
    jne .LBB0_4
    movzbl  (%rsi), %eax
    cmpl    $1, %eax
    sete    %al
    retq
.LBB0_4:
    movzbl  (%rsi), %eax
    cmpl    $2, %eax
    sete    %al
    retq
.Ltmp0:
    .size   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E, .Ltmp0-_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .cfi_endproc

For C-style enums, we could maybe special-case the deriving code to generate something like this instead:

fn eq(&self, rhs: &Foo) -> bool {
    *self as u64 == *rhs as u64
}

But since the discriminant isn't exposed in the language (and is only virtual in case of e.g. Option<&int>), I don't see how to solve this for ADTs in general.

Exposing the discriminant through an intrinsic and doing an early equality check helps LLVM to generate better code, but it's still not optimal.

pub fn test1(a: &Foo, b: &Foo) -> bool {
    let d1 = unsafe { std::intrinsics::get_disr(a) };
    let d2 = unsafe { std::intrinsics::get_disr(b) };

    if d1 != d2 {
        return false;
    }

    match a {
        &A1 => match b { &A1 => true, _ => false },
        &A2 => match b { &A2 => true, _ => false },
        &A3 => match b { &A3 => true, _ => false },
    }
}

Gives:

define zeroext i1 @_ZN5test120h51356ea9d1622fa9haa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %next-block, label %return

next-block:                                       ; preds = %entry-block
  %5 = icmp ne i8 %2, 3
  ret i1 %5

return:                                           ; preds = %entry-block
  ret i1 false
}

And:

    .globl  _ZN5test120h51356ea9d1622fa9haa4v0.0E
    .align  16, 0x90
    .type   _ZN5test120h51356ea9d1622fa9haa4v0.0E,@function
_ZN5test120h51356ea9d1622fa9haa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movzbl  (%rdi), %eax
    movzbl  (%rsi), %ecx
    cmpl    %ecx, %eax
    jne .LBB0_4
    movzbl  %al, %eax
    cmpl    $3, %eax
    setne   %al
    retq
.LBB0_4:
    xorl    %eax, %eax
    retq
.Ltmp0:
    .size   _ZN5test120h51356ea9d1622fa9haa4v0.0E, .Ltmp0-_ZN5test120h51356ea9d1622fa9haa4v0.0E
    .cfi_endproc

But I don't think exposing the discriminant is such a great idea, and having to do the extra check is weird and not quite natural. So I wonder if there's a better way to handle this.


Fun fact: Adding another variant to the enum (so we have pub enum Foo { A1, A2, A3, A4 }) makes LLVM generate better code for some reason:

define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %select.end, label %select.mid

select.mid:                                       ; preds = %entry-block
  br label %select.end

select.end:                                       ; preds = %entry-block, %select.mid
  %. = phi i1 [ true, %entry-block ], [ false, %select.mid ]
  ret i1 %.
}

And adding another round of opt -O2 (LLVM seems to stop optimizing too early here) gives:

define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  ret i1 %4
}

EDIT: Forgot to mention that the code with the intrinsic is based upon my branch that uses i1 for bools. Just to avoid any confusion...

@pnkfelix
Copy link
Member

pnkfelix commented Aug 6, 2014

Note that a lot of the notes in the bug description above may be out-of-date since the deriving code generation strategy has changed in #15503

(However, the strategy from #15503 was only addressing an asymptotic performance issue, not micro-optimization of the resulting code in the manner outlined in the bug described here. Basically someone should go through and check how bad the code generation is now, and look over the follow-on bugs linked from #15503 to see if the other changes that we should consider making could also help in this specific case.)

@dotdash
Copy link
Contributor Author

dotdash commented Aug 7, 2014

IR and asm results are pretty much the same. The only thing that got better is related to bool-as-i1, the zero-extending is gone.

Expanded code:

pub enum Foo { A1, A2, A3, }
#[automatically_derived]
impl ::std::cmp::PartialEq for Foo {
    #[inline]
    fn eq(&self, __arg_0: &Foo) -> ::bool {
        match (&*self, &*__arg_0) {
            (&A1, &A1) => true,
            (&A2, &A2) => true,
            (&A3, &A3) => true,
            _ => {
                let __self_vi =
                    match *self { A1(..) => 0u, A2(..) => 1u, A3(..) => 2u, };
                let __arg_1_vi =
                    match *__arg_0 {
                        A1(..) => 0u,
                        A2(..) => 1u,
                        A3(..) => 2u,
                    };
                false
            }
        }
    }
    #[inline]
    fn ne(&self, __arg_0: &Foo) -> ::bool {
        match (&*self, &*__arg_0) {
            (&A1, &A1) => false,
            (&A2, &A2) => false,
            (&A3, &A3) => false,
            _ => {
                let __self_vi =
                    match *self { A1(..) => 0u, A2(..) => 1u, A3(..) => 2u, };
                let __arg_1_vi =
                    match *__arg_0 {
                        A1(..) => 0u,
                        A2(..) => 1u,
                        A3(..) => 2u,
                    };
                true
            }
        }
    }
}

Optimized IR for eq:

define zeroext i1 @_ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE(i8* noalias nocapture readonly dereferenceable(1), i8* noalias nocapture readonly dereferenceable(1)) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  switch i8 %2, label %case_body3 [
    i8 0, label %match_case
    i8 1, label %match_case6
    i8 2, label %match_case9
  ]

case_body3:                                       ; preds = %match_case9, %match_case6, %match_case, %entry-block
  br label %join25

match_case:                                       ; preds = %entry-block
  %3 = load i8* %1, align 1, !range !0
  %cond27 = icmp eq i8 %3, 0
  br i1 %cond27, label %join25, label %case_body3

match_case6:                                      ; preds = %entry-block
  %4 = load i8* %1, align 1, !range !0
  %cond26 = icmp eq i8 %4, 1
  br i1 %cond26, label %join25, label %case_body3

match_case9:                                      ; preds = %entry-block
  %5 = load i8* %1, align 1, !range !0
  %cond = icmp eq i8 %5, 2
  br i1 %cond, label %join25, label %case_body3

join25:                                           ; preds = %match_case9, %match_case6, %match_case, %case_body3
  %__make_return_pointer.0 = phi i1 [ false, %case_body3 ], [ true, %match_case ], [ true, %match_case6 ], [ true, %match_case9 ]
  ret i1 %__make_return_pointer.0
}

Optimized assembler for eq:

    .section    .text._ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE,"ax",@progbits
    .globl  _ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE
    .align  16, 0x90
    .type   _ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE,@function
_ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE:
    .cfi_startproc
    movb    (%rdi), %al
    testb   %al, %al
    je  .LBB0_4
    movzbl  %al, %eax
    cmpl    $1, %eax
    jne .LBB0_2
    movzbl  (%rsi), %ecx
    movb    $1, %al
    cmpl    $1, %ecx
    jne .LBB0_3
    jmp .LBB0_7
.LBB0_4:
    movb    $1, %al
    cmpb    $0, (%rsi)
    jne .LBB0_3
    jmp .LBB0_7
.LBB0_2:
    cmpl    $2, %eax
    jne .LBB0_3
    movb    $1, %al
    movzbl  (%rsi), %ecx
    cmpl    $2, %ecx
    jne .LBB0_3
.LBB0_7:
    retq
.LBB0_3:
    xorl    %eax, %eax
    retq
.Ltmp0:
    .size   _ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE, .Ltmp0-_ZN25Foo...std..cmp..PartialEq2eq20h0918b2157c9512e2laaE
    .cfi_endproc

@steveklabnik
Copy link
Member

Triage: I am an llvm-ir noob, but

$ rustc hello.rs --pretty=expanded -Z unstable-options
#![feature(no_std)]
#![no_std]
#[prelude_import]
use std::prelude::v1::*;
#[macro_use]
extern crate std as std;
pub enum Foo { A1, A2, A3, }
#[automatically_derived]
impl ::std::cmp::Eq for Foo {
    #[inline]
    #[doc(hidden)]
    fn assert_receiver_is_total_eq(&self) -> () {
        match (&*self,) {
            (&Foo::A1,) => { }
            (&Foo::A2,) => { }
            (&Foo::A3,) => { }
        }
    }
}
#[automatically_derived]
impl ::std::cmp::PartialEq for Foo {
    #[inline]
    fn eq(&self, __arg_0: &Foo) -> bool {
        match (&*self, &*__arg_0) {
            (&Foo::A1, &Foo::A1) => true,
            (&Foo::A2, &Foo::A2) => true,
            (&Foo::A3, &Foo::A3) => true,
            _ => {
                let __self_vi =
                    unsafe { ::std::intrinsics::discriminant_value(&*self) }
                        as i32;
                let __arg_1_vi =
                    unsafe {
                        ::std::intrinsics::discriminant_value(&*__arg_0)
                    } as i32;
                false
            }
        }
    }
    #[inline]
    fn ne(&self, __arg_0: &Foo) -> bool {
        match (&*self, &*__arg_0) {
            (&Foo::A1, &Foo::A1) => false,
            (&Foo::A2, &Foo::A2) => false,
            (&Foo::A3, &Foo::A3) => false,
            _ => {
                let __self_vi =
                    unsafe { ::std::intrinsics::discriminant_value(&*self) }
                        as i32;
                let __arg_1_vi =
                    unsafe {
                        ::std::intrinsics::discriminant_value(&*__arg_0)
                    } as i32;
                true
            }
        }
    }
}

fn main() { }

We're now using the discriminant_value intrinsic. So maybe this ticket should be closed?

@dotdash
Copy link
Contributor Author

dotdash commented Apr 30, 2015

No, this still generates awful IR/asm. LLVM can only make use of the extra information if the comparison using the intrinsic happens before the match, which is not the case.

@pnkfelix
Copy link
Member

I bet we can revise the deriving code to extract those discriminant values more eagerly.

Even my original proposal indicated that we should do

let discrim_self = ...;
let discrim_arg_0 = ...;

at the outset, before the match.

The current code isn't doing that, largely due to historical artifacts (i.e. I was trying to land the fix for quadratic blowup long before we had approved the addition of the discriminant value intrinsic). But I bet it might not be too hard to make it start doing it now.

@pnkfelix
Copy link
Member

and also ... what the heck is going on in that code that @steveklabnik posted? Are we really extracting the discriminant value but then never using it? what the ...

Update: Oh, right: that's an artifact of how the deriving implementation uses the same code generator for many (all?) of the derived forms: so even though we do not use the extracted discriminant value in Eq, we do use it e.g. for Ord.

@dotdash
Copy link
Contributor Author

dotdash commented Sep 1, 2015

C-like enums optimize well now, but others still generate sub-optimal code, especially at the IR level.

Example:

#[derive(PartialEq)]
enum E {
  A(i32),
  B(i32),
  C(i32),
  D(i32),
  A1(i32),
  B1(i32),
  C1(i32),
  D1(i32),
}

IR:

define zeroext i1 @_ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE(%E* noalias nocapture readonly dereferenceable(8), %E* noalias nocapture readonly dereferenceable(8)) unnamed_addr #0 {
entry-block:
  %2 = getelementptr inbounds %E, %E* %0, i64 0, i32 0
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds %E, %E* %1, i64 0, i32 0
  %5 = load i32, i32* %4, align 4, !range !0
  %6 = icmp eq i32 %3, %5
  br i1 %6, label %then-block-74-, label %join63

then-block-74-:                                   ; preds = %entry-block
  switch i32 %3, label %match_else [
    i32 0, label %match_case
    i32 1, label %match_case25
    i32 2, label %match_case28
    i32 3, label %match_case31
    i32 4, label %match_case34
    i32 5, label %match_case37
    i32 6, label %match_case40
    i32 7, label %match_case43
  ]

match_else:                                       ; preds = %then-block-74-
  unreachable

match_case:                                       ; preds = %then-block-74-
  %7 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %8 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %9 = load i32, i32* %8, align 4
  %10 = load i32, i32* %7, align 4
  %11 = icmp eq i32 %9, %10
  %12 = zext i1 %11 to i8
  br label %join63

match_case25:                                     ; preds = %then-block-74-
  %13 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %14 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %15 = load i32, i32* %14, align 4
  %16 = load i32, i32* %13, align 4
  %17 = icmp eq i32 %15, %16
  %18 = zext i1 %17 to i8
  br label %join63

match_case28:                                     ; preds = %then-block-74-
  %19 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %20 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %21 = load i32, i32* %20, align 4
  %22 = load i32, i32* %19, align 4
  %23 = icmp eq i32 %21, %22
  %24 = zext i1 %23 to i8
  br label %join63

match_case31:                                     ; preds = %then-block-74-
  %25 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %26 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %27 = load i32, i32* %26, align 4
  %28 = load i32, i32* %25, align 4
  %29 = icmp eq i32 %27, %28
  %30 = zext i1 %29 to i8
  br label %join63

match_case34:                                     ; preds = %then-block-74-
  %31 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %32 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %33 = load i32, i32* %32, align 4
  %34 = load i32, i32* %31, align 4
  %35 = icmp eq i32 %33, %34
  %36 = zext i1 %35 to i8
  br label %join63

match_case37:                                     ; preds = %then-block-74-
  %37 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %38 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %39 = load i32, i32* %38, align 4
  %40 = load i32, i32* %37, align 4
  %41 = icmp eq i32 %39, %40
  %42 = zext i1 %41 to i8
  br label %join63

match_case40:                                     ; preds = %then-block-74-
  %43 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %44 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %45 = load i32, i32* %44, align 4
  %46 = load i32, i32* %43, align 4
  %47 = icmp eq i32 %45, %46
  %48 = zext i1 %47 to i8
  br label %join63

match_case43:                                     ; preds = %then-block-74-
  %49 = getelementptr inbounds %E, %E* %1, i64 0, i32 2, i64 0
  %50 = getelementptr inbounds %E, %E* %0, i64 0, i32 2, i64 0
  %51 = load i32, i32* %50, align 4
  %52 = load i32, i32* %49, align 4
  %53 = icmp eq i32 %51, %52
  %54 = zext i1 %53 to i8
  br label %join63

join63:                                           ; preds = %entry-block, %match_case, %match_case25, %match_case28, %match_case31, %match_case34, %match_case37, %match_case40, %match_case43
  %sret_slot.1 = phi i8 [ %54, %match_case43 ], [ %48, %match_case40 ], [ %42, %match_case37 ], [ %36, %match_case34 ], [ %30, %match_case31 ], [ %24, %match_case28 ], [ %18, %match_case25 ], [ %12, %match_case ], [ 0, %entry-block ]
  %55 = icmp ne i8 %sret_slot.1, 0
  ret i1 %55
}

ASM

.type   _ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE,@function
_ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE:
    .cfi_startproc
    movl    (%rdi), %eax
    cmpl    (%rsi), %eax
    jne .LBB0_1
    decl    %eax
    cmpl    $6, %eax
    jbe .LBB0_3
.LBB0_4:
    movl    4(%rdi), %eax
    cmpl    4(%rsi), %eax
    sete    %al
    retq
.LBB0_1:
    xorl    %eax, %eax
    retq
.LBB0_3:
    leaq    .LJTI0_0(%rip), %rcx
    movslq  (%rcx,%rax,4), %rax
    addq    %rcx, %rax
    jmpq    *%rax
.Lfunc_end0:
    .size   _ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE, .Lfunc_end0-_ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE
    .cfi_endproc
    .section    .rodata._ZN23E...std..cmp..PartialEq2eq20h8d48eb51e9008df6FaaE,"a",@progbits
    .align  4
.LJTI0_0:
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0
    .long   .LBB0_4-.LJTI0_0

@ranma42
Copy link
Contributor

ranma42 commented Sep 1, 2015

cc @ranma42

@brson
Copy link
Contributor

brson commented Mar 9, 2017

Enums have changed a lot, so closing this old issue. If this is still an issue, do repro and reopen.

@brson brson closed this as completed Mar 9, 2017
Manishearth pushed a commit to Manishearth/rust that referenced this issue Nov 23, 2022
…letions, r=jonas-schievink

fix: Strip comments and attributes off of all trait item completions

Previously, this was done in several places redundantly, but not for all items. It was also untested. This PR fixes that.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
A-codegen Area: Code generation I-compiletime Issue: Problems and improvements with respect to compile times. I-slow Issue: Problems and improvements with respect to performance of generated code.
Projects
None yet
Development

No branches or pull requests

6 participants