Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Immutable constructors should be inlined even when they have a lot of arguments #12165

Closed
simonster opened this issue Jul 15, 2015 · 9 comments
Closed
Labels
compiler:codegen Generation of LLVM IR and native code performance Must go faster

Comments

@simonster
Copy link
Member

e.g.:

julia> immutable X
           a::Int
           b::Int
           c::Int
           d::Int
           e::Int
           f::Int
           g::Int
           h::Int
           i::Int
       end

julia> f() = X(1, 2, 3, 4, 5, 6, 7, 8, 9);

julia> @code_llvm f()

define %X @julia_f_21095() {
top:
  %0 = load %jl_value_t** inttoptr (i64 4589611400 to %jl_value_t**), align 8
  %1 = call %X @julia_call_21096(%jl_value_t* %0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
  ret %X %1
}

The work being done here still needs to be done even if the constructor is not inlined. All making a function call to the constructor does is needlessly move memory around and make it harder for LLVM to optimize things.

@simonster simonster added the performance Must go faster label Jul 15, 2015
@JeffBezanson
Copy link
Sponsor Member

Inlining big things everywhere has a cost in code size.

@JeffBezanson JeffBezanson removed the performance Must go faster label Jul 15, 2015
@JeffBezanson
Copy link
Sponsor Member

Removing performance tag until there is performance data.

@JeffBezanson JeffBezanson added the compiler:codegen Generation of LLVM IR and native code label Jul 15, 2015
@vtjnash
Copy link
Sponsor Member

vtjnash commented Jul 15, 2015

there's a likelihood that the inlining threshold should be proportional to the number of arguments to the function.

@simonster
Copy link
Member Author

The code should actually be smaller for this case if you inline. Here's what it looks like just below the inlining threshold:

julia> immutable X
           a::Int
           b::Int
           c::Int
           d::Int
           e::Int
           f::Int
           g::Int
       end

julia> f() = X(1, 2, 3, 4, 5, 6, 7);

julia> @code_native f()
    .section    __TEXT,__text,regular,pure_instructions
Filename: none
Source line: 1
    pushq   %rbp
    movq    %rsp, %rbp
Source line: 1
    movq    $7, 48(%rdi)
    movq    $6, 40(%rdi)
    movq    $5, 32(%rdi)
    movq    $4, 24(%rdi)
    movq    $3, 16(%rdi)
    movq    $2, 8(%rdi)
    movq    $1, (%rdi)
    popq    %rbp
    ret

and here's an example just above the inlining threshold:

julia> immutable X
           a::Int
           b::Int
           c::Int
           d::Int
           e::Int
           f::Int
           g::Int
           i::Int
       end

julia> f() = X(1, 2, 3, 4, 5, 6, 7, 8);

julia> @code_native f()
    .section    __TEXT,__text,regular,pure_instructions
Filename: none
Source line: 1
    pushq   %rbp
    movq    %rsp, %rbp
    pushq   %rbx
    subq    $104, %rsp
    movq    %rdi, %rbx
    movabsq $4560738376, %rax       ## imm = 0x10FD75848
Source line: 1
    movq    (%rax), %rsi
    movq    $8, 24(%rsp)
    movq    $7, 16(%rsp)
    movq    $6, 8(%rsp)
    movq    $5, (%rsp)
    leaq    -72(%rbp), %rdi
    movabsq $13335795376, %rax      ## imm = 0x31AE016B0
    movl    $1, %edx
    movl    $2, %ecx
    movl    $3, %r8d
    movl    $4, %r9d
    callq   *%rax
    movq    -40(%rbp), %rax
    movq    -32(%rbp), %rcx
    movq    -24(%rbp), %rdx
    movq    -16(%rbp), %rsi
    movq    %rsi, 56(%rbx)
    movq    %rdx, 48(%rbx)
    movq    %rcx, 40(%rbx)
    movq    %rax, 32(%rbx)
    movq    -48(%rbp), %rax
    movq    %rax, 24(%rbx)
    movq    -56(%rbp), %rax
    movq    %rax, 16(%rbx)
    movq    -72(%rbp), %rax
    movq    -64(%rbp), %rcx
    movq    %rcx, 8(%rbx)
    movq    %rax, (%rbx)
    addq    $104, %rsp
    popq    %rbx
    popq    %rbp
    ret

@JeffBezanson
Copy link
Sponsor Member

Yes, @vtjnash is probably right that having more arguments should increase the chance of inlining, especially once you get to more than ~5 arguments or so.

@Keno
Copy link
Member

Keno commented Jul 15, 2015

Another big issue here is the lack of sret on that return parameter causing the extra copy after the callq.

@simonster
Copy link
Member Author

Okay, here's a stripped down version of the code where I noticed that the code I got was not what I expected. When this can be inlined, not only is there less unnecessary memory movement going on, but LLVM can perform fewer comparisons because it knows exactly what needs to be compared.

immutable FloatingPointDatatype
    class::UInt8
    bitfield1::UInt8
    bitfield2::UInt8
    bitfield3::UInt8
    size::UInt32
    bitoffset::UInt16
    bitprecision::UInt16
    exponentlocation::UInt8
    exponentsize::UInt8
    mantissalocation::UInt8
    mantissasize::UInt8
    exponentbias::UInt32
end

h5type(::Type{Float16}) =
    FloatingPointDatatype(0x00, 0x20, 0x0f, 0x00, UInt32(2), 0x0000, UInt16(16), UInt8(10), 0x05, 0x00, UInt32(10), 0x0000000f)
h5type(::Type{Float32}) =
    FloatingPointDatatype(0x00, 0x20, 0x1f, 0x00, UInt32(4), 0x0000, UInt16(32), UInt8(23), 0x08, 0x00, UInt32(23), 0x0000007f)
h5type(::Type{Float64}) =
    FloatingPointDatatype(0x00, 0x20, 0x3f, 0x00, UInt32(8), 0x0000, UInt16(64), UInt8(52), 0x0b, 0x00, UInt32(52), 0x000003ff)

immutable UnsupportedFeatureException <: Exception end

function jltype(dt::FloatingPointDatatype)
    if dt == h5type(Float64)
        return 64
    elseif dt == h5type(Float32)
        return 32
    elseif dt == h5type(Float16)
        return 16
    else
        throw(UnsupportedFeatureException())
    end
end

function jltype_test()
    x = fill(h5type(Float16), 1000000)
    y = 0
    @time for i = 1:length(x)
        y += jltype(x[i])
    end
    y
end

I can force inlining by defining:

@eval function jltype(dt::FloatingPointDatatype)
    if dt == $(Expr(:new, :FloatingPointDatatype, 0x00, 0x20, 0x3f, 0x00, UInt32(8), 0x0000, UInt16(64), UInt8(52), 0x0b, 0x00, UInt32(52), 0x000003ff))
        return 64
    elseif dt == $(Expr(:new, :FloatingPointDatatype, 0x00, 0x20, 0x1f, 0x00, UInt32(4), 0x0000, UInt16(32), UInt8(23), 0x08, 0x00, UInt32(23), 0x0000007f))
        return 32
    elseif dt == $(Expr(:new, :FloatingPointDatatype, 0x00, 0x20, 0x0f, 0x00, UInt32(2), 0x0000, UInt16(16), UInt8(10), 0x05, 0x00, UInt32(10), 0x0000000f))
        return 16
    else
        throw(UnsupportedFeatureException())
    end
end

Without inlining:

julia> jltype_test();
  39.989 milliseconds

With inlining:

julia> jltype_test();
  11.151 milliseconds

@timholy timholy added the performance Must go faster label Jul 15, 2015
@timholy
Copy link
Sponsor Member

timholy commented Jul 15, 2015

@vtjnash's proposal would help a lot with #9622.

@KristofferC
Copy link
Sponsor Member

This looks fixed now:

julia> @code_native f()
	.text
Filename: REPL[25]
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 1
	movq	$1, (%rdi)
	movq	$2, 8(%rdi)
	movq	$3, 16(%rdi)
	movq	$4, 24(%rdi)
	movq	$5, 32(%rdi)
	movq	$6, 40(%rdi)
	movq	$7, 48(%rdi)
	movq	$8, 56(%rdi)
	movq	$9, 64(%rdi)
	movq	%rdi, %rax
	popq	%rbp
	retq
julia> @code_native h5type(Float32)
	.text
Filename: REPL[30]
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 1
	movb	$0, (%rdi)
	movb	$32, 1(%rdi)
	movb	$31, 2(%rdi)
	movb	$0, 3(%rdi)
	movl	$4, 4(%rdi)
	movw	$0, 8(%rdi)
	movw	$32, 10(%rdi)
	movb	$23, 12(%rdi)
	movb	$8, 13(%rdi)
	movb	$0, 14(%rdi)
	movb	$23, 15(%rdi)
	movl	$127, 16(%rdi)
	movq	%rdi, %rax
	popq	%rbp
	retq
	nopw	%cs:(%rax,%rax)
julia> f() = h5type(Float32) == h5type(Float16)
f (generic function with 1 method)

julia> @code_native f()
	.text
Filename: REPL[15]
	pushq	%rbp
	movq	%rsp, %rbp
Source line: 1
	xorl	%eax, %eax
	popq	%rbp
	retq
	nopl	(%rax,%rax)

@tkelman tkelman added the potential benchmark Could make a good benchmark in BaseBenchmarks label Jan 22, 2017
@KristofferC KristofferC removed the potential benchmark Could make a good benchmark in BaseBenchmarks label Oct 8, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
compiler:codegen Generation of LLVM IR and native code performance Must go faster
Projects
None yet
Development

No branches or pull requests

7 participants