-
Notifications
You must be signed in to change notification settings - Fork 132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
exp(1.0) not correctly rounded #24
Comments
I thought that this could be easily fixed by tweaking the coefficients, but no success at this time. I will continue trying, but this could take time. |
I tried too but couldn't get any better... BTW I have benchmarked sleef's exp function to openlibm/musm version the sleef version is about 2x slower for small arguments since there is no explicit branch here See: julia> @benchmark Base.exp(0.5)
BenchmarkTools.Trial:
--------------
minimum time: 9.475 ns (0.00% GC)
median time: 10.264 ns (0.00% GC)
mean time: 11.109 ns (0.00% GC)
maximum time: 148.034 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(0.5)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 14.211 ns (0.00% GC)
mean time: 14.394 ns (0.00% GC)
maximum time: 52.108 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(100.0)
BenchmarkTools.Trial:
--------------
minimum time: 11.842 ns (0.00% GC)
median time: 12.238 ns (0.00% GC)
mean time: 13.429 ns (0.00% GC)
maximum time: 187.510 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(100.0)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 13.027 ns (0.00% GC)
mean time: 14.074 ns (0.00% GC)
maximum time: 67.504 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(10.0)
BenchmarkTools.Trial:
--------------
minimum time: 11.842 ns (0.00% GC)
median time: 12.238 ns (0.00% GC)
mean time: 13.005 ns (0.00% GC)
maximum time: 60.003 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(10.0)
BenchmarkTools.Trial:
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 14.211 ns (0.00% GC)
mean time: 14.620 ns (0.00% GC)
maximum time: 115.270 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Base.exp(0.001)
BenchmarkTools.Trial:
--------------
minimum time: 7.105 ns (0.00% GC)
median time: 7.501 ns (0.00% GC)
mean time: 8.228 ns (0.00% GC)
maximum time: 72.635 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> @benchmark Sleef.exp(0.001)
--------------
minimum time: 12.632 ns (0.00% GC)
median time: 13.027 ns (0.00% GC)
mean time: 14.288 ns (0.00% GC)
maximum time: 172.904 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000 |
For computation speed, I am only checking the AVX2 version. It's two times as fast as the pure C version. |
This is he native code I am getting for the exp function from sleef pushq %rbp
movq %rsp, %rbp
movabsq $252195272, %rax # imm = 0xF0831C8
vucomisd (%rax), %xmm0
ja L358
vxorpd %xmm1, %xmm1, %xmm1
movabsq $252195280, %rax # imm = 0xF0831D0
vmovsd (%rax), %xmm2 # xmm2 = mem[0],zero
vucomisd %xmm0, %xmm2
ja L372
movabsq $252195288, %rax # imm = 0xF0831D8
vmulsd (%rax), %xmm0, %xmm1
vroundsd $4, %xmm1, %xmm1, %xmm1
vcvttsd2si %xmm1, %rax
vxorps %xmm1, %xmm1, %xmm1
vcvtsi2sdq %rax, %xmm1, %xmm1
movabsq $252195296, %rcx # imm = 0xF0831E0
vfmadd231sd (%rcx), %xmm1, %xmm0
movabsq $252195304, %rcx # imm = 0xF0831E8
vfmadd231sd (%rcx), %xmm1, %xmm0
movabsq $252195312, %rcx # imm = 0xF0831F0
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movabsq $252195320, %rcx # imm = 0xF0831F8
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195328, %rcx # imm = 0xF083200
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195336, %rcx # imm = 0xF083208
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195344, %rcx # imm = 0xF083210
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195352, %rcx # imm = 0xF083218
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195360, %rcx # imm = 0xF083220
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195368, %rcx # imm = 0xF083228
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195376, %rcx # imm = 0xF083230
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195384, %rcx # imm = 0xF083238
vfmadd213sd (%rcx), %xmm0, %xmm1
movabsq $252195392, %rcx # imm = 0xF083240
vfmadd213sd (%rcx), %xmm0, %xmm1
vmulsd %xmm0, %xmm0, %xmm2
vmulsd %xmm1, %xmm2, %xmm1
vaddsd %xmm1, %xmm0, %xmm0
movabsq $252195400, %rcx # imm = 0xF083248
vaddsd (%rcx), %xmm0, %xmm0
movq %rax, %rcx
shrq %rcx
subl %ecx, %eax
shlq $52, %rcx
movabsq $4607182418800017408, %rdx # imm = 0x3FF0000000000000
addq %rdx, %rcx
vmovq %rcx, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
shlq $52, %rax
addq %rdx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
L358:
movabsq $252195264, %rax # imm = 0xF0831C0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
L372:
vmovapd %xmm1, %xmm0
popq %rbp
retq
nopw (%rax,%rax)
note the vectorized instructions |
here is the msum version (also note the vectorized instructions) pushq %rbp
movq %rsp, %rbp
vmovq %xmm0, %rax
movb $63, %cl
bzhiq %rcx, %rax, %rdx
movq %rax, %r8
shrq $63, %r8
movabsq $4649454530587146736, %rcx # imm = 0x40862E42FEFA39F0
cmpq %rcx, %rdx
jb L106
movq %rdx, %rcx
shrq $52, %rcx
cmpq $2047, %rcx # imm = 0x7FF
jae L286
movabsq $252190152, %rax # imm = 0xF081DC8
vucomisd (%rax), %xmm0
ja L417
vxorpd %xmm1, %xmm1, %xmm1
movabsq $252190160, %rax # imm = 0xF081DD0
vmovsd (%rax), %xmm2 # xmm2 = mem[0],zero
vucomisd %xmm0, %xmm2
ja L506
L106:
movabsq $252190176, %rax # imm = 0xF081DE0
vucomisd (%rax), %xmm0
jne L128
jnp L324
L128:
movabsq $4599914934686071280, %rcx # imm = 0x3FD62E42FEFA39F0
cmpq %rcx, %rdx
jae L343
shrq $52, %rdx
cmpq $994, %rdx # imm = 0x3E2
jbe L444
vmulsd %xmm0, %xmm0, %xmm1
movabsq $252190224, %rcx # imm = 0xF081E10
vmovsd dcabs164_(%rcx), %xmm2 # xmm2 = mem[0],zero
movabsq $252190232, %rcx # imm = 0xF081E18
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190240, %rcx # imm = 0xF081E20
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190248, %rcx # imm = 0xF081E28
vfmadd213sd (%rcx), %xmm1, %xmm2
movabsq $252190256, %rcx # imm = 0xF081E30
vfmadd213sd (%rcx), %xmm1, %xmm2
vmulsd %xmm2, %xmm1, %xmm1
vsubsd %xmm1, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm2
movabsq $252190288, %rcx # imm = 0xF081E50
vaddsd (%rcx), %xmm1, %xmm1
vdivsd %xmm1, %xmm2, %xmm1
vsubsd %xmm0, %xmm1, %xmm0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
L286:
movabsq $4503599627370495, %rcx # imm = 0xFFFFFFFFFFFFF
testq %rcx, %rax
je L433
movabsq $252190136, %rax # imm = 0xF081DB8
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L324:
movabsq $252190168, %rax # imm = 0xF081DD8
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L343:
movabsq $4607361305248770930, %rcx # imm = 0x3FF0A2B23F3BAB72
cmpq %rcx, %rdx
jbe L450
movabsq $252190216, %rcx # imm = 0xF081E08
vmulsd (%rcx), %xmm0, %xmm1
vroundsd $4, %xmm1, %xmm1, %xmm1
vcvttsd2si %xmm1, %rcx
movabsq $252190200, %rdx # imm = 0xF081DF8
vfmadd231sd (%rdx), %xmm1, %xmm0
movabsq $252190208, %rdx # imm = 0xF081E00
vmulsd (%rdx), %xmm1, %xmm1
jmp L545
L417:
movabsq $252190144, %rax # imm = 0xF081DC0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
jmp L506
L433:
testq %r8, %r8
je L492
vxorpd %xmm1, %xmm1, %xmm1
jmp L506
L444:
vaddsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L450:
testq %r8, %r8
je L512
movabsq $252190184, %rcx # imm = 0xF081DE8
vaddsd (%rcx), %xmm0, %xmm0
movq $-1, %rcx
movabsq $252190192, %rdx # imm = 0xF081DF0
vmovsd (%rdx), %xmm1 # xmm1 = mem[0],zero
jmp L545
L492:
movabsq $252190144, %rax # imm = 0xF081DC0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
L506:
vmovapd %xmm1, %xmm0
popq %rbp
retq
L512:
movabsq $252190200, %rcx # imm = 0xF081DF8
vaddsd (%rcx), %xmm0, %xmm0
movabsq $252190208, %rcx # imm = 0xF081E00
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
movl $1, %ecx
L545:
vsubsd %xmm1, %xmm0, %xmm2
vmulsd %xmm2, %xmm2, %xmm3
movabsq $252190224, %rdx # imm = 0xF081E10
vmovsd (%rdx), %xmm4 # xmm4 = mem[0],zero
movabsq $252190232, %rdx # imm = 0xF081E18
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190240, %rdx # imm = 0xF081E20
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190248, %rdx # imm = 0xF081E28
vfmadd213sd (%rdx), %xmm3, %xmm4
movabsq $252190256, %rdx # imm = 0xF081E30
vfmadd213sd (%rdx), %xmm3, %xmm4
vmulsd %xmm4, %xmm3, %xmm3
vsubsd %xmm3, %xmm2, %xmm3
vmulsd %xmm3, %xmm2, %xmm2
movabsq $252190264, %rdx # imm = 0xF081E38
vmovsd (%rdx), %xmm4 # xmm4 = mem[0],zero
vsubsd %xmm3, %xmm4, %xmm3
vdivsd %xmm3, %xmm2, %xmm2
vsubsd %xmm2, %xmm1, %xmm1
vsubsd %xmm0, %xmm1, %xmm0
vmovsd (%rax), %xmm1 # xmm1 = mem[0],zero
vsubsd %xmm0, %xmm1, %xmm0
cmpq $-51, %rcx
jge L725
shlq $52, %rcx
movabsq $4845873199050653696, %rax # imm = 0x4340000000000000
addq %rcx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
movabsq $252190280, %rax # imm = 0xF081E48
vmulsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L725:
cmpq $1024, %rcx # imm = 0x400
jne L754
vaddsd %xmm0, %xmm0, %xmm0
movabsq $252190272, %rax # imm = 0xF081E40
vmulsd (%rax), %xmm0, %xmm0
popq %rbp
retq
L754:
shlq $52, %rcx
movabsq $4607182418800017408, %rax # imm = 0x3FF0000000000000
addq %rcx, %rax
vmovq %rax, %xmm1
vmulsd %xmm0, %xmm1, %xmm0
popq %rbp
retq
nop |
Please compare with the AVX2 implementation. The pure C version is not much optimized. The primary reason I am providing the pure C implementation is for better understanding of the algorithm. |
For the value of exp(1.0), it is hard due to the following reason. exp can be calculated with 1 ulp accuracy without DD calculation, and this is because the last three coefficients are 1, 1, and 0.5. However, if I try to move the calculated value of exp(1.0) to the correctly rounded value, I need to change those three coefficients slightly. This worsens the accuracy, and it seems hard to achieve 1 ulp accuracy in this case. |
fma only. EXPORT CONST double xexp(double d) { s = mla(q, -L2U, d); u = +0.2081276378237164457e-8; u = ldexp2k(u, q); if (d > 709.78271114955742909217217426) u = INFINITY; return u; |
nice, will we see this in the next update? |
It is already incorporated in the SIMD version. I'm still thinking if it is better to add it to the pure C version since I will have problems in testing it. |
is this fixed in the pure C version? |
It is fixed if you use purecfma version which is not yet available in the released version. |
I think this can be closed now |
Yes |
For double precision sleef gives
2.7182818284590455
instead of
2.718281828459045
despite the fact that the sleef answer is within 1 ulp, since it is such an important number it would be good that exp(1.0) was correctly rounded
The text was updated successfully, but these errors were encountered: