extern "platform-intrinsics" float functions often call libm #76

workingjubilee · 2021-02-17T04:04:45Z

I tried this code (Godbolt):

Rust Code

#![no_std]
#![allow(non_camel_case_types)]
#![feature(repr_simd, platform_intrinsics)]

/// Four `f32` fields declared as one SIMD vector through `#[repr(simd)]`.
///
/// This is the only type the reproduction passes to the intrinsics declared
/// below. The lower-case name is why the crate carries
/// `#![allow(non_camel_case_types)]`.
#[repr(simd)]
#[derive(Debug)]
pub struct f32x4(f32, f32, f32, f32);

// Declarations of the float SIMD intrinsics under test, imported through the
// `platform-intrinsic` ABI (gated by `#![feature(platform_intrinsics)]` above).
//
// Every declaration is generic over `T`; this reproduction only ever calls
// them with `f32x4`.
// NOTE(review): presumably `T` must be a `#[repr(simd)]` float vector —
// confirm against the compiler's intrinsic type checks.
extern "platform-intrinsic" {
    // Called by `f32x4::sqrt`.
    fn simd_fsqrt<T>(x: T) -> T;
    // Called by `f32x4::abs`.
    fn simd_fabs<T>(x: T) -> T;
    // Called by `f32x4::sin`.
    fn simd_fsin<T>(x: T) -> T;
    // Called by `f32x4::cos`.
    fn simd_fcos<T>(x: T) -> T;
    // Called by `f32x4::ceil`.
    fn simd_ceil<T>(x: T) -> T;
    // Called by `f32x4::fexp`.
    fn simd_fexp<T>(x: T) -> T;
    // Called by `f32x4::fexp2`.
    fn simd_fexp2<T>(x: T) -> T;
    // Called by `f32x4::floor`.
    fn simd_floor<T>(x: T) -> T;
    // Three-operand intrinsic; called by `f32x4::mul_add`.
    fn simd_fma<T>(x: T, y: T, z: T) -> T;
    // Called by `f32x4::flog`.
    fn simd_flog<T>(x: T) -> T;
    // Called by `f32x4::flog10`.
    fn simd_flog10<T>(x: T) -> T;
    // Called by `f32x4::flog2`.
    fn simd_flog2<T>(x: T) -> T;
    // Both the base and the exponent are vectors; called by `f32x4::powf`.
    fn simd_fpow<T>(x: T, y: T) -> T;
    // The exponent is a single scalar `i32`, not a vector; called by `f32x4::powi`.
    fn simd_fpowi<T>(x: T, y: i32) -> T;
    // Called by `f32x4::trunc`.
    fn simd_trunc<T>(x: T) -> T;
    // Called by `f32x4::round`.
    fn simd_round<T>(x: T) -> T;
}

// Thin public wrappers: each method forwards `self` (and any extra operands)
// to exactly one intrinsic and returns the result unchanged. They exist so the
// compiler emits one standalone symbol per intrinsic; the assembly listings
// that follow in this report are the output for these methods, so the names
// and order here correspond to the `example::f32x4::*` symbols there.
//
// NOTE(review): the `unsafe` blocks are needed because the intrinsics are
// foreign declarations. Presumably passing a float SIMD vector is the only
// requirement for the calls to be sound — confirm.
impl f32x4 {
    // Rounding
    /// Forwards to `simd_ceil`.
    pub fn ceil(self) -> Self {
        unsafe { simd_ceil(self) }
    }
    /// Forwards to `simd_floor`.
    pub fn floor(self) -> Self {
        unsafe { simd_floor(self) }
    }
    /// Forwards to `simd_round`.
    pub fn round(self) -> Self {
        unsafe { simd_round(self) }
    }
    /// Forwards to `simd_trunc`.
    pub fn trunc(self) -> Self {
        unsafe { simd_trunc(self) }
    }

    // Arithmetic
    /// Forwards `self`, `y` and `z`, in that order, to `simd_fma`.
    pub fn mul_add(self, y: Self, z: Self) -> Self {
        unsafe { simd_fma(self, y, z) }
    }
    /// Forwards to `simd_fabs`.
    pub fn abs(self) -> Self {
        unsafe { simd_fabs(self) }
    }
    /// Forwards to `simd_fsqrt`.
    pub fn sqrt(self) -> Self {
        unsafe { simd_fsqrt(self) }
    }
    /// Forwards to `simd_fpowi`; `exp` is one scalar `i32` passed alongside the vector.
    pub fn powi(self, exp: i32) -> Self {
        unsafe { simd_fpowi(self, exp) }
    }
    /// Forwards to `simd_fpow`; `exp` is itself a vector.
    pub fn powf(self, exp: Self) -> Self {
        unsafe { simd_fpow(self, exp) }
    }

    // Logarithms and exponentials (these methods keep the intrinsics' `f` prefix)
    /// Forwards to `simd_flog2`.
    pub fn flog2(self) -> Self {
        unsafe { simd_flog2(self) }
    }
    /// Forwards to `simd_flog10`.
    pub fn flog10(self) -> Self {
        unsafe { simd_flog10(self) }
    }
    /// Forwards to `simd_flog`.
    pub fn flog(self) -> Self {
        unsafe { simd_flog(self) }
    }
    /// Forwards to `simd_fexp`.
    pub fn fexp(self) -> Self {
        unsafe { simd_fexp(self) }
    }
    /// Forwards to `simd_fexp2`.
    pub fn fexp2(self) -> Self {
        unsafe { simd_fexp2(self) }
    }

    // Trigonometry
    /// Forwards to `simd_fcos`.
    pub fn cos(self) -> Self {
        unsafe { simd_fcos(self) }
    }
    /// Forwards to `simd_fsin`.
    pub fn sin(self) -> Self {
        unsafe { simd_fsin(self) }
    }
}

I expected to see this happen: Compilations to "pure assembly".

Instead, this happened: Mostly compiled to calls to libm!

When sufficient vector features are enabled, these do compile to vectorized assembly instructions. However, the problem is that compilation without those features enabled means code that depends on libm... which is not allowed in core. We are going to have to either solve this or push our implementation of SimdF32 and SimdF64 mostly into std, not core.

Notable winners on x64: simd_fsqrt, simd_fabs become vector instructions just fine. I'm worried about them on x86_32 or Arm architectures, though.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (d1206f950 2021-02-15)
binary: rustc
commit-hash: d1206f950ffb76c76e1b74a19ae33c2b7d949454
commit-date: 2021-02-15
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1

x86 Assembly

<&T as core::fmt::Debug>::fmt:
        movq    (%rdi), %rdi
        jmpq    *_ZN4core3fmt5float50_$LT$impl$u20$core..fmt..Debug$u20$for$u20$f32$GT$3fmt17hf2084266ae57b528E@GOTPCREL(%rip)

core::ptr::drop_in_place<&f32>:
        retq

example::f32x4::ceil:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    ceilf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::floor:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    floorf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::round:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    roundf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::trunc:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    truncf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::mul_add:
        movaps  (%rsi), %xmm0
        mulps   (%rdx), %xmm0
        movq    %rdi, %rax
        addps   (%rcx), %xmm0
        movaps  %xmm0, (%rdi)
        retq

.LCPI7_0:
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
example::f32x4::abs:
        movq    %rdi, %rax
        movaps  (%rsi), %xmm0
        andps   .LCPI7_0(%rip), %xmm0
        movaps  %xmm0, (%rdi)
        retq

.LCPI8_0:
        .long   0xbf000000
        .long   0xbf000000
        .long   0xbf000000
        .long   0xbf000000
.LCPI8_1:
        .long   0xc0400000
        .long   0xc0400000
        .long   0xc0400000
        .long   0xc0400000
.LCPI8_2:
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
.LCPI8_3:
        .long   0x00800000
        .long   0x00800000
        .long   0x00800000
        .long   0x00800000
example::f32x4::sqrt:
        movaps  (%rsi), %xmm0
        rsqrtps %xmm0, %xmm1
        movaps  %xmm0, %xmm2
        mulps   %xmm1, %xmm2
        movaps  .LCPI8_0(%rip), %xmm3
        mulps   %xmm2, %xmm3
        mulps   %xmm1, %xmm2
        addps   .LCPI8_1(%rip), %xmm2
        movq    %rdi, %rax
        mulps   %xmm3, %xmm2
        andps   .LCPI8_2(%rip), %xmm0
        movaps  .LCPI8_3(%rip), %xmm1
        cmpleps %xmm0, %xmm1
        andps   %xmm2, %xmm1
        movaps  %xmm1, (%rdi)
        retq

example::f32x4::powi:
        pushq   %rbp
        pushq   %r14
        pushq   %rbx
        subq    $48, %rsp
        movl    %edx, %ebp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    __powisf2@GOTPCREL(%rip), %rbx
        movl    %edx, %edi
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $48, %rsp
        popq    %rbx
        popq    %r14
        popq    %rbp
        retq

example::f32x4::powf:
        pushq   %r14
        pushq   %rbx
        subq    $72, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 32(%rsp)
        movaps  (%rdx), %xmm1
        movaps  %xmm1, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        shufps  $255, %xmm1, %xmm1
        movq    powf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  32(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        movhlps %xmm1, %xmm1
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  32(%rsp), %xmm0
        movaps  16(%rsp), %xmm1
        callq   *%rbx
        movaps  %xmm0, 48(%rsp)
        movaps  32(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        shufps  $85, %xmm1, %xmm1
        callq   *%rbx
        movaps  48(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $72, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog2:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    log2f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog10:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    log10f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    logf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::fexp:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    expf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::fexp2:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    exp2f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::cos:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    cosf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::sin:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    sinf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

<example::f32x4 as core::fmt::Debug>::fmt:
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbx
        subq    $40, %rsp
        movq    %rdi, %rbx
        leaq    4(%rdi), %r12
        leaq    8(%rdi), %r13
        leaq    12(%rdi), %r15
        leaq    .L__unnamed_1(%rip), %rdx
        leaq    16(%rsp), %r14
        movl    $5, %ecx
        movq    %r14, %rdi
        callq   *core::fmt::Formatter::debug_tuple@GOTPCREL(%rip)
        movq    %rbx, 8(%rsp)
        leaq    .L__unnamed_2(%rip), %rbx
        movq    core::fmt::builders::DebugTuple::field@GOTPCREL(%rip), %rbp
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r12, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r13, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r15, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r14, %rdi
        callq   *core::fmt::builders::DebugTuple::finish@GOTPCREL(%rip)
        addq    $40, %rsp
        popq    %rbx
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbp
        retq

.L__unnamed_1:
        .ascii  "f32x4"

.L__unnamed_2:
        .quad   core::ptr::drop_in_place<&f32>
        .quad   8
        .quad   8
        .quad   <&T as core::fmt::Debug>::fmt

AArch64 Assembly

<&T as core::fmt::Debug>::fmt:
        ldr     x0, [x0]
        b       _ZN4core3fmt5float50_$LT$impl$u20$core..fmt..Debug$u20$for$u20$f32$GT$3fmt17h68f66863527610f0E

core::ptr::drop_in_place<&f32>:
        ret

example::f32x4::ceil:
        ldr     q0, [x0]
        frintp  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::floor:
        ldr     q0, [x0]
        frintm  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::round:
        ldr     q0, [x0]
        frinta  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::trunc:
        ldr     q0, [x0]
        frintz  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::mul_add:
        ldr     q0, [x0]
        ldr     q1, [x1]
        ldr     q2, [x2]
        fmla    v2.4s, v1.4s, v0.4s
        str     q2, [x8]
        ret

example::f32x4::abs:
        ldr     q0, [x0]
        fabs    v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::sqrt:
        ldr     q0, [x0]
        fsqrt   v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::powi:
        sub     sp, sp, #64
        str     x30, [sp, #32]
        stp     x20, x19, [sp, #48]
        ldr     q0, [x0]
        mov     w0, w1
        mov     w19, w1
        mov     x20, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      __powisf2
        str     d0, [sp]
        ldr     q0, [sp, #16]
        mov     w0, w19
        bl      __powisf2
        ldr     q1, [sp]
        mov     w0, w19
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      __powisf2
        ldr     q1, [sp]
        mov     w0, w19
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      __powisf2
        ldr     q1, [sp]
        ldr     x30, [sp, #32]
        mov     v1.s[3], v0.s[0]
        str     q1, [x20]
        ldp     x20, x19, [sp, #48]
        add     sp, sp, #64
        ret

example::f32x4::powf:
        sub     sp, sp, #64
        stp     x30, x19, [sp, #48]
        ldr     q0, [x0]
        ldr     q1, [x1]
        mov     x19, x8
        stp     q1, q0, [sp, #16]
        mov     s0, v0.s[1]
        mov     s1, v1.s[1]
        bl      powf
        str     d0, [sp]
        ldp     q1, q0, [sp, #16]
        bl      powf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldp     q1, q0, [sp, #16]
        mov     s0, v0.s[2]
        mov     s1, v1.s[2]
        bl      powf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        str     q1, [sp]
        ldp     q1, q0, [sp, #16]
        mov     s0, v0.s[3]
        mov     s1, v1.s[3]
        bl      powf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #48]
        add     sp, sp, #64
        ret

example::f32x4::flog2:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      log2f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      log2f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      log2f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      log2f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::flog10:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      log10f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      log10f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      log10f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      log10f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::flog:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      logf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      logf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      logf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      logf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::fexp:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      expf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      expf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      expf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      expf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::fexp2:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      exp2f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      exp2f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      exp2f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      exp2f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::cos:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      cosf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      cosf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      cosf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      cosf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::sin:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      sinf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      sinf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      sinf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      sinf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

<example::f32x4 as core::fmt::Debug>::fmt:
        sub     sp, sp, #80
        str     x30, [sp, #32]
        stp     x22, x21, [sp, #48]
        stp     x20, x19, [sp, #64]
        mov     x9, x1
        adrp    x1, .L__unnamed_1
        mov     x19, x0
        add     x20, x0, #4
        add     x21, x0, #8
        add     x22, x0, #12
        add     x1, x1, :lo12:.L__unnamed_1
        add     x8, sp, #8
        mov     w2, #5
        mov     x0, x9
        bl      core::fmt::Formatter::debug_tuple
        str     x19, [sp, #40]
        adrp    x19, .L__unnamed_2
        add     x19, x19, :lo12:.L__unnamed_2
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x20, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x21, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x22, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        bl      core::fmt::builders::DebugTuple::finish
        ldp     x20, x19, [sp, #64]
        ldp     x22, x21, [sp, #48]
        ldr     x30, [sp, #32]
        add     sp, sp, #80
        ret

.L__unnamed_1:
        .ascii  "f32x4"

.L__unnamed_2:
        .xword  core::ptr::drop_in_place<&f32>
        .xword  8
        .xword  8
        .xword  <&T as core::fmt::Debug>::fmt

The text was updated successfully, but these errors were encountered:

Lokathor · 2021-02-17T04:30:37Z

(I wonder if any of the libm maintainers have an opinion on this.)

Hi, I'm [accidentally] a libm maintainer [with absolutely no special training], how can I help?

Well first of all, sqrt is available in hardware on ARM, so there's that.
Some of these can be done by hand on our end of things.
In fact all of them we could probably be doing ourselves.

Basically, floats have been kept out of core for ages, and I'm increasingly sick of it.

The only reason that we don't just put more libm stuff directly into core is that it might be less optimal than the system's local llibm, so we want to try to link to the system's libm when it's available.

However, none of libm is SIMD aware in the first place. The local libm will never beat our code because it can't even do the operation. So in the case of SIMD we can just always use our version of a given function. most of the time Because if there's an op we can't implement ourselves for all platforms then that'll be trouble.

workingjubilee · 2021-02-22T02:26:19Z

A lot of discussion happened about this. We concluded this is not easily fixable in one go.

Some things that came up:

- Patching LLVM's codegen to use SIMD even when there's not an "obvious" 1-step instruction to use
- Using LLVM's support for a libmvec, but writing ours in Rust
- Multiversioning std
- Shipping multiple compiled stdlibs
- Finishing and using -Zbuild-std

rust-lang/rust#64609 is one of the partial blockers to many solutions.

workingjubilee · 2021-04-17T02:07:50Z

I have updated this with a recent version.

Most dramatic change:

diff --git a/src/unopt/simd_libm.asm b/src/unopt/simd_libm.asm
index dc84af2..91ad1d5 100644
--- a/src/unopt/simd_libm.asm
+++ b/src/unopt/simd_libm.asm
@@ -1,9 +1,9 @@
-; 2021-02-16
+; 2021-04-16
 ; https://github.com/rust-lang/stdsimd/issues/76
 
 example::f32x4::mul_add:
-        pushq   %r14
-        pushq   %rbx
-        subq    $88, %rsp
-        movq    %rdi, %r14
         movaps  (%rsi), %xmm0
-        movaps  %xmm0, 48(%rsp)
-        movaps  (%rdx), %xmm1
-        movaps  %xmm1, 16(%rsp)
-        movaps  (%rcx), %xmm3
-        movaps  %xmm3, 32(%rsp)
-        shufps  $231, %xmm0, %xmm0
-        shufps  $231, %xmm1, %xmm1
-        movaps  %xmm3, %xmm2
-        shufps  $231, %xmm3, %xmm2
-        movq    fmaf@GOTPCREL(%rip), %rbx
-        callq   *%rbx
-        movaps  %xmm0, (%rsp)
-        movaps  48(%rsp), %xmm0
-        movhlps %xmm0, %xmm0
-        movaps  16(%rsp), %xmm1
-        movhlps %xmm1, %xmm1
-        movaps  32(%rsp), %xmm2
-        movhlps %xmm2, %xmm2
-        callq   *%rbx
-        unpcklps        (%rsp), %xmm0
-        movaps  %xmm0, (%rsp)
-        movaps  48(%rsp), %xmm0
-        movaps  16(%rsp), %xmm1
-        movaps  32(%rsp), %xmm2
-        callq   *%rbx
-        movaps  %xmm0, 64(%rsp)
-        movaps  48(%rsp), %xmm0
-        shufps  $229, %xmm0, %xmm0
-        movaps  16(%rsp), %xmm1
-        shufps  $229, %xmm1, %xmm1
-        movaps  32(%rsp), %xmm2
-        shufps  $229, %xmm2, %xmm2
-        callq   *%rbx
-        movaps  64(%rsp), %xmm1
-        unpcklps        %xmm0, %xmm1
-        unpcklpd        (%rsp), %xmm1
-        movaps  %xmm1, (%r14)
-        movq    %r14, %rax
-        addq    $88, %rsp
-        popq    %rbx
-        popq    %r14
+        mulps   (%rdx), %xmm0
+        movq    %rdi, %rax
+        addps   (%rcx), %xmm0
+        movaps  %xmm0, (%rdi)
         retq

simd_fma has just disappeared, effectively.

workingjubilee · 2021-04-25T20:29:36Z

Regressed again, presumably due to rust-lang/rust#84274:

example::f32x4::mul_add:
        pushq   %r14
        pushq   %rbx
        subq    $88, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 48(%rsp)
        movaps  (%rdx), %xmm1
        movaps  %xmm1, 16(%rsp)
        movaps  (%rcx), %xmm3
        movaps  %xmm3, 32(%rsp)
        shufps  $255, %xmm0, %xmm0
        shufps  $255, %xmm1, %xmm1
        movaps  %xmm3, %xmm2
        shufps  $255, %xmm3, %xmm2
        movq    fmaf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  48(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        movhlps %xmm1, %xmm1
        movaps  32(%rsp), %xmm2
        movhlps %xmm2, %xmm2
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  48(%rsp), %xmm0
        movaps  16(%rsp), %xmm1
        movaps  32(%rsp), %xmm2
        callq   *%rbx
        movaps  %xmm0, 64(%rsp)
        movaps  48(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        shufps  $85, %xmm1, %xmm1
        movaps  32(%rsp), %xmm2
        shufps  $85, %xmm2, %xmm2
        callq   *%rbx
        movaps  64(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $88, %rsp
        popq    %rbx
        popq    %r14
        retq

workingjubilee added A-simd Area: SIMD. Put this on tracking issues to help with cross-repo issue organization C-bug Category: Bug A-LLVM Area: LLVM labels Feb 17, 2021

workingjubilee mentioned this issue Apr 17, 2021

Don't set fast-math for the SIMD operations we set it for previously rust-lang/rust#84274

Merged

calebzulawski mentioned this issue Apr 21, 2021

Support for Fused Multiply-Add (FMA) #102

Closed

workingjubilee added the E-needs-design Call for participation: Needs design. label May 3, 2021

miguelraz mentioned this issue May 18, 2021

add simd_fsqrt intrinsic #120

Merged

workingjubilee mentioned this issue Jun 9, 2021

Examples of bad Rust SIMD perf? #135

Open

workingjubilee mentioned this issue Dec 14, 2021

pub use std::simd::Float; rust-lang/rust#91891

Closed

workingjubilee mentioned this issue Dec 22, 2021

impl std::simd::StdFloat #219

Merged

workingjubilee mentioned this issue Mar 9, 2022

Clarify story on libm bindings rust-lang/rust#26350

Open

NamorNiradnug mentioned this issue May 3, 2024

Benchmark against simd math functions from core::intrinsics::simd NamorNiradnug/portable-simd-addons#3

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

extern "platform-intrinsics" float functions often call libm #76

extern "platform-intrinsics" float functions often call libm #76

workingjubilee commented Feb 17, 2021 •

edited

Loading

Lokathor commented Feb 17, 2021

workingjubilee commented Feb 22, 2021

workingjubilee commented Apr 17, 2021

workingjubilee commented Apr 25, 2021 •

edited

Loading

extern "platform-intrinsics" float functions often call libm #76

extern "platform-intrinsics" float functions often call libm #76

Comments

workingjubilee commented Feb 17, 2021 • edited Loading

Meta

Lokathor commented Feb 17, 2021

workingjubilee commented Feb 22, 2021

workingjubilee commented Apr 17, 2021

workingjubilee commented Apr 25, 2021 • edited Loading

workingjubilee commented Feb 17, 2021 •

edited

Loading

workingjubilee commented Apr 25, 2021 •

edited

Loading