Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Portable math functions don't get inlined #523

Closed
CryZe opened this issue Jul 13, 2018 · 9 comments
Closed

Portable math functions don't get inlined #523

CryZe opened this issue Jul 13, 2018 · 9 comments
Labels

Comments

@CryZe
Copy link
Contributor

CryZe commented Jul 13, 2018

#![feature(stdsimd)]

use std::simd::f32x2;

pub fn fma(a: f32x2, b: f32x2, c: f32x2) -> f32x2 {
    a.fma(b, c)
}

pub fn abs(a: f32x2) -> f32x2 {
    a.abs()
}

pub fn sin(a: f32x2) -> f32x2 {
    a.sin()
}

pub fn sqrt(a: f32x2) -> f32x2 {
    a.sqrt()
}

Gets compiled to this:

example::fma:
  push rbx
  sub rsp, 32
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov rdx, qword ptr [rdx]
  mov rcx, qword ptr [rcx]
  mov qword ptr [rsp + 8], rax
  mov qword ptr [rsp + 16], rdx
  mov qword ptr [rsp + 24], rcx
  lea rsi, [rsp + 8]
  lea rdx, [rsp + 16]
  lea rcx, [rsp + 24]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::fma::FloatFma>::fma@PLT
  mov rax, rbx
  add rsp, 32
  pop rbx
  ret

example::abs:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::abs::FloatAbs>::abs@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

example::sin:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::sin::FloatSin>::sin@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

example::sqrt:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::sqrt::FloatSqrt>::sqrt@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

Godbolt

I don't think this is intended.

@lu-zero
Copy link
Contributor

lu-zero commented Jul 13, 2018

Looks like #449 is not limited to PowerPC.

@lu-zero
Copy link
Contributor

lu-zero commented Jul 13, 2018

26edfc2 might help (extend it to the other instruction sets)

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 16, 2018

This (https://godbolt.org/g/SQnNfN)

#![feature(stdsimd)]

use std::simd::f32x2;
pub fn sin(a: f32x2) -> f32x2 {
    a.sin()
}

produces the following LLVM IR:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example3sin17h9c036eb2ab1eadaaE(<2 x float>* noalias nocapture sret dereferenceable(8), <2 x float>* noalias nocapture readonly dereferenceable(8) %a) unnamed_addr #0 {
  %arg.i = alloca <2 x float>, align 8
  %1 = bitcast <2 x float>* %a to i64*
  %2 = load i64, i64* %1, align 8
  %3 = bitcast <2 x float>* %arg.i to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3)
  %4 = bitcast <2 x float>* %arg.i to i64*
  store i64 %2, i64* %4, align 8, !noalias !0
  call void @"_ZN97_$LT$core..coresimd..ppsv..v64..f32x2$u20$as$u20$core..coresimd..ppsv..codegen..sin..FloatSin$GT$3sin17hae53fde8f08ed2edE"(<2 x float>* noalias nocapture nonnull sret dereferenceable(8) %0, <2 x float>* noalias nocapture nonnull dereferenceable(8) %arg.i) #2, !noalias !4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3)
  ret void
}

declare void @"_ZN97_$LT$core..coresimd..ppsv..v64..f32x2$u20$as$u20$core..coresimd..ppsv..codegen..sin..FloatSin$GT$3sin17hae53fde8f08ed2edE"(<2 x float>* noalias nocapture sret dereferenceable(8), <2 x float>* noalias nocapture dereferenceable(8)) unnamed_addr #0

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

attributes #0 = { nounwind "probe-stack"="__rust_probestack" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { nounwind }

!0 = !{!1, !3}
!1 = distinct !{!1, !2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE: argument 0"}
!2 = distinct !{!2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE"}
!3 = distinct !{!3, !2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE: %self"}
!4 = !{!3}

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 16, 2018

cc @rkruppe @alexcrichton any idea why this might not be getting inlined?

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 16, 2018

Probably because we are missing #[inline] here: https://github.com/rust-lang-nursery/stdsimd/blob/master/coresimd/ppsv/codegen/sin.rs#L52 (and in similar places). Still.. that godbolt link is using -C lto=fat... this should get inlined even if its not marked with #[inline]...

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 16, 2018

@CryZe can you try to reproduce with https://github.com/gnzlbg/ppv upstream? The problem should be fixed there.

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 16, 2018

FWIW, there is a new clippy lint (https://rust-lang-nursery.github.io/rust-clippy/master/index.html#missing_inline_in_public_items) that detects whether a function, trait method impl, etc. is not marked with #[inline] or #[inline(always)], maybe we should enable it for stdsimd (and ppv) since most of the methods here should be marked with the attribute. That should prevent issues like this from re-appearing in the future.

@ralfbiedert
Copy link

ralfbiedert commented Jul 18, 2018

I noticed the same thing looking into some faster intrinsics, e.g., abs() (issue 51).

Currently ppsv::abs() and friends take precedence over faster's built-in math methods.

That leads to the paradoxical situation that scalar math (e.g., &f.iter().map(|v| v.abs()) is auto-vectorized and inlined:

     unsafe { intrinsics::fabsf32(self) } (libstd/f32.rs:170)
     andps   xmm1, xmm0
     andps   xmm2, xmm0

While explicitly vectorized code &f.simd_iter(f32s(0.0)).simd_map(|v| v.abs() ) contains method calls and is 8x slower:

     FloatAbs::abs(self) (libcore/../stdsimd/coresimd/ppsv/api/float_math.rs:10)
     movaps  xmmword, ptr, [rbp, -, 80], xmm0
     mov     rdi, r14
     mov     rsi, r15
     call    <core::coresimd::ppsv::v128::f32x4 as core::coresimd::ppsv::codegen::abs::FloatAbs>::abs

@gnzlbg
Copy link
Contributor

gnzlbg commented Jul 18, 2018

This has been fixed upstream.

@gnzlbg gnzlbg closed this as completed Jul 18, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

4 participants