Portable math functions don't get inlined #523

CryZe · 2018-07-13T00:47:57Z

#![feature(stdsimd)]

use std::simd::f32x2;

pub fn fma(a: f32x2, b: f32x2, c: f32x2) -> f32x2 {
    a.fma(b, c)
}

pub fn abs(a: f32x2) -> f32x2 {
    a.abs()
}

pub fn sin(a: f32x2) -> f32x2 {
    a.sin()
}

pub fn sqrt(a: f32x2) -> f32x2 {
    a.sqrt()
}

Gets compiled to this:

example::fma:
  push rbx
  sub rsp, 32
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov rdx, qword ptr [rdx]
  mov rcx, qword ptr [rcx]
  mov qword ptr [rsp + 8], rax
  mov qword ptr [rsp + 16], rdx
  mov qword ptr [rsp + 24], rcx
  lea rsi, [rsp + 8]
  lea rdx, [rsp + 16]
  lea rcx, [rsp + 24]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::fma::FloatFma>::fma@PLT
  mov rax, rbx
  add rsp, 32
  pop rbx
  ret

example::abs:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::abs::FloatAbs>::abs@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

example::sin:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::sin::FloatSin>::sin@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

example::sqrt:
  push rbx
  sub rsp, 16
  mov rbx, rdi
  mov rax, qword ptr [rsi]
  mov qword ptr [rsp + 8], rax
  lea rsi, [rsp + 8]
  call <core::coresimd::ppsv::v64::f32x2 as core::coresimd::ppsv::codegen::sqrt::FloatSqrt>::sqrt@PLT
  mov rax, rbx
  add rsp, 16
  pop rbx
  ret

Godbolt

I don't think this is intended.

lu-zero · 2018-07-13T01:11:49Z

Looks like #449 is not limited to PowerPC.

lu-zero · 2018-07-13T01:30:44Z

26edfc2 might help (extend it to the other instruction sets)

gnzlbg · 2018-07-16T15:19:43Z

This (https://godbolt.org/g/SQnNfN)

#![feature(stdsimd)]

use std::simd::f32x2;
pub fn sin(a: f32x2) -> f32x2 {
    a.sin()
}

produces the following LLVM IR:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @_ZN7example3sin17h9c036eb2ab1eadaaE(<2 x float>* noalias nocapture sret dereferenceable(8), <2 x float>* noalias nocapture readonly dereferenceable(8) %a) unnamed_addr #0 {
  %arg.i = alloca <2 x float>, align 8
  %1 = bitcast <2 x float>* %a to i64*
  %2 = load i64, i64* %1, align 8
  %3 = bitcast <2 x float>* %arg.i to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3)
  %4 = bitcast <2 x float>* %arg.i to i64*
  store i64 %2, i64* %4, align 8, !noalias !0
  call void @"_ZN97_$LT$core..coresimd..ppsv..v64..f32x2$u20$as$u20$core..coresimd..ppsv..codegen..sin..FloatSin$GT$3sin17hae53fde8f08ed2edE"(<2 x float>* noalias nocapture nonnull sret dereferenceable(8) %0, <2 x float>* noalias nocapture nonnull dereferenceable(8) %arg.i) #2, !noalias !4
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3)
  ret void
}

declare void @"_ZN97_$LT$core..coresimd..ppsv..v64..f32x2$u20$as$u20$core..coresimd..ppsv..codegen..sin..FloatSin$GT$3sin17hae53fde8f08ed2edE"(<2 x float>* noalias nocapture sret dereferenceable(8), <2 x float>* noalias nocapture dereferenceable(8)) unnamed_addr #0

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

attributes #0 = { nounwind "probe-stack"="__rust_probestack" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { nounwind }

!0 = !{!1, !3}
!1 = distinct !{!1, !2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE: argument 0"}
!2 = distinct !{!2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE"}
!3 = distinct !{!3, !2, !"_ZN4core8coresimd4ppsv3v645f32x23sin17h532ec7bf70ae1d2dE: %self"}
!4 = !{!3}

gnzlbg · 2018-07-16T15:20:25Z

cc @rkruppe @alexcrichton any idea why this might not be getting inlined?

gnzlbg · 2018-07-16T15:21:22Z

Probably because we are missing #[inline] here: https://github.com/rust-lang-nursery/stdsimd/blob/master/coresimd/ppsv/codegen/sin.rs#L52 (and in similar places). Still.. that godbolt link is using -C lto=fat... this should get inlined even if its not marked with #[inline]...

gnzlbg · 2018-07-16T15:22:35Z

@CryZe can you try to reproduce with https://github.com/gnzlbg/ppv upstream? The problem should be fixed there.

gnzlbg · 2018-07-16T15:24:23Z

FWIW, there is a new clippy lint (https://rust-lang-nursery.github.io/rust-clippy/master/index.html#missing_inline_in_public_items) that detects whether a function, trait method impl, etc. is not marked with #[inline] or #[inline(always)], maybe we should enable it for stdsimd (and ppv) since most of the methods here should be marked with the attribute. That should prevent issues like this from re-appearing in the future.

ralfbiedert · 2018-07-18T18:58:27Z

I noticed the same thing looking into some faster intrinsics, e.g., abs() (issue 51).

Currently ppsv::abs() and friends take precedence over faster's built-in math methods.

That leads to the paradoxical situation that scalar math (e.g., &f.iter().map(|v| v.abs()) is auto-vectorized and inlined:

     unsafe { intrinsics::fabsf32(self) } (libstd/f32.rs:170)
     andps   xmm1, xmm0
     andps   xmm2, xmm0

While explicitly vectorized code &f.simd_iter(f32s(0.0)).simd_map(|v| v.abs() ) contains method calls and is 8x slower:

     FloatAbs::abs(self) (libcore/../stdsimd/coresimd/ppsv/api/float_math.rs:10)
     movaps  xmmword, ptr, [rbp, -, 80], xmm0
     mov     rdi, r14
     mov     rsi, r15
     call    <core::coresimd::ppsv::v128::f32x4 as core::coresimd::ppsv::codegen::abs::FloatAbs>::abs

gnzlbg · 2018-07-18T23:54:44Z

This has been fixed upstream.

lu-zero added bug A-portable labels Jul 13, 2018

gnzlbg closed this as completed Jul 18, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Portable math functions don't get inlined #523

Portable math functions don't get inlined #523

CryZe commented Jul 13, 2018

lu-zero commented Jul 13, 2018

lu-zero commented Jul 13, 2018 •

edited

Loading

gnzlbg commented Jul 16, 2018

gnzlbg commented Jul 16, 2018

gnzlbg commented Jul 16, 2018 •

edited

Loading

gnzlbg commented Jul 16, 2018 •

edited

Loading

gnzlbg commented Jul 16, 2018

ralfbiedert commented Jul 18, 2018 •

edited

Loading

gnzlbg commented Jul 18, 2018

Portable math functions don't get inlined #523

Portable math functions don't get inlined #523

Comments

CryZe commented Jul 13, 2018

lu-zero commented Jul 13, 2018

lu-zero commented Jul 13, 2018 • edited Loading

gnzlbg commented Jul 16, 2018

gnzlbg commented Jul 16, 2018

gnzlbg commented Jul 16, 2018 • edited Loading

gnzlbg commented Jul 16, 2018 • edited Loading

gnzlbg commented Jul 16, 2018

ralfbiedert commented Jul 18, 2018 • edited Loading

gnzlbg commented Jul 18, 2018

lu-zero commented Jul 13, 2018 •

edited

Loading

gnzlbg commented Jul 16, 2018 •

edited

Loading

gnzlbg commented Jul 16, 2018 •

edited

Loading

ralfbiedert commented Jul 18, 2018 •

edited

Loading