Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arctan avx512 #759

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ install(
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
8 changes: 8 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ at the top, as a last resort.
<alignment>64</alignment>
</arch>

<arch name="avx512dq">
<check name="avx512dq"></check>
<flag compiler="gnu">-mavx512dq</flag>
<flag compiler="clang">-mavx512dq</flag>
<flag compiler="msvc">/arch:AVX512DQ</flag>
<alignment>64</alignment>
</arch>

<arch name="riscv64">
</arch>

Expand Down
5 changes: 5 additions & 0 deletions gen/machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,9 @@
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
</machine>

<!-- trailing | bar means generate without either for MSVC -->
<machine name="avx512dq">
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
</machine>

</grammar>
4 changes: 2 additions & 2 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
*/

/*
* This file is intended to hold AVX2 FMA intrinsics of intrinsics.
* This file is intended to hold AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

Expand All @@ -23,7 +23,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
64 changes: 64 additions & 0 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold AVX512 intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
{
const __m512i idx =
_mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
return _mm512_permutex2var_ps(z1, idx, z2);
}

static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
{
const __m512i idx =
_mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
return _mm512_permutex2var_ps(z1, idx, z2);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
*
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
{
const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);

const __m512 x_times_x = _mm512_mul_ps(x, x);
__m512 arctan;
arctan = a13;
arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm512_mul_ps(x, arctan);

return arctan;
}

#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
4 changes: 2 additions & 2 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
*/

/*
* This file is intended to hold AVX intrinsics of intrinsics.
* This file is intended to hold AVX intrinsics.
* They should be used in VOLK kernels to avoid copy-pasta.
*/

Expand All @@ -24,7 +24,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
134 changes: 103 additions & 31 deletions kernels/volk/volk_32f_atan_32f.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2014 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
* Copyright 2023, 2024 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand All @@ -13,19 +13,19 @@
*
* \b Overview
*
* Computes arcsine of input vector and stores results in output vector.
* Computes arctan of input vector and stores results in output vector.
*
* <b>Dispatcher Prototype</b>
* \code
* void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
* void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points)
* \endcode
*
* \b Inputs
* \li aVector: The input vector of floats.
* \li in_ptr: The input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
* \li bVector: The vector where results will be stored.
* \li out_ptr: The vector where results will be stored.
*
* \b Example
* Calculate common angles around the top half of the unit circle.
Expand Down Expand Up @@ -59,6 +59,64 @@
#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
for (; number < num_points; number++) {
*out++ = atanf(*in++);
}
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
for (; number < num_points; number++) {
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h>
static inline void
volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
{
const __m512 one = _mm512_set1_ps(1.f);
const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

unsigned int number = 0;
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
unsigned int sixteenth_points = num_points / 16;
for (; number < sixteenth_points; number++) {
__m512 x = _mm512_load_ps(in);
__mmask16 swap_mask =
_mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
_mm512_mask_blend_ps(swap_mask, one, x));
__m512 result = _mm512_arctan_poly_avx512(x_star);
jdemel marked this conversation as resolved.
Show resolved Hide resolved
__m512 term = _mm512_and_ps(x_star, sign_mask);
term = _mm512_or_ps(pi_over_2, term);
term = _mm512_sub_ps(term, result);
result = _mm512_mask_blend_ps(swap_mask, result, term);
_mm512_store_ps(out, result);
in += 16;
out += 16;
}

number = sixteenth_points * 16;
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
for (; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_AVX512F for aligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
Expand All @@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_poly_avx2_fma(x_star);
__m256 result = _mm256_arctan_poly_avx2_fma(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
Expand Down Expand Up @@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_poly_avx(x_star);
__m256 result = _mm256_arctan_poly_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
Expand Down Expand Up @@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h>
static inline void
volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved
{
const __m512 one = _mm512_set1_ps(1.f);
const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

unsigned int number = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please move this into the for loop.

unsigned int sixteenth_points = num_points / 16;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be const. At least that would make reading the code easier.

for (; number < sixteenth_points; number++) {
__m512 x = _mm512_loadu_ps(in);
__mmask16 swap_mask =
_mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
_mm512_mask_blend_ps(swap_mask, one, x));
__m512 result = _mm512_arctan_poly_avx512(x_star);
jdemel marked this conversation as resolved.
Show resolved Hide resolved
__m512 term = _mm512_and_ps(x_star, sign_mask);
term = _mm512_or_ps(pi_over_2, term);
term = _mm512_sub_ps(term, result);
result = _mm512_mask_blend_ps(swap_mask, result, term);
_mm512_storeu_ps(out, result);
in += 16;
out += 16;
}

number = sixteenth_points * 16;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please move this into the for loop.

for (; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_AVX512F for unaligned */
Ka-zam marked this conversation as resolved.
Show resolved Hide resolved

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
static inline void
Expand All @@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_poly_avx2_fma(x_star);
__m256 result = _mm256_arctan_poly_avx2_fma(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
Expand Down Expand Up @@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
__m256 result = _m256_arctan_poly_avx(x_star);
__m256 result = _mm256_arctan_poly_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
Expand Down Expand Up @@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
}
#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
for (; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
for (; number < num_points; number++) {
*out++ = atanf(*in++);
}
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_atan_32f_u_H */
Loading
Loading