diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6401845..823f0942 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -255,6 +255,7 @@ install(
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
diff --git a/gen/archs.xml b/gen/archs.xml
index 164c7bb4..792c50d1 100644
--- a/gen/archs.xml
+++ b/gen/archs.xml
@@ -178,6 +178,14 @@ at the top, as a last resort.
     <alignment>64</alignment>
 </arch>
+
+<arch name="avx512dq">
+    <flag compiler="gnu">-mavx512dq</flag>
+    <flag compiler="clang">-mavx512dq</flag>
+    <flag compiler="msvc">/arch:AVX512DQ</flag>
+    <alignment>64</alignment>
+</arch>
+
diff --git a/gen/machines.xml b/gen/machines.xml
index 887f9794..b76f6d07 100644
--- a/gen/machines.xml
+++ b/gen/machines.xml
@@ -65,4 +65,9 @@
 <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
 </machine>
+
+<machine name="avx512dq">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
+</machine>
+
 </machines>
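A side note, not part of the patch: the new <arch> and <machine> entries only make the avx512dq proto-kernels available; VOLK still selects a machine at runtime from CPUID. A standalone sanity check of what a given host can run, using the GCC/Clang builtin (illustrative code, not VOLK API):

    #include <stdio.h>

    int main(void)
    {
        /* __builtin_cpu_supports() queries CPUID (GCC/Clang only); both
         * feature flags must be present for the avx512dq machine. */
        if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512dq")) {
            printf("this host can run the avx512dq proto-kernels\n");
        } else {
            printf("VOLK will fall back to a lesser machine\n");
        }
        return 0;
    }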
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
index 03b24e6c..8a7e4d63 100644
--- a/include/volk/volk_avx2_fma_intrinsics.h
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -8,7 +8,7 @@
  */
 
 /*
- * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * This file is intended to hold AVX2 FMA intrinsics.
  * They should be used in VOLK kernels to avoid copy-paste.
  */
 
@@ -23,7 +23,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
new file mode 100644
index 00000000..6f6a05ee
--- /dev/null
+++ b/include/volk/volk_avx512_intrinsics.h
@@ -0,0 +1,71 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Magnus Lundmark
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX512 intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#include <immintrin.h>
+
+////////////////////////////////////////////////////////////////////////
+// Place real parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
+{
+    const __m512i idx =
+        _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+////////////////////////////////////////////////////////////////////////
+// Place imaginary parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
+{
+    const __m512i idx =
+        _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+////////////////////////////////////////////////////////////////////////
+// Approximate arctan(x) via polynomial expansion on the interval [-1, 1]
+// Maximum relative error ~6.5e-7
+// Polynomial evaluated via Horner's method
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
+{
+    const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
+    const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
+    const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
+    const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
+    const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
+    const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
+    const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);
+
+    const __m512 x_times_x = _mm512_mul_ps(x, x);
+    __m512 arctan;
+    arctan = a13;
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm512_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
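For review, here is a scalar model of what _mm512_arctan_poly_avx512 computes per lane; the coefficients are the ones from the header above, while the helper name atan_poly_ref is ours, not VOLK's:

    #include <math.h>

    /* Odd polynomial a1*x + a3*x^3 + ... + a13*x^13, evaluated via Horner's
     * method in x^2; valid on [-1, 1], relative error ~6.5e-7 vs. arctan. */
    static float atan_poly_ref(float x)
    {
        const float a[7] = { +0x1.ffffeap-1f, -0x1.55437p-2f,  +0x1.972be6p-3f,
                             -0x1.1436ap-3f,  +0x1.5785aap-4f, -0x1.2f3004p-5f,
                             +0x1.01a37cp-7f };
        const float xx = x * x;
        float p = a[6];
        for (int i = 5; i >= 0; i--) {
            p = p * xx + a[i]; /* one fmadd per step in the vector version */
        }
        return x * p;
    }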
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 2fc0f064..c6c7a2c5 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -9,7 +9,7 @@
  */
 
 /*
- * This file is intended to hold AVX intrinsics of intrinsics.
+ * This file is intended to hold AVX intrinsics.
  * They should be used in VOLK kernels to avoid copy-pasta.
  */
 
@@ -24,7 +24,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index dc5987cb..03afea55 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark
+ * Copyright 2023, 2024 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -13,19 +13,19 @@
  *
  * \b Overview
  *
- * Computes arcsine of input vector and stores results in output vector.
+ * Computes arctan of input vector and stores results in output vector.
  *
  * Dispatcher Prototype
  * \code
- * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points)
  * \endcode
  *
  * \b Inputs
- * \li aVector: The input vector of floats.
+ * \li in: The input vector of floats.
 * \li num_points: The number of data points.
 *
 * \b Outputs
- * \li bVector: The vector where results will be stored.
+ * \li out: The vector where results will be stored.
 *
 * \b Example
 * Calculate common angles around the top half of the unit circle.
@@ -59,6 +59,61 @@
 
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    for (unsigned int number = 0; number < num_points; number++) {
+        *out++ = atanf(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
+{
+    for (unsigned int number = 0; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_load_ps(in);
+        in += 16;
+        // Lanes with |x| > 1 get reduced to 1/x so the polynomial applies
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        // For the reduced lanes: atan(x) = copysign(pi/2, x) - atan(1/x)
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_store_ps(out, result);
+        out += 16;
+    }
+
+    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
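The blends in the kernel above are a branch-free version of the standard arctan range reduction. In scalar form (our sketch, reusing atan_poly_ref from the earlier block):

    #include <math.h>

    float atan_poly_ref(float x); /* polynomial reference shown earlier */

    static float atan_ref(float x)
    {
        if (fabsf(x) <= 1.f) {
            return atan_poly_ref(x); /* polynomial domain */
        }
        /* atan(x) = copysign(pi/2, x) - atan(1/x) for |x| > 1; the two
         * _mm512_mask_blend_ps calls form x_star = 1/x without branching. */
        return copysignf(0x1.921fb6p0f, x) - atan_poly_ref(1.f / x);
    }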
@@ -70,25 +125,24 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_store_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -105,25 +159,24 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_store_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -140,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int quarter_points = num_points / 4;
-    for (; number < quarter_points; number++) {
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_load_ps(in);
+        in += 4;
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
@@ -153,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
         term = _mm_sub_ps(term, result);
         result = _mm_blendv_ps(result, term, swap_mask);
         _mm_store_ps(out, result);
-        in += 4;
         out += 4;
     }
 
-    number = quarter_points * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -168,6 +220,41 @@
 
 #ifndef INCLUDED_volk_32f_atan_32f_u_H
 #define INCLUDED_volk_32f_atan_32f_u_H
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 x = _mm512_loadu_ps(in);
+        in += 16;
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_storeu_ps(out, result);
+        out += 16;
+    }
+
+    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 static inline void
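A difference worth noting when reviewing the AVX512 kernels against their AVX2 twins below: AVX512 comparisons write one bit per lane into a mask register (__mmask16) instead of an all-ones lane in a vector, so selection uses _mm512_mask_blend_ps rather than _mm256_blendv_ps. Side by side (illustrative helpers, not VOLK code):

    #include <immintrin.h>

    /* Both return y-lanes where a > b, x-lanes elsewhere. */
    static inline __m256 select_gt_avx2(__m256 a, __m256 b, __m256 x, __m256 y)
    {
        __m256 mask = _mm256_cmp_ps(a, b, _CMP_GT_OS); /* all-ones lanes */
        return _mm256_blendv_ps(x, y, mask);           /* select by sign bit */
    }

    static inline __m512 select_gt_avx512(__m512 a, __m512 b, __m512 x, __m512 y)
    {
        __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GT_OS); /* 16 bits */
        return _mm512_mask_blend_ps(mask, x, y);               /* select by bit */
    }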
@@ -178,25 +265,24 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_storeu_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -212,25 +298,24 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_storeu_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -247,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int quarter_points = num_points / 4;
-    for (; number < quarter_points; number++) {
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_loadu_ps(in);
+        in += 4;
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
@@ -260,37 +346,13 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
         term = _mm_sub_ps(term, result);
         result = _mm_blendv_ps(result, term, swap_mask);
         _mm_storeu_ps(out, result);
-        in += 4;
         out += 4;
     }
 
-    number = quarter_points * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = volk_arctan(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = atanf(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
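The atan2 kernels in the next file share one algorithm; a scalar outline may help before reading the vector code (our sketch, reusing atan_poly_ref; note the kernel tests the sign bit of x, which also catches -0.0f):

    #include <math.h>

    float atan_poly_ref(float x); /* polynomial reference shown earlier */

    static float atan2_ref(float y, float x, float inv_norm)
    {
        const float pi = 0x1.921fb6p1f, pi_2 = 0x1.921fb6p0f;
        const int swap = fabsf(y) > fabsf(x);
        float input = swap ? x / y : y / x; /* ratio is always in [-1, 1] */
        if (isnan(input)) {
            input = 0.f; /* 0/0 at the origin, so atan2(0, 0) == 0 */
        }
        float result = atan_poly_ref(input);
        if (swap) {
            result = copysignf(pi_2, input) - result;
        }
        if (signbit(x)) {
            result += copysignf(pi, y); /* quadrant correction for x < 0 */
        }
        return result * inv_norm;
    }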
diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h
index 759db24c..5e8be5ce 100644
--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
+++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2012, 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark
+ * Copyright 2023, 2024 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -72,8 +72,8 @@ static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
     float* outPtr = outputVector;
     const float* inPtr = (float*)inputVector;
     const float invNormalizeFactor = 1.f / normalizeFactor;
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
+
+    for (unsigned int number = 0; number < num_points; number++) {
         const float real = *inPtr++;
         const float imag = *inPtr++;
         *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
@@ -91,8 +91,8 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
     float* outPtr = outputVector;
     const float* inPtr = (float*)inputVector;
     const float invNormalizeFactor = 1.f / normalizeFactor;
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
+
+    for (unsigned int number = 0; number < num_points; number++) {
         const float x = *inPtr++;
         const float y = *inPtr++;
         *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
@@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector,
+                                                       const lv_32fc_t* complexVector,
+                                                       const float normalizeFactor,
+                                                       unsigned int num_points)
+{
+    const float* in = (float*)complexVector;
+    float* out = (float*)outputVector;
+
+    const float invNormalizeFactor = 1.f / normalizeFactor;
+    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+    const __m512 zero = _mm512_setzero_ps();
+
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 z1 = _mm512_load_ps(in);
+        in += 16;
+        __m512 z2 = _mm512_load_ps(in);
+        in += 16;
+
+        // Deinterleave two registers into 16 real and 16 imaginary parts
+        __m512 x = _mm512_real(z1, z2);
+        __m512 y = _mm512_imag(z1, z2);
+
+        // Divide the smaller magnitude by the larger: ratio stays in [-1, 1]
+        __mmask16 swap_mask = _mm512_cmp_ps_mask(
+            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+        __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+                                     _mm512_mask_blend_ps(swap_mask, x, y));
+        // 0/0 produces NaN; map it to 0 so atan2(0, 0) == 0
+        __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+        input = _mm512_mask_blend_ps(nan_mask, input, zero);
+        __m512 result = _mm512_arctan_poly_avx512(input);
+
+        input =
+            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+        result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+        // All-ones lanes where x is negative; add copysign(pi, y) there
+        __m512 x_sign_mask =
+            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+        result = _mm512_add_ps(
+            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+            result);
+        result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+        _mm512_store_ps(out, result);
+        out += 16;
+    }
+
+    unsigned int number = sixteenth_points * 16;
+    volk_32fc_s32f_atan2_32f_polynomial(
+        out, complexVector + number, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
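The two 64-byte loads above bring in 16 interleaved lv_32fc_t values; _mm512_real and _mm512_imag then split them with one permutex2var each. What the index vectors 0, 2, ..., 30 and 1, 3, ..., 31 accomplish, in scalar form (illustrative):

    /* z holds re0, im0, re1, im1, ... across the two source registers;
     * even indices gather the 16 real parts, odd ones the imaginary parts. */
    static void deinterleave_ref(const float z[32], float re[16], float im[16])
    {
        for (int i = 0; i < 16; i++) {
            re[i] = z[2 * i];
            im[i] = z[2 * i + 1];
        }
    }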
@@ -120,7 +180,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
     const __m256 zero = _mm256_setzero_ps();
 
     unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
+    const unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 z1 = _mm256_load_ps(in);
         in += 8;
@@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                       _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx2_fma(input);
+        __m256 result = _mm256_arctan_poly_avx2_fma(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -180,7 +240,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
     const __m256 zero = _mm256_setzero_ps();
 
     unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
+    const unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 z1 = _mm256_load_ps(in);
         in += 8;
@@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                       _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx(input);
+        __m256 result = _mm256_arctan_poly_avx(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -224,6 +284,66 @@
 
 #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
 #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector,
+                                                       const lv_32fc_t* complexVector,
+                                                       const float normalizeFactor,
+                                                       unsigned int num_points)
+{
+    const float* in = (float*)complexVector;
+    float* out = (float*)outputVector;
+
+    const float invNormalizeFactor = 1.f / normalizeFactor;
+    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+    const __m512 zero = _mm512_setzero_ps();
+
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
+        __m512 z1 = _mm512_loadu_ps(in);
+        in += 16;
+        __m512 z2 = _mm512_loadu_ps(in);
+        in += 16;
+
+        __m512 x = _mm512_real(z1, z2);
+        __m512 y = _mm512_imag(z1, z2);
+
+        __mmask16 swap_mask = _mm512_cmp_ps_mask(
+            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+        __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+                                     _mm512_mask_blend_ps(swap_mask, x, y));
+        __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+        input = _mm512_mask_blend_ps(nan_mask, input, zero);
+        __m512 result = _mm512_arctan_poly_avx512(input);
+
+        input =
+            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+        result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+        __m512 x_sign_mask =
+            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+        result = _mm512_add_ps(
+            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+            result);
+        result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+        _mm512_storeu_ps(out, result);
+        out += 16;
+    }
+
+    unsigned int number = sixteenth_points * 16;
+    volk_32fc_s32f_atan2_32f_polynomial(
+        out, complexVector + number, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -244,7 +364,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
     const __m256 zero = _mm256_setzero_ps();
 
     unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
+    const unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 z1 = _mm256_loadu_ps(in);
         in += 8;
@@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                       _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx2_fma(input);
+        __m256 result = _mm256_arctan_poly_avx2_fma(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -304,7 +424,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
     const __m256 zero = _mm256_setzero_ps();
 
     unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
+    const unsigned int eighth_points = num_points / 8;
     for (; number < eighth_points; number++) {
         __m256 z1 = _mm256_loadu_ps(in);
         in += 8;
@@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                       _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx(input);
+        __m256 result = _mm256_arctan_poly_avx(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
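None of this changes the public API; callers keep going through the dispatcher, which picks the best proto-kernel for the running machine. Typical call site (standard VOLK API):

    #include <stdio.h>
    #include <volk/volk.h>

    int main(void)
    {
        const unsigned int N = 32;
        /* volk_malloc() returns suitably aligned buffers, so the aligned
         * (_a_) proto-kernels, including the new avx512dq ones, are eligible. */
        float* in = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
        float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());

        for (unsigned int i = 0; i < N; i++) {
            in[i] = (float)i - 16.f;
        }

        volk_32f_atan_32f(out, in, N); /* dispatcher selects the kernel */

        printf("atan(%.1f) = %f\n", in[0], out[0]);
        volk_free(in);
        volk_free(out);
        return 0;
    }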