diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6401845..823f0942 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -255,6 +255,7 @@ install(
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
diff --git a/gen/archs.xml b/gen/archs.xml
index 164c7bb4..792c50d1 100644
--- a/gen/archs.xml
+++ b/gen/archs.xml
@@ -178,6 +178,14 @@ at the top, as a last resort.
    <alignment>64</alignment>
  </arch>
+
+  <arch name="avx512dq">
+    <flag compiler="gnu">-mavx512dq</flag>
+    <flag compiler="clang">-mavx512dq</flag>
+    <flag compiler="msvc">/arch:AVX512DQ</flag>
+    <alignment>64</alignment>
+  </arch>
+
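
Note: the code generator under gen/ turns each archs.xml entry into an LV_HAVE_<ARCH> preprocessor macro, which is how kernels gate ISA-specific variants. A minimal sketch of the resulting guard pattern (the macro name follows from the entry above; the body is illustrative):

    /* LV_HAVE_AVX512DQ is generated from the archs.xml entry above. */
    #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
    #include <immintrin.h>
    /* AVX512 kernel variant, compiled only when both extensions are enabled */
    #endif
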
diff --git a/gen/machines.xml b/gen/machines.xml
index 887f9794..b76f6d07 100644
--- a/gen/machines.xml
+++ b/gen/machines.xml
@@ -65,4 +65,9 @@
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
</machine>
+
+<machine name="avx512dq">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
+</machine>
+
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
index 03b24e6c..8a7e4d63 100644
--- a/include/volk/volk_avx2_fma_intrinsics.h
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -8,7 +8,7 @@
*/
/*
- * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * This file is intended to hold AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/
@@ -23,7 +23,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
-static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
new file mode 100644
index 00000000..6f6a05ee
--- /dev/null
+++ b/include/volk/volk_avx512_intrinsics.h
@@ -0,0 +1,71 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Magnus Lundmark
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX512 intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#include <immintrin.h>
+
+////////////////////////////////////////////////////////////////////////
+// Place real parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
+{
+ const __m512i idx =
+ _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+ return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+////////////////////////////////////////////////////////////////////////
+// Place imaginary parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
+{
+ const __m512i idx =
+ _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+ return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+////////////////////////////////////////////////////////////////////////
+// Approximate arctan(x) via polynomial expansion on the interval [-1, 1]
+// Maximum relative error ~6.5e-7
+// Polynomial evaluated via Horner's method
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
+static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
+{
+ const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
+ const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
+ const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
+ const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
+ const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
+ const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
+ const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);
+
+ const __m512 x_times_x = _mm512_mul_ps(x, x);
+ __m512 arctan;
+ arctan = a13;
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
+ arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
+ arctan = _mm512_mul_ps(x, arctan);
+
+ return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
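
For reviewers unfamiliar with _mm512_permutex2var_ps: the index vectors above gather the even-indexed (real) or odd-indexed (imaginary) floats across the two input registers. A standalone sketch of the deinterleave (illustrative only; compile with -mavx512f against this new header):

    #include <stdio.h>
    #include <volk/volk_avx512_intrinsics.h>

    int main(void)
    {
        /* 16 interleaved complex floats: re(k) = k, im(k) = -k */
        float buf[32];
        for (int k = 0; k < 16; k++) {
            buf[2 * k] = (float)k;
            buf[2 * k + 1] = (float)-k;
        }
        __m512 z1 = _mm512_loadu_ps(buf);      /* complex 0..7  */
        __m512 z2 = _mm512_loadu_ps(buf + 16); /* complex 8..15 */
        float re[16], im[16];
        _mm512_storeu_ps(re, _mm512_real(z1, z2)); /* 0, 1, ..., 15   */
        _mm512_storeu_ps(im, _mm512_imag(z1, z2)); /* 0, -1, ..., -15 */
        printf("%g %g\n", re[15], im[15]); /* prints: 15 -15 */
        return 0;
    }
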
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 2fc0f064..c6c7a2c5 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -9,7 +9,7 @@
*/
/*
- * This file is intended to hold AVX intrinsics of intrinsics.
+ * This file is intended to hold AVX intrinsics.
* They should be used in VOLK kernels to avoid copy-pasta.
*/
@@ -24,7 +24,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
-static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
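
Both renamed helpers, like the new AVX512 version above, evaluate the same odd, degree-13 polynomial in Horner form. Written out, the fmadd chain computes

    arctan(x) ~= x * (a1 + x^2 * (a3 + x^2 * (a5 + x^2 * (a7 + x^2 * (a9 + x^2 * (a11 + x^2 * a13)))))),  x in [-1, 1]

so every stage is a single fused multiply-add on x^2, and the final multiply by x restores the odd symmetry.
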
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index dc5987cb..03afea55 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark
+ * Copyright 2023, 2024 Magnus Lundmark
*
* This file is part of VOLK
*
@@ -13,19 +13,19 @@
*
* \b Overview
*
- * Computes arcsine of input vector and stores results in output vector.
+ * Computes arctan of input vector and stores results in output vector.
*
* Dispatcher Prototype
* \code
- * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points)
* \endcode
*
* \b Inputs
- * \li aVector: The input vector of floats.
+ * \li in: The input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
- * \li bVector: The vector where results will be stored.
+ * \li out: The vector where results will be stored.
*
* \b Example
* Calculate common angles around the top half of the unit circle.
@@ -59,6 +59,61 @@
#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+ for (unsigned int number = 0; number < num_points; number++) {
+ *out++ = atanf(*in++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
+{
+ for (unsigned int number = 0; number < num_points; number++) {
+ *out++ = volk_arctan(*in++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
+{
+ const __m512 one = _mm512_set1_ps(1.f);
+ const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+ const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+ const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ for (unsigned int number = 0; number < sixteenth_points; number++) {
+ __m512 x = _mm512_load_ps(in);
+ in += 16;
+ __mmask16 swap_mask =
+ _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+ __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+ _mm512_mask_blend_ps(swap_mask, one, x));
+ __m512 result = _mm512_arctan_poly_avx512(x_star);
+ __m512 term = _mm512_and_ps(x_star, sign_mask);
+ term = _mm512_or_ps(pi_over_2, term);
+ term = _mm512_sub_ps(term, result);
+ result = _mm512_mask_blend_ps(swap_mask, result, term);
+ _mm512_store_ps(out, result);
+ out += 16;
+ }
+
+ for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
+ *out++ = volk_arctan(*in++);
+ }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
@@ -70,25 +125,24 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
- for (; number < eighth_points; number++) {
+ const unsigned int eighth_points = num_points / 8;
+
+ for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
+ in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
- __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+ __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_store_ps(out, result);
- in += 8;
out += 8;
}
- number = eighth_points * 8;
- for (; number < num_points; number++) {
+ for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
@@ -105,25 +159,24 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
- for (; number < eighth_points; number++) {
+ const unsigned int eighth_points = num_points / 8;
+
+ for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
+ in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
- __m256 result = _m256_arctan_poly_avx(x_star);
+ __m256 result = _mm256_arctan_poly_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_store_ps(out, result);
- in += 8;
out += 8;
}
- number = eighth_points * 8;
- for (; number < num_points; number++) {
+ for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
@@ -140,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- for (; number < quarter_points; number++) {
+ const unsigned int quarter_points = num_points / 4;
+
+ for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_load_ps(in);
+ in += 4;
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
__m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
_mm_blendv_ps(one, x, swap_mask));
@@ -153,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
term = _mm_sub_ps(term, result);
result = _mm_blendv_ps(result, term, swap_mask);
_mm_store_ps(out, result);
- in += 4;
out += 4;
}
- number = quarter_points * 4;
- for (; number < num_points; number++) {
+ for (unsigned int number = quarter_points * 4; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
@@ -168,6 +220,41 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
+{
+ const __m512 one = _mm512_set1_ps(1.f);
+ const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+ const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+ const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ for (unsigned int number = 0; number < sixteenth_points; number++) {
+ __m512 x = _mm512_loadu_ps(in);
+ in += 16;
+ __mmask16 swap_mask =
+ _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+ __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+ _mm512_mask_blend_ps(swap_mask, one, x));
+ __m512 result = _mm512_arctan_poly_avx512(x_star);
+ __m512 term = _mm512_and_ps(x_star, sign_mask);
+ term = _mm512_or_ps(pi_over_2, term);
+ term = _mm512_sub_ps(term, result);
+ result = _mm512_mask_blend_ps(swap_mask, result, term);
+ _mm512_storeu_ps(out, result);
+ out += 16;
+ }
+
+ for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
+ *out++ = volk_arctan(*in++);
+ }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
static inline void
@@ -178,25 +265,24 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
- for (; number < eighth_points; number++) {
+ const unsigned int eighth_points = num_points / 8;
+
+ for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
+ in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
- __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+ __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_storeu_ps(out, result);
- in += 8;
out += 8;
}
- number = eighth_points * 8;
- for (; number < num_points; number++) {
+ for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
@@ -212,25 +298,24 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
- for (; number < eighth_points; number++) {
+ const unsigned int eighth_points = num_points / 8;
+
+ for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
+ in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
- __m256 result = _m256_arctan_poly_avx(x_star);
+ __m256 result = _mm256_arctan_poly_avx(x_star);
__m256 term = _mm256_and_ps(x_star, sign_mask);
term = _mm256_or_ps(pi_over_2, term);
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_storeu_ps(out, result);
- in += 8;
out += 8;
}
- number = eighth_points * 8;
- for (; number < num_points; number++) {
+ for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
@@ -247,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
- unsigned int number = 0;
- unsigned int quarter_points = num_points / 4;
- for (; number < quarter_points; number++) {
+ const unsigned int quarter_points = num_points / 4;
+
+ for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_loadu_ps(in);
+ in += 4;
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
__m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
_mm_blendv_ps(one, x, swap_mask));
@@ -260,37 +346,13 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
term = _mm_sub_ps(term, result);
result = _mm_blendv_ps(result, term, swap_mask);
_mm_storeu_ps(out, result);
- in += 4;
out += 4;
}
- number = quarter_points * 4;
- for (; number < num_points; number++) {
+ for (unsigned int number = quarter_points * 4; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
-{
- unsigned int number = 0;
- for (; number < num_points; number++) {
- *out++ = volk_arctan(*in++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
-{
- unsigned int number = 0;
- for (; number < num_points; number++) {
- *out++ = atanf(*in++);
- }
-}
-#endif /* LV_HAVE_GENERIC */
-
#endif /* INCLUDED_volk_32f_atan_32f_u_H */
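
All SIMD variants in this file share the same branchless range reduction: |x| <= 1 is fed to the polynomial directly, while |x| > 1 uses the identity arctan(x) = sign(x)*pi/2 - arctan(1/x). A scalar model of what each lane computes (illustrative; poly stands in for the vector polynomial helpers):

    #include <math.h>

    /* Scalar model of one SIMD lane in the kernels above. */
    static float atan_lane_model(float x, float (*poly)(float))
    {
        const float pi_over_2 = 0x1.921fb6p0f;
        int swap = fabsf(x) > 1.f;         /* swap_mask                   */
        float x_star = swap ? 1.f / x : x; /* the two blends plus the div */
        float result = poly(x_star);
        if (swap)                          /* term = sign(x_star)*pi/2    */
            result = copysignf(pi_over_2, x_star) - result;
        return result;
    }
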
diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h
index 759db24c..5e8be5ce 100644
--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
+++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2012, 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark
+ * Copyright 2023, 2024 Magnus Lundmark
*
* This file is part of VOLK
*
@@ -72,8 +72,8 @@ static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
float* outPtr = outputVector;
const float* inPtr = (float*)inputVector;
const float invNormalizeFactor = 1.f / normalizeFactor;
- unsigned int number = 0;
- for (; number < num_points; number++) {
+
+ for (unsigned int number = 0; number < num_points; number++) {
const float real = *inPtr++;
const float imag = *inPtr++;
*outPtr++ = atan2f(imag, real) * invNormalizeFactor;
@@ -91,8 +91,8 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
float* outPtr = outputVector;
const float* inPtr = (float*)inputVector;
const float invNormalizeFactor = 1.f / normalizeFactor;
- unsigned int number = 0;
- for (; number < num_points; number++) {
+
+ for (unsigned int number = 0; number < num_points; number++) {
const float x = *inPtr++;
const float y = *inPtr++;
*outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
@@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
}
#endif /* LV_HAVE_GENERIC */
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* in = (float*)complexVector;
+ float* out = (float*)outputVector;
+
+ const float invNormalizeFactor = 1.f / normalizeFactor;
+ const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+ const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+ const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+ const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+ const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+ const __m512 zero = _mm512_setzero_ps();
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ for (unsigned int number = 0; number < sixteenth_points; number++) {
+ __m512 z1 = _mm512_load_ps(in);
+ in += 16;
+ __m512 z2 = _mm512_load_ps(in);
+ in += 16;
+
+ __m512 x = _mm512_real(z1, z2);
+ __m512 y = _mm512_imag(z1, z2);
+
+ __mmask16 swap_mask = _mm512_cmp_ps_mask(
+ _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+ __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+ _mm512_mask_blend_ps(swap_mask, x, y));
+ __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+ input = _mm512_mask_blend_ps(nan_mask, input, zero);
+ __m512 result = _mm512_arctan_poly_avx512(input);
+
+ input =
+ _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+ result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+ __m512 x_sign_mask =
+ _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+ result = _mm512_add_ps(
+ _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+ result);
+ result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+ _mm512_store_ps(out, result);
+ out += 16;
+ }
+
+ unsigned int number = sixteenth_points * 16;
+ volk_32fc_s32f_atan2_32f_polynomial(
+ out, complexVector + number, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
@@ -120,7 +180,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
const __m256 zero = _mm256_setzero_ps();
unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
+ const unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 z1 = _mm256_load_ps(in);
in += 8;
@@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
_mm256_blendv_ps(x, y, swap_mask));
__m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
input = _mm256_blendv_ps(input, zero, nan_mask);
- __m256 result = _m256_arctan_poly_avx2_fma(input);
+ __m256 result = _mm256_arctan_poly_avx2_fma(input);
input =
_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -180,7 +240,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
const __m256 zero = _mm256_setzero_ps();
unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
+ const unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 z1 = _mm256_load_ps(in);
in += 8;
@@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
_mm256_blendv_ps(x, y, swap_mask));
__m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
input = _mm256_blendv_ps(input, zero, nan_mask);
- __m256 result = _m256_arctan_poly_avx(input);
+ __m256 result = _mm256_arctan_poly_avx(input);
input =
_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -224,6 +284,66 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* in = (float*)complexVector;
+ float* out = (float*)outputVector;
+
+ const float invNormalizeFactor = 1.f / normalizeFactor;
+ const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+ const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+ const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+ const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+ const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+ const __m512 zero = _mm512_setzero_ps();
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ for (unsigned int number = 0; number < sixteenth_points; number++) {
+ __m512 z1 = _mm512_loadu_ps(in);
+ in += 16;
+ __m512 z2 = _mm512_loadu_ps(in);
+ in += 16;
+
+ __m512 x = _mm512_real(z1, z2);
+ __m512 y = _mm512_imag(z1, z2);
+
+ __mmask16 swap_mask = _mm512_cmp_ps_mask(
+ _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+ __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+ _mm512_mask_blend_ps(swap_mask, x, y));
+ __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+ input = _mm512_mask_blend_ps(nan_mask, input, zero);
+ __m512 result = _mm512_arctan_poly_avx512(input);
+
+ input =
+ _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+ result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+ __m512 x_sign_mask =
+ _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+ result = _mm512_add_ps(
+ _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+ result);
+ result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+ _mm512_storeu_ps(out, result);
+ out += 16;
+ }
+
+ unsigned int number = sixteenth_points * 16;
+ volk_32fc_s32f_atan2_32f_polynomial(
+ out, complexVector + number, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
@@ -244,7 +364,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
const __m256 zero = _mm256_setzero_ps();
unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
+ const unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 z1 = _mm256_loadu_ps(in);
in += 8;
@@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
_mm256_blendv_ps(x, y, swap_mask));
__m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
input = _mm256_blendv_ps(input, zero, nan_mask);
- __m256 result = _m256_arctan_poly_avx2_fma(input);
+ __m256 result = _mm256_arctan_poly_avx2_fma(input);
input =
_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -304,7 +424,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
const __m256 zero = _mm256_setzero_ps();
unsigned int number = 0;
- unsigned int eighth_points = num_points / 8;
+ const unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 z1 = _mm256_loadu_ps(in);
in += 8;
@@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
_mm256_blendv_ps(x, y, swap_mask));
__m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
input = _mm256_blendv_ps(input, zero, nan_mask);
- __m256 result = _m256_arctan_poly_avx(input);
+ __m256 result = _mm256_arctan_poly_avx(input);
input =
_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
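
For the atan2 kernels, the same polynomial is combined with a swap that keeps the division argument inside [-1, 1] and a quadrant fix-up driven by the sign bit of x. A scalar model of one lane (illustrative; poly again stands in for the vector polynomial; the kernels additionally scale the result by 1/normalizeFactor):

    #include <math.h>

    /* Scalar model of one SIMD lane in the atan2 kernels above. */
    static float atan2_lane_model(float y, float x, float (*poly)(float))
    {
        const float pi = 0x1.921fb6p1f;
        const float pi_over_2 = 0x1.921fb6p0f;
        int swap = fabsf(y) > fabsf(x);  /* pick the ratio in [-1, 1]      */
        float input = swap ? x / y : y / x;
        if (isnan(input))                /* 0/0 -> NaN; blend it to zero   */
            input = 0.f;
        float result = poly(input);
        if (swap)                        /* arctan(y/x) = s*pi/2 - arctan(x/y) */
            result = copysignf(pi_over_2, input) - result;
        if (signbit(x))                  /* mirrors the srai sign-bit test */
            result += copysignf(pi, y);
        return result;
    }
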