gnuradio · Ka-zam · Feb 25, 2024 · Feb 25, 2024 · Feb 25, 2024 · Feb 29, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -255,6 +255,7 @@ install(
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
+          ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h

diff --git a/gen/archs.xml b/gen/archs.xml
@@ -178,6 +178,14 @@ at the top, as a last resort.
     <alignment>64</alignment>
 </arch>
 
+<arch name="avx512dq">
+    <check name="avx512dq"></check>
+    <flag compiler="gnu">-mavx512dq</flag>
+    <flag compiler="clang">-mavx512dq</flag>
+    <flag compiler="msvc">/arch:AVX512DQ</flag>
+    <alignment>64</alignment>
+</arch>
+
 <arch name="riscv64">
 </arch>
 

diff --git a/gen/machines.xml b/gen/machines.xml
@@ -65,4 +65,9 @@
 <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
 </machine>
 
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx512dq">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
+</machine>
+
 </grammar>
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
@@ -8,7 +8,7 @@
  */
 
 /*
- * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * This file is intended to hold AVX2 FMA intrinsics.
  * They should be used in VOLK kernels to avoid copy-paste.
  */
 
@@ -23,7 +23,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);

diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
@@ -0,0 +1,64 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Magnus Lundmark <[email protected]>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX512 intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#include <immintrin.h>
+
+static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
+{
+    const __m512i idx =
+        _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
+{
+    const __m512i idx =
+        _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    return _mm512_permutex2var_ps(z1, idx, z2);
+}
+
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
+{
+    const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
+    const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
+    const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
+    const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
+    const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
+    const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
+    const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);
+
+    const __m512 x_times_x = _mm512_mul_ps(x, x);
+    __m512 arctan;
+    arctan = a13;
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm512_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
@@ -9,7 +9,7 @@
  */
 
 /*
- * This file is intended to hold AVX intrinsics of intrinsics.
+ * This file is intended to hold AVX intrinsics.
  * They should be used in VOLK kernels to avoid copy-pasta.
  */
 
@@ -24,7 +24,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);

diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <[email protected]>
+ * Copyright 2023, 2024 Magnus Lundmark <[email protected]>
  *
  * This file is part of VOLK
  *
@@ -13,19 +13,19 @@
  *
  * \b Overview
  *
- * Computes arcsine of input vector and stores results in output vector.
+ * Computes arctan of input vector and stores results in output vector.
  *
  * <b>Dispatcher Prototype</b>
  * \code
- * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points)
  * \endcode
  *
  * \b Inputs
- * \li aVector: The input vector of floats.
+ * \li in: The input vector of floats.
  * \li num_points: The number of data points.
  *
  * \b Outputs
- * \li bVector: The vector where results will be stored.
+ * \li out: The vector where results will be stored.
  *
  * \b Example
  * Calculate common angles around the top half of the unit circle.
@@ -59,6 +59,64 @@
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = atanf(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 x = _mm512_load_ps(in);
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_store_ps(out, result);
+        in += 16;
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 #ifndef INCLUDED_volk_32f_atan_32f_u_H
 #define INCLUDED_volk_32f_atan_32f_u_H
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 x = _mm512_loadu_ps(in);
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_storeu_ps(out, result);
+        in += 16;
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 static inline void
@@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 }
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = volk_arctan(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = atanf(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
 #endif /* INCLUDED_volk_32f_atan_32f_u_H */