From 2698dcb47ccfb7675ce0821447a7882ff095ca2c Mon Sep 17 00:00:00 2001
From: LHT129 <tianlan.lht@antgroup.com>
Date: Tue, 24 Dec 2024 14:45:24 +0800
Subject: [PATCH] fix arm compile error (#240)

- add arm test on ci

Signed-off-by: LHT129 <tianlan.lht@antgroup.com>
---
 .circleci/config.yml                | 45 +++++++++++++++++++++++++++++
 scripts/deps/install_deps_ubuntu.sh | 12 +++++++-
 src/simd/CMakeLists.txt             |  1 +
 src/simd/avx.cpp                    |  5 ++++
 src/simd/avx512.cpp                 | 20 ++++++++-----
 src/simd/fp32_simd_test.cpp         | 12 --------
 src/simd/normalize_test.cpp         | 12 --------
 src/simd/sq4_simd_test.cpp          | 13 ---------
 src/simd/sq4_uniform_simd_test.cpp  | 12 --------
 src/simd/sq8_simd.h                 |  2 --
 src/simd/sq8_simd_test.cpp          | 14 +--------
 src/simd/sq8_uniform_simd_test.cpp  | 12 --------
 src/simd/sse.cpp                    | 19 +++++++-----
 tests/CMakeLists.txt                |  6 ++--
 14 files changed, 90 insertions(+), 95 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a4c6c56b..3968d42b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -8,6 +8,11 @@ workflows:
             branches:
               ignore:
                 - main
+      - pull-request-check-aarch64:
+          filters:
+            branches:
+              ignore:
+                - main
   main-branch-workflow:
     jobs:
       - main-branch-check:
@@ -15,6 +20,11 @@ workflows:
             branches:
               only:
                 - main
+      - main-branch-check-aarch64:
+          filters:
+            branches:
+              only:
+                - main
 
 jobs:
   pull-request-check:
@@ -33,6 +43,23 @@ jobs:
             - ./build
       - run: make test_parallel
 
+  pull-request-check-aarch64:
+    docker:
+      - image: ubuntu:22.04
+    resource_class: arm.medium
+    steps:
+      - checkout
+      - run: bash scripts/deps/install_deps_ubuntu.sh
+      - restore_cache:
+          keys:
+            - fork-cache-arm-{{ checksum "CMakeLists.txt" }}-{{ checksum ".circleci/fresh_ci_cache.commit" }}
+      - run: make debug
+      - save_cache:
+          key: fork-cache-arm-{{ checksum "CMakeLists.txt" }}-{{ checksum ".circleci/fresh_ci_cache.commit" }}
+          paths:
+            - ./build
+      - run: make test_parallel
+
   main-branch-check:
     docker:
       - image: vsaglib/vsag:ubuntu
@@ -48,3 +75,21 @@ jobs:
           paths:
             - ./build
       - run: make test_parallel
+
+  main-branch-check-aarch64:
+    docker:
+      - image: ubuntu:22.04
+    resource_class: arm.medium
+    steps:
+      - checkout
+      - run: bash scripts/deps/install_deps_ubuntu.sh
+      # Arch-specific "-arm" key: ./build holds native objects and must not be restored from the x86_64 job's cache.
+      - restore_cache:
+          keys:
+            - main-ccache-arm-{{ checksum "CMakeLists.txt" }}-{{ checksum ".circleci/fresh_ci_cache.commit" }}
+      - run: make debug
+      - save_cache:
+          key: main-ccache-arm-{{ checksum "CMakeLists.txt" }}-{{ checksum ".circleci/fresh_ci_cache.commit" }}
+          paths:
+            - ./build
+      - run: make test_parallel
diff --git a/scripts/deps/install_deps_ubuntu.sh b/scripts/deps/install_deps_ubuntu.sh
index d452085a..859a59fb 100644
--- a/scripts/deps/install_deps_ubuntu.sh
+++ b/scripts/deps/install_deps_ubuntu.sh
@@ -1 +1,11 @@
-sudo apt install -y gfortran python3-dev libomp-15-dev lcov intel-mkl
+#!/usr/bin/env bash
+# Install build dependencies via apt; intel-mkl is only requested on x86_64.
+arch=$(uname -m)
+pkgs=(gfortran python3-dev libomp-15-dev lcov gcc make cmake g++)
+case "$arch" in
+    x86_64) pkgs+=(intel-mkl) ;;
+    aarch64) ;;
+    *) echo "Unknown architecture: $arch" >&2; exit 1 ;;
+esac
+echo "Executing apt install for $arch"
+apt update && apt install -y "${pkgs[@]}"
diff --git a/src/simd/CMakeLists.txt b/src/simd/CMakeLists.txt
index 68501f06..4322c1fc 100644
--- a/src/simd/CMakeLists.txt
+++ b/src/simd/CMakeLists.txt
@@ -9,6 +9,7 @@ set (SIMD_SRCS
         sse.cpp
         avx.cpp
         avx512.cpp
+        normalize.cpp
 )
 if (DIST_CONTAINS_SSE)
     set (SIMD_SRCS ${SIMD_SRCS} sse.cpp)
diff --git a/src/simd/avx.cpp b/src/simd/avx.cpp
index 486d0e1a..1ad908b8 100644
--- a/src/simd/avx.cpp
+++ b/src/simd/avx.cpp
@@ -13,7 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if defined(ENABLE_AVX)
 #include <immintrin.h>
+#endif
 
 #include <cmath>
 #include <cstdint>
@@ -30,6 +32,7 @@ namespace vsag {
 #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
 #define PORTABLE_ALIGN64 __attribute__((aligned(64)))
 
+#if defined(ENABLE_AVX)
 float
 L2SqrSIMD16ExtAVX(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
     float* pVect1 = (float*)pVect1v;
@@ -163,6 +166,8 @@ PQDistanceAVXFloat256(const void* single_dim_centers, float single_dim_val, void
     }
 }
 
+#endif
+
 namespace avx2 {
 
 #if defined(ENABLE_AVX2)
diff --git a/src/simd/avx512.cpp b/src/simd/avx512.cpp
index c1edce33..517e5ee8 100644
--- a/src/simd/avx512.cpp
+++ b/src/simd/avx512.cpp
@@ -12,8 +12,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#if defined(ENABLE_AVX512)
 #include <immintrin.h>
+#endif
 
 #include <cmath>
 
@@ -30,6 +31,7 @@ namespace vsag {
 #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
 #define PORTABLE_ALIGN64 __attribute__((aligned(64)))
 
+#if defined(ENABLE_AVX512)
 float
 L2SqrSIMD16ExtAVX512(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
     float* pVect1 = (float*)pVect1v;
@@ -202,6 +204,8 @@ INT8InnerProduct512ResidualsAVX512Distance(const void* pVect1v,
     return -INT8InnerProduct512ResidualsAVX512(pVect1v, pVect2v, qty_ptr);
 }
 
+#endif
+
 namespace avx512 {
 float
 FP32ComputeIP(const float* query, const float* codes, uint64_t dim) {
@@ -221,7 +225,7 @@ FP32ComputeIP(const float* query, const float* codes, uint64_t dim) {
     ip += avx2::FP32ComputeIP(query + n * 16, codes + n * 16, dim - n * 16);
     return ip;
 #else
-    return vsag::Generic::FP32ComputeIP(query, codes, dim);
+    return vsag::generic::FP32ComputeIP(query, codes, dim);
 #endif
 }
 
@@ -244,7 +248,7 @@ FP32ComputeL2Sqr(const float* query, const float* codes, uint64_t dim) {
     l2 += avx2::FP32ComputeL2Sqr(query + n * 16, codes + n * 16, dim - n * 16);
     return l2;
 #else
-    return vsag::Generic::FP32ComputeL2Sqr(query, codes, dim);
+    return vsag::generic::FP32ComputeL2Sqr(query, codes, dim);
 #endif
 }
 
@@ -282,7 +286,7 @@ SQ8ComputeIP(const float* query,
     finalResult += avx2::SQ8ComputeIP(query + i, codes + i, lowerBound + i, diff + i, dim - i);
     return finalResult;
 #else
-    return Generic::SQ8ComputeIP(query, codes, lowerBound, diff, dim);
+    return generic::SQ8ComputeIP(query, codes, lowerBound, diff, dim);
 #endif
 }
 
@@ -320,7 +324,7 @@ SQ8ComputeL2Sqr(const float* query,
     result += avx2::SQ8ComputeL2Sqr(query + i, codes + i, lowerBound + i, diff + i, dim - i);
     return result;
 #else
-    return Generic::SQ8ComputeL2Sqr(query, codes, lowerBound, diff, dim);
+    return generic::SQ8ComputeL2Sqr(query, codes, lowerBound, diff, dim);
 #endif
 }
 
@@ -357,7 +361,7 @@ SQ8ComputeCodesIP(const uint8_t* codes1,
     result += avx2::SQ8ComputeCodesIP(codes1 + i, codes2 + i, lowerBound + i, diff + i, dim - i);
     return result;
 #else
-    return Generic::SQ8ComputeCodesIP(codes1, codes2, lowerBound, diff, dim);
+    return generic::SQ8ComputeCodesIP(codes1, codes2, lowerBound, diff, dim);
 #endif
 }
 
@@ -390,7 +394,7 @@ SQ8ComputeCodesL2Sqr(const uint8_t* codes1,
     result += avx2::SQ8ComputeCodesL2Sqr(codes1 + i, codes2 + i, lowerBound + i, diff + i, dim - i);
     return result;
 #else
-    return Generic::SQ8ComputeL2Sqr(query, codes, lowerBound, diff, dim);
+    return generic::SQ8ComputeCodesL2Sqr(codes1, codes2, lowerBound, diff, dim);
 #endif
 }
 
@@ -494,7 +498,7 @@ SQ8UniformComputeCodesIP(const uint8_t* codes1, const uint8_t* codes2, uint64_t
     result += static_cast<int32_t>(avx2::SQ8UniformComputeCodesIP(codes1 + d, codes2 + d, dim - d));
     return static_cast<float>(result);
 #else
-    return avx2::S8UniformComputeCodesIP(codes1, codes2, dim);
+    return avx2::SQ8UniformComputeCodesIP(codes1, codes2, dim);
 #endif
 }
 
diff --git a/src/simd/fp32_simd_test.cpp b/src/simd/fp32_simd_test.cpp
index 36a41da4..50a1d21f 100644
--- a/src/simd/fp32_simd_test.cpp
+++ b/src/simd/fp32_simd_test.cpp
@@ -22,18 +22,6 @@
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 #define TEST_ACCURACY(Func)                                                           \
     {                                                                                 \
         float gt, sse, avx2, avx512;                                                  \
diff --git a/src/simd/normalize_test.cpp b/src/simd/normalize_test.cpp
index a84ec2a2..04f39f3b 100644
--- a/src/simd/normalize_test.cpp
+++ b/src/simd/normalize_test.cpp
@@ -22,18 +22,6 @@
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 TEST_CASE("Normalize SIMD Compute", "[simd]") {
     auto dims = fixtures::get_common_used_dims();
     int64_t count = 100;
diff --git a/src/simd/sq4_simd_test.cpp b/src/simd/sq4_simd_test.cpp
index 75bdfe46..322ec40a 100644
--- a/src/simd/sq4_simd_test.cpp
+++ b/src/simd/sq4_simd_test.cpp
@@ -17,24 +17,11 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "../logger.h"
 #include "catch2/benchmark/catch_benchmark.hpp"
 #include "fixtures.h"
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 #define TEST_ACCURACY(Func)                                        \
     {                                                              \
         auto gt = generic::Func(codes1.data() + i * code_size,     \
diff --git a/src/simd/sq4_uniform_simd_test.cpp b/src/simd/sq4_uniform_simd_test.cpp
index 73e37660..8ea7f30f 100644
--- a/src/simd/sq4_uniform_simd_test.cpp
+++ b/src/simd/sq4_uniform_simd_test.cpp
@@ -23,18 +23,6 @@
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 #define TEST_ACCURACY(Func)                                                                      \
     {                                                                                            \
         auto gt =                                                                                \
diff --git a/src/simd/sq8_simd.h b/src/simd/sq8_simd.h
index e6c58858..885665f1 100644
--- a/src/simd/sq8_simd.h
+++ b/src/simd/sq8_simd.h
@@ -45,7 +45,6 @@ SQ8ComputeCodesL2Sqr(const uint8_t* codes1,
                      uint64_t dim);
 }  // namespace generic
 
-#if defined(ENABLE_SSE)
 namespace sse {
 float
 SQ8ComputeIP(const float* query,
@@ -72,7 +71,6 @@ SQ8ComputeCodesL2Sqr(const uint8_t* codes1,
                      const float* diff,
                      uint64_t dim);
 }  // namespace sse
-#endif
 
 namespace avx2 {
 float
diff --git a/src/simd/sq8_simd_test.cpp b/src/simd/sq8_simd_test.cpp
index 82aa9baa..06a68b95 100644
--- a/src/simd/sq8_simd_test.cpp
+++ b/src/simd/sq8_simd_test.cpp
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "simd/sq8_simd.h"
+#include "sq8_simd.h"
 
 #include "catch2/benchmark/catch_benchmark.hpp"
 #include "catch2/catch_test_macros.hpp"
@@ -22,18 +22,6 @@
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 #define TEST_ACCURACY(Func)                                                                 \
     {                                                                                       \
         auto gt = generic::Func(                                                            \
diff --git a/src/simd/sq8_uniform_simd_test.cpp b/src/simd/sq8_uniform_simd_test.cpp
index 1ccd41c0..b64b8c25 100644
--- a/src/simd/sq8_uniform_simd_test.cpp
+++ b/src/simd/sq8_uniform_simd_test.cpp
@@ -23,18 +23,6 @@
 
 using namespace vsag;
 
-#ifndef ENABLE_SSE
-namespace sse = generic;
-#endif
-
-#ifndef ENABLE_AVX2
-namespace avx2 = sse;
-#endif
-
-#ifndef ENABLE_AVX512
-namespace avx512 = avx2;
-#endif
-
 #define TEST_ACCURACY(Func)                                                                      \
     {                                                                                            \
         auto gt =                                                                                \
diff --git a/src/simd/sse.cpp b/src/simd/sse.cpp
index 2c57ee12..078f19a5 100644
--- a/src/simd/sse.cpp
+++ b/src/simd/sse.cpp
@@ -13,7 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if defined(ENABLE_SSE)
 #include <x86intrin.h>
+#endif
 
 #include <cmath>
 
@@ -35,6 +37,7 @@ L2Sqr(const void* pVect1v, const void* pVect2v, const void* qty_ptr);
 extern float
 InnerProduct(const void* pVect1, const void* pVect2, const void* qty_ptr);
 
+#if defined(ENABLE_SSE)
 /* L2 Distance */
 float
 L2SqrSIMD4ExtSSE(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
@@ -302,6 +305,8 @@ PQDistanceSSEFloat256(const void* single_dim_centers, float single_dim_val, void
     }
 }
 
+#endif
+
 namespace sse {
 
 #if defined(ENABLE_SSE)
@@ -333,7 +338,7 @@ FP32ComputeIP(const float* query, const float* codes, uint64_t dim) {
     ip += generic::FP32ComputeIP(query + n * 4, codes + n * 4, dim - n * 4);
     return ip;
 #else
-    return vsag::Generic::FP32ComputeIP(query, codes, dim);
+    return vsag::generic::FP32ComputeIP(query, codes, dim);
 #endif
 }
 
@@ -357,7 +362,7 @@ FP32ComputeL2Sqr(const float* query, const float* codes, uint64_t dim) {
     l2 += generic::FP32ComputeL2Sqr(query + n * 4, codes + n * 4, dim - n * 4);
     return l2;
 #else
-    return vsag::Generic::FP32ComputeL2Sqr(query, codes, dim);
+    return vsag::generic::FP32ComputeL2Sqr(query, codes, dim);
 #endif
 }
 
@@ -399,7 +404,7 @@ SQ8ComputeIP(const float* query,
     return result[0] +
            generic::SQ8ComputeIP(query + i, codes + i, lowerBound + i, diff + i, dim - i);
 #else
-    return Generic::SQ8ComputeIP(query, codes, lowerBound, diff, dim);
+    return generic::SQ8ComputeIP(query, codes, lowerBound, diff, dim);
 #endif
 }
 
@@ -441,7 +446,7 @@ SQ8ComputeL2Sqr(const float* query,
 
     return result;
 #else
-    return Generic::SQ8ComputeL2Sqr(query, codes, lowerBound, diff, dim);
+    return generic::SQ8ComputeL2Sqr(query, codes, lowerBound, diff, dim);
 #endif
 }
 
@@ -481,7 +486,7 @@ SQ8ComputeCodesIP(const uint8_t* codes1,
     result += generic::SQ8ComputeCodesIP(codes1 + i, codes2 + i, lowerBound + i, diff + i, dim - i);
     return result;
 #else
-    return Generic::SQ8ComputeCodesIP(codes1, codes2, lowerBound, diff, dim);
+    return generic::SQ8ComputeCodesIP(codes1, codes2, lowerBound, diff, dim);
 #endif
 }
 
@@ -523,7 +528,7 @@ SQ8ComputeCodesL2Sqr(const uint8_t* codes1,
         generic::SQ8ComputeCodesL2Sqr(codes1 + i, codes2 + i, lowerBound + i, diff + i, dim - i);
     return result;
 #else
-    return Generic::SQ8ComputeCodesIP(codes1, codes2, lowerBound, diff, dim);
+    return generic::SQ8ComputeCodesL2Sqr(codes1, codes2, lowerBound, diff, dim);
 #endif
 }
 
@@ -627,7 +632,7 @@ SQ8UniformComputeCodesIP(const uint8_t* codes1, const uint8_t* codes2, uint64_t
         static_cast<int32_t>(generic::SQ8UniformComputeCodesIP(codes1 + d, codes2 + d, dim - d));
     return static_cast<float>(result);
 #else
-    return generic::S8UniformComputeCodesIP(codes1, codes2, dim);
+    return generic::SQ8UniformComputeCodesIP(codes1, codes2, dim);
 #endif
 }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 72212c38..2f399289 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,9 +10,9 @@ endif ()
 if (DIST_CONTAINS_AVX)
     target_compile_definitions (unittests PRIVATE ENABLE_AVX=1)
 endif ()
-if (DIST_CONTAINS_AVX2)
-    target_compile_definitions (unittests PRIVATE ENABLE_AVX2=1)
-endif ()
+#if (DIST_CONTAINS_AVX2)
+#    target_compile_definitions (unittests PRIVATE ENABLE_AVX2=1)
+#endif ()
 if (DIST_CONTAINS_AVX512)
     target_compile_definitions (unittests PRIVATE ENABLE_AVX512=1)
 endif ()