Merge remote-tracking branch 'upstream/branch-23.12' into nightly_versions
rapidsai · Oct 20, 2023 · 532c3e7 · 532c3e7
2 parents cfa4e36 + 945355d
commit 532c3e7
Show file tree

Hide file tree

Showing 18 changed files with 242 additions and 61 deletions.
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
@@ -36,6 +36,7 @@ trap "EXITCODE=1" ERR
 set +e
 
 # Run libraft gtests from libraft-tests package
+cd "$CONDA_PREFIX"/bin/gtests/libraft
 ctest -j8 --output-on-failure
 
 rapids-logger "Test script exiting with value: $EXITCODE"

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -0,0 +1,62 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- breathe
+- c-compiler
+- clang-tools=16.0.6
+- clang==16.0.6
+- cmake>=3.26.4
+- cuda-profiler-api=11.8.86
+- cuda-python>=11.7.1,<12.0a0
+- cuda-version=11.8
+- cudatoolkit
+- cupy>=12.0.0
+- cxx-compiler
+- cython>=3.0.0
+- dask-core==2023.9.2
+- dask-cuda==23.12.*
+- dask==2023.9.2
+- distributed==2023.9.2
+- doxygen>=1.8.20
+- gcc_linux-aarch64=11.*
+- gmock>=1.13.0
+- graphviz
+- gtest>=1.13.0
+- ipython
+- joblib>=0.11
+- libcublas-dev=11.11.3.6
+- libcublas=11.11.3.6
+- libcurand-dev=10.3.0.86
+- libcurand=10.3.0.86
+- libcusolver-dev=11.4.1.48
+- libcusolver=11.4.1.48
+- libcusparse-dev=11.7.5.86
+- libcusparse=11.7.5.86
+- nccl>=2.9.9
+- ninja
+- numba>=0.57
+- numpy>=1.21
+- numpydoc
+- nvcc_linux-aarch64=11.8
+- pre-commit
+- pydata-sphinx-theme
+- pytest
+- pytest-cov
+- recommonmark
+- rmm==23.12.*
+- scikit-build>=0.13.1
+- scikit-learn
+- scipy
+- sphinx-copybutton
+- sphinx-markdown-tables
+- sysroot_linux-aarch64==2.17
+- ucx-proc=*=gpu
+- ucx-py==0.35.*
+- ucx>=1.13.0
+name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml
@@ -0,0 +1,58 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- breathe
+- c-compiler
+- clang-tools=16.0.6
+- clang==16.0.6
+- cmake>=3.26.4
+- cuda-cudart-dev
+- cuda-nvcc
+- cuda-profiler-api
+- cuda-python>=12.0,<13.0a0
+- cuda-version=12.0
+- cupy>=12.0.0
+- cxx-compiler
+- cython>=3.0.0
+- dask-core==2023.9.2
+- dask-cuda==23.12.*
+- dask==2023.9.2
+- distributed==2023.9.2
+- doxygen>=1.8.20
+- gcc_linux-aarch64=11.*
+- gmock>=1.13.0
+- graphviz
+- gtest>=1.13.0
+- ipython
+- joblib>=0.11
+- libcublas-dev
+- libcurand-dev
+- libcusolver-dev
+- libcusparse-dev
+- nccl>=2.9.9
+- ninja
+- numba>=0.57
+- numpy>=1.21
+- numpydoc
+- pre-commit
+- pydata-sphinx-theme
+- pytest
+- pytest-cov
+- recommonmark
+- rmm==23.12.*
+- scikit-build>=0.13.1
+- scikit-learn
+- scipy
+- sphinx-copybutton
+- sphinx-markdown-tables
+- sysroot_linux-aarch64==2.17
+- ucx-proc=*=gpu
+- ucx-py==0.35.*
+- ucx>=1.13.0
+name: all_cuda-120_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -0,0 +1,43 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- benchmark>=1.8.2
+- c-compiler
+- clang-tools=16.0.6
+- clang==16.0.6
+- cmake>=3.26.4
+- cuda-profiler-api=11.8.86
+- cuda-version=11.8
+- cudatoolkit
+- cxx-compiler
+- cython>=3.0.0
+- gcc_linux-aarch64=11.*
+- glog>=0.6.0
+- h5py>=3.8.0
+- hnswlib=0.7.0
+- libcublas-dev=11.11.3.6
+- libcublas=11.11.3.6
+- libcurand-dev=10.3.0.86
+- libcurand=10.3.0.86
+- libcusolver-dev=11.4.1.48
+- libcusolver=11.4.1.48
+- libcusparse-dev=11.7.5.86
+- libcusparse=11.7.5.86
+- matplotlib
+- nccl>=2.9.9
+- ninja
+- nlohmann_json>=3.11.2
+- nvcc_linux-aarch64=11.8
+- openblas
+- pandas
+- pyyaml
+- rmm==23.12.*
+- scikit-build>=0.13.1
+- sysroot_linux-aarch64==2.17
+name: bench_ann_cuda-118_arch-aarch64
diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh
@@ -21,6 +21,22 @@
 
 namespace raft::distance::detail::ops {
 
+/**
+ * Round-off tolerance for DataT, selected by sizeof(DataT): 1e-3 (2 bytes), 1e-6 (4 bytes),
+ * 1e-15 (8 bytes), 0 otherwise. Reserves 1 digit of precision from each floating-point type.
+ * @tparam DataT type of the returned tolerance; only its size is inspected
+ */
+template <typename DataT>
+__device__ constexpr DataT get_clamp_precision()
+{
+  switch (sizeof(DataT)) {
+    case 2: return 1e-3;
+    case 4: return 1e-6;
+    case 8: return 1e-15;
+    default: return 0;
+  }
+}
+
 // Epilogue operator for CUTLASS based kernel
 template <typename DataT, typename AccT>
 struct l2_exp_cutlass_op {
@@ -31,11 +47,13 @@ struct l2_exp_cutlass_op {
   __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
   {
     AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
-    // outVal could be negative due to numerical instability, especially when
-    // calculating self distance.
-    // clamp to 0 to avoid potential NaN in sqrt
-    outVal = outVal * (raft::abs(outVal) >= DataT(0.0001));
-    return sqrt ? raft::sqrt(outVal) : outVal;
+
+    /**
+     * Self-neighboring points should have aNorm == bNorm == accVal, but the dot product (accVal)
+     * can have round-off errors, leaving it only approximately equal to the norms instead.
+     */
+    outVal = outVal * !((outVal * outVal < get_clamp_precision<DataT>()) * (aNorm == bNorm));
+    return sqrt ? raft::sqrt(outVal * (outVal > 0)) : outVal;
   }
 
   __device__ AccT operator()(DataT aData) const noexcept { return aData; }
@@ -86,10 +104,16 @@ struct l2_exp_distance_op {
     for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < Policy::AccColsPerTh; ++j) {
-        DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
-        // val could be negative due to numerical instability, especially when
-        // calculating self distance. Clamp to 0 to avoid potential NaN in sqrt
-        acc[i][j] = val * (raft::abs(val) >= DataT(0.0001));
+        DataT accVal = acc[i][j];
+        DataT val    = regxn[i] + regyn[j] - (DataT)2.0 * accVal;
+
+        /**
+         * Self-neighboring points should have regxn[i] == regyn[j] == accVal, but the dot product
+         * (accVal) can have round-off errors, leaving it only approximately equal to the norms
+         * instead.
+         */
+        acc[i][j] =
+          val * (val > 0) * !((val * val < get_clamp_precision<DataT>()) * (regxn[i] == regyn[j]));
       }
     }
     if (sqrt) {

diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -29,6 +29,7 @@
 #include <cstdint>
 #include <iostream>
 #include <raft/core/resources.hpp>
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/map.cuh>
@@ -186,6 +187,7 @@ void tiled_brute_force_knn(const raft::resources& handle,
         auto row_norms = search_norms.data();
         auto col_norms = precomputed_index_norms ? precomputed_index_norms : index_norms.data();
         auto dist      = temp_distances.data();
+        bool sqrt      = metric == raft::distance::DistanceType::L2SqrtExpanded;
 
         raft::linalg::map_offset(
           handle,
@@ -194,15 +196,9 @@ void tiled_brute_force_knn(const raft::resources& handle,
             IndexType row = i + (idx / current_centroid_size);
             IndexType col = j + (idx % current_centroid_size);
 
-            auto val = row_norms[row] + col_norms[col] - 2.0 * dist[idx];
-
-            // due to numerical instability (especially around self-distance)
-            // the distances here could be slightly negative, which will
-            // cause NaN values in the subsequent sqrt. Clamp to 0
-            val = val * (val >= 0.0001);
-            if (metric == raft::distance::DistanceType::L2SqrtExpanded) { val = sqrt(val); }
-            val = distance_epilogue(val, row, col);
-            return val;
+            raft::distance::detail::ops::l2_exp_cutlass_op<ElementType, ElementType> l2_op(sqrt);
+            auto val = l2_op(row_norms[row], col_norms[col], dist[idx]);
+            return distance_epilogue(val, row, col);
           });
       } else if (metric == raft::distance::DistanceType::CosineExpanded) {
         auto row_norms = search_norms.data();

diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
@@ -57,6 +57,7 @@ RAFT_KERNEL naiveKernel(raft::KeyValuePair<int, DataT>* min,
     auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx];
     acc += diff * diff;
   }
+
   if (Sqrt) { acc = raft::sqrt(acc); }
   ReduceOpT redOp;
   typedef cub::WarpReduce<raft::KeyValuePair<int, DataT>> WarpReduce;
@@ -343,7 +344,7 @@ const std::vector<Inputs<double>> inputsd = {
   {0.00001, 128, 32, 33, 1234ULL},  {0.00001, 128, 64, 33, 1234ULL},
   {0.00001, 128, 128, 65, 1234ULL}, {0.00001, 64, 128, 129, 1234ULL},
 
-  {0.00001, 1805, 134, 2, 1234ULL}, {0.00001, 8192, 1024, 25, 1234ULL},
+  {0.00001, 1805, 134, 2, 1234ULL},  //{0.00001, 8192, 1024, 25, 1234ULL},
 };
 typedef FusedL2NNTest<double, false> FusedL2NNTestD_Sq;
 TEST_P(FusedL2NNTestD_Sq, Result)

diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
@@ -274,7 +274,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
                                   distances_Cagra,
                                   ps.n_queries,
                                   ps.k,
-                                  0.001,
+                                  0.003,
                                   min_recall));
       EXPECT_TRUE(eval_distances(handle_,
                                  database.data(),
@@ -457,6 +457,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
         cagra::index_params index_params;
         index_params.metric = ps.metric;  // Note: currently only the cagra::index_params metric is
                                           // not used for knn_graph building.
+        index_params.nn_descent_niter = 50;
         cagra::search_params search_params;
         search_params.algo         = ps.algo;
         search_params.max_queries  = ps.max_queries;
@@ -515,7 +516,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
                                   distances_Cagra,
                                   ps.n_queries,
                                   ps.k,
-                                  0.001,
+                                  0.003,
                                   min_recall));
       EXPECT_TRUE(eval_distances(handle_,
                                  database.data(),
@@ -571,6 +572,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
         cagra::index_params index_params;
         index_params.metric = ps.metric;  // Note: currently only the cagra::index_params metric is
                                           // not used for knn_graph building.
+        index_params.nn_descent_niter = 50;
         cagra::search_params search_params;
         search_params.algo         = ps.algo;
         search_params.max_queries  = ps.max_queries;
@@ -628,7 +630,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
                                   distances_Cagra,
                                   ps.n_queries,
                                   ps.k,
-                                  0.001,
+                                  0.003,
                                   min_recall));
       EXPECT_TRUE(eval_distances(handle_,
                                  database.data(),

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -312,7 +312,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // Hence, encoding-decoding chain often leads to altering both the PQ codes and the
     // reconstructed data.
     compare_vectors_l2(
-      handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.025);
+      handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.04);  // 0.025);
   }
 
   void check_packing(index<IdxT>* index, uint32_t label)

diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -90,6 +90,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam<AnnNNDescentInputs> {
         index_params.metric                    = ps.metric;
         index_params.graph_degree              = ps.graph_degree;
         index_params.intermediate_graph_degree = 2 * ps.graph_degree;
+        index_params.max_iterations            = 50;
 
         auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), ps.n_rows, ps.dim);

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -4,7 +4,7 @@ files:
     output: conda
     matrix:
       cuda: ["11.8", "12.0"]
-      arch: [x86_64]
+      arch: [x86_64, aarch64]
     includes:
       - build
       - build_pylibraft
@@ -23,7 +23,7 @@ files:
     output: conda
     matrix:
       cuda: ["11.8"]
-      arch: [x86_64]
+      arch: [x86_64, aarch64]
     includes:
       - build
       - develop