Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faiss cagra bench #2216

Draft
wants to merge 5 commits into
base: branch-24.06
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
Expand All @@ -361,6 +365,10 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
Expand Down
56 changes: 23 additions & 33 deletions cpp/bench/ann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ list(APPEND CMAKE_MODULE_PATH "${RAFT_SOURCE_DIR}")
option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA "Include faiss' cagra algorithm wrapper in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)

option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
Expand Down Expand Up @@ -52,16 +53,19 @@ if(BUILD_CPU_ONLY)
include(cmake/thirdparty/get_fmt)
include(cmake/thirdparty/get_spdlog)
set(RAFT_FAISS_ENABLE_GPU OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA OFF)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF)
set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF)
set(RAFT_ANN_BENCH_USE_GGNN OFF)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0)
# Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
# https://github.com/rapidsai/raft/issues/1627
set(RAFT_FAISS_ENABLE_GPU OFF)
# elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) # Disable faiss benchmarks
# on CUDA 12 since faiss is not yet CUDA 12-enabled. #
# https://github.com/rapidsai/raft/issues/1627 set(RAFT_FAISS_ENABLE_GPU ON)
endif()

set(RAFT_ANN_BENCH_USE_RAFT OFF)
Expand Down Expand Up @@ -198,40 +202,25 @@ endfunction()
# * Configure tests-------------------------------------------------------------

if(RAFT_ANN_BENCH_USE_HNSWLIB)
ConfigureAnnBench(
NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
)
ConfigureAnnBench(NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib)

endif()

if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ)
ConfigureAnnBench(
NAME
RAFT_IVF_PQ
PATH
src/raft/raft_benchmark.cu
src/raft/raft_ivf_pq.cu
LINKS
raft::compiled
NAME RAFT_IVF_PQ PATH src/raft/raft_benchmark.cu src/raft/raft_ivf_pq.cu LINKS raft::compiled
)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT)
ConfigureAnnBench(
NAME
RAFT_IVF_FLAT
PATH
src/raft/raft_benchmark.cu
src/raft/raft_ivf_flat.cu
LINKS
NAME RAFT_IVF_FLAT PATH src/raft/raft_benchmark.cu src/raft/raft_ivf_flat.cu LINKS
raft::compiled
)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE)
ConfigureAnnBench(
NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled
)
ConfigureAnnBench(NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
Expand Down Expand Up @@ -260,36 +249,37 @@ message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}")
message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

# Benchmark executable for the FAISS GPU CAGRA wrapper. Built only when the user
# option is ON and FAISS GPU support is available (RAFT_FAISS_ENABLE_GPU is
# switched OFF earlier in this file for CPU-only builds — see the guard logic above).
if(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_CAGRA PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

Expand Down
1 change: 1 addition & 0 deletions cpp/bench/ann/src/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ void bench_search(::benchmark::State& state,

// Each thread calculates recall on their partition of queries.
// evaluate recall
// std::cout << "max k: " << dataset->max_k() << std::endl;
if (dataset->max_k() >= k) {
const std::int32_t* gt = dataset->gt_set();
const std::uint32_t max_k = dataset->max_k();
Expand Down
22 changes: 22 additions & 0 deletions cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ void parse_build_param(const nlohmann::json& conf,
param.quantizer_type = conf.at("quantizer_type");
}

// Extracts the HNSW build parameters from the benchmark's JSON configuration.
// Required keys: "efConstruction" and "M"; nlohmann::json::at() throws if a key
// is absent, so a malformed config fails loudly at parse time.
template <typename T>
void parse_build_param(const nlohmann::json& conf,
                       typename raft::bench::ann::FaissCpuHNSW<T>::BuildParam& param)
{
  param.ef_construction = conf.at("efConstruction").template get<int>();
  param.M               = conf.at("M").template get<int>();
}

template <typename T>
void parse_search_param(const nlohmann::json& conf,
typename raft::bench::ann::FaissCpu<T>::SearchParam& param)
Expand All @@ -80,6 +88,14 @@ void parse_search_param(const nlohmann::json& conf,
if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
}

// Extracts the HNSW search parameters from the benchmark's JSON configuration.
// "ef" is mandatory; "numThreads" is optional and, when absent, leaves
// param.num_threads at whatever value it already holds.
template <typename T>
void parse_search_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissCpuHNSW<T>::SearchParam& param)
{
  param.ef = conf.at("ef").template get<int>();
  if (conf.contains("numThreads")) {
    param.num_threads = conf.at("numThreads").template get<int>();
  }
}

template <typename T, template <typename> class Algo>
std::unique_ptr<raft::bench::ann::ANN<T>> make_algo(raft::bench::ann::Metric metric,
int dim,
Expand Down Expand Up @@ -125,6 +141,8 @@ std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
ann = make_algo<T, raft::bench::ann::FaissCpuIVFSQ>(metric, dim, conf);
} else if (algo == "faiss_cpu_flat") {
ann = std::make_unique<raft::bench::ann::FaissCpuFlat<T>>(metric, dim);
} else if (algo == "faiss_cpu_hnsw") {
ann = make_algo<T, raft::bench::ann::FaissCpuHNSW>(metric, dim, conf);
}
}

Expand All @@ -146,6 +164,10 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
} else if (algo == "faiss_cpu_flat") {
auto param = std::make_unique<typename raft::bench::ann::FaissCpu<T>::SearchParam>();
return param;
} else if (algo == "faiss_cpu_hnsw") {
auto param = std::make_unique<typename raft::bench::ann::FaissCpuHNSW<T>::SearchParam>();
parse_search_param<T>(conf, *param);
return param;
}
// else
throw std::runtime_error("invalid algo: '" + algo + "'");
Expand Down
114 changes: 85 additions & 29 deletions cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,16 @@
#pragma once

#include "../common/ann_types.hpp"
#include "../common/thread_pool.hpp"

#include <raft/core/logger.hpp>

#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexRefine.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/impl/HNSW.h>
#include <faiss/index_io.h>

#include <cassert>
Expand Down Expand Up @@ -73,7 +74,7 @@ class FaissCpu : public ANN<T> {
static_assert(std::is_same_v<T, float>, "faiss support only float type");
}

void build(const T* dataset, size_t nrow) final;
virtual void build(const T* dataset, size_t nrow);

void set_search_param(const AnnSearchParam& param) override;

Expand All @@ -88,11 +89,11 @@ class FaissCpu : public ANN<T> {

// TODO: if the number of results is less than k, the remaining elements of 'neighbors'
// will be filled with (size_t)-1
void search(const T* queries,
int batch_size,
int k,
AnnBase::index_type* neighbors,
float* distances) const final;
virtual void search(const T* queries,
int batch_size,
int k,
AnnBase::index_type* neighbors,
float* distances) const;

AlgoProperty get_preference() const override
{
Expand All @@ -116,9 +117,6 @@ class FaissCpu : public ANN<T> {
faiss::MetricType metric_type_;
int nlist_;
double training_sample_fraction_;

int num_threads_;
std::shared_ptr<FixedThreadPool> thread_pool_;
};

template <typename T>
Expand Down Expand Up @@ -149,12 +147,14 @@ void FaissCpu<T>::build(const T* dataset, size_t nrow)
index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing
assert(index_->is_trained);
index_->add(nrow, dataset);
std::cout << "finished adding" << std::endl;
index_refine_ = std::make_shared<faiss::IndexRefineFlat>(this->index_.get(), dataset);
}

template <typename T>
void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
{
std::cout << "should not be here" << std::endl;
auto search_param = dynamic_cast<const SearchParam&>(param);
int nprobe = search_param.nprobe;
assert(nprobe <= nlist_);
Expand All @@ -164,9 +164,12 @@ void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
this->index_refine_.get()->k_factor = search_param.refine_ratio;
}

if (!thread_pool_ || num_threads_ != search_param.num_threads) {
num_threads_ = search_param.num_threads;
thread_pool_ = std::make_shared<FixedThreadPool>(num_threads_);
if (param.metric_objective == Objective::LATENCY) {
// Let FAISS use its internal threading model with user defined `numThreads`
omp_set_num_threads(search_param.num_threads);
} else if (param.metric_objective == Objective::THROUGHPUT) {
// FAISS is not allowed to internally parallelize
omp_set_num_threads(1);
}
}

Expand All @@ -177,12 +180,7 @@ void FaissCpu<T>::search(
static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
"sizes of size_t and faiss::idx_t are different");

thread_pool_->submit(
[&](int i) {
// Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1.
index_->search(batch_size, queries, k, distances, reinterpret_cast<faiss::idx_t*>(neighbors));
},
1);
index_->search(batch_size, queries, k, distances, reinterpret_cast<faiss::idx_t*>(neighbors));
}

template <typename T>
Expand Down Expand Up @@ -301,16 +299,6 @@ class FaissCpuFlat : public FaissCpu<T> {
this->index_ = std::make_shared<faiss::IndexFlat>(dim, this->metric_type_);
}

// class FaissCpu is more like a IVF class, so need special treating here
void set_search_param(const typename ANN<T>::AnnSearchParam& param) override
{
auto search_param = dynamic_cast<const typename FaissCpu<T>::SearchParam&>(param);
if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) {
this->num_threads_ = search_param.num_threads;
this->thread_pool_ = std::make_shared<FixedThreadPool>(this->num_threads_);
}
};

void save(const std::string& file) const override
{
this->template save_<faiss::IndexFlat>(file);
Expand All @@ -323,4 +311,72 @@ class FaissCpuFlat : public FaissCpu<T> {
}
};

// ANN-bench wrapper around faiss::IndexHNSWFlat (CPU HNSW over raw vectors).
// Reuses FaissCpu<T> for (de)serialization and metric handling but replaces the
// index object and overrides build/search, since HNSW has no IVF-style training
// or refine stage.
template <typename T>
class FaissCpuHNSW : public FaissCpu<T> {
 public:
  // Build-time parameters for faiss::IndexHNSWFlat.
  struct BuildParam {
    int M;                // number of graph neighbors per node
    int ef_construction;  // candidate-list size while constructing the graph
  };

  using typename ANN<T>::AnnSearchParam;
  // Search-time parameters.
  struct SearchParam : public AnnSearchParam {
    int ef;           // candidate-list size during search (hnsw.efSearch)
    int num_threads;  // OpenMP thread count, applied in LATENCY mode only
  };

  FaissCpuHNSW(Metric metric, int dim, const BuildParam& param)
    : FaissCpu<T>(metric, dim, typename FaissCpu<T>::BuildParam())
  {
    // Replace the index created by the base class with an HNSW flat index and
    // apply the construction-time expansion factor.
    this->index_ = std::make_shared<faiss::IndexHNSWFlat>(param.M, dim, this->metric_type_);
    dynamic_cast<faiss::IndexHNSWFlat*>(this->index_.get())->hnsw.efConstruction =
      param.ef_construction;
  }

  void save(const std::string& file) const override
  {
    this->template save_<faiss::IndexHNSWFlat>(file);
  }
  void load(const std::string& file) override { this->template load_<faiss::IndexHNSWFlat>(file); }

  // NOTE(review): not declared `override` in the original — confirm whether
  // ANN<T>::copy() is virtual before adding the specifier.
  std::unique_ptr<ANN<T>> copy()
  {
    return std::make_unique<FaissCpuHNSW<T>>(*this);  // use copy constructor
  }

  void set_search_param(const AnnSearchParam& param) override
  {
    auto search_param = dynamic_cast<const SearchParam&>(param);
    if (search_param.metric_objective == Objective::LATENCY) {
      // Let FAISS use its internal threading model with user defined `numThreads`
      omp_set_num_threads(search_param.num_threads);
    } else if (search_param.metric_objective == Objective::THROUGHPUT) {
      // FAISS is not allowed to internally parallelize
      omp_set_num_threads(1);
    }
    search_params_.efSearch = search_param.ef;
  }

  void build(const T* dataset, size_t nrow) override
  {
    this->index_->train(nrow, dataset);  // faiss::IndexHNSWFlat::train() will do nothing
    assert(this->index_->is_trained);
    this->index_->add(nrow, dataset);
    // (removed leftover debug print to stdout)
  }

  // Parameter types match the base-class virtual (AnnBase::index_type*); FAISS
  // expects idx_t indices, which the base search path static_asserts to have
  // the same size as size_t, so the cast below is safe under that assumption.
  void search(const T* queries,
              int batch_size,
              int k,
              AnnBase::index_type* neighbors,
              float* distances) const override
  {
    this->index_->search(batch_size,
                         queries,
                         k,
                         distances,
                         reinterpret_cast<faiss::idx_t*>(neighbors),
                         &search_params_);
  }

 private:
  faiss::SearchParametersHNSW search_params_;
};
} // namespace raft::bench::ann
Loading
Loading