From 5f740abfe425a020b0d59c2dc1d4141c0ee224df Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 20 Oct 2023 18:24:18 -0400
Subject: [PATCH 01/19] Adding throughput mode to benhcmarksy

---
 cpp/bench/ann/src/common/ann_types.hpp        |   5 +
 cpp/bench/ann/src/common/benchmark.hpp        | 118 ++++++++++++++----
 cpp/bench/ann/src/common/thread_pool.hpp      |  73 +++++++++++
 .../src/raft-ann-bench/run/__main__.py        |  12 ++
 4 files changed, 187 insertions(+), 21 deletions(-)
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index 33716bd45a..d32c8b64b3 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -24,6 +24,11 @@
 
 namespace raft::bench::ann {
 
+enum Objective {
+  THROUGHPUT,  // See how many vectors we can push through
+  LATENCY      // See how many
+};
+
 enum class MemoryType {
   Host,
   HostMmap,
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 4ec977700d..4991bcae71 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -22,6 +22,9 @@
 
 #include <benchmark/benchmark.h>
 
+#include <rmm/cuda_stream_pool.hpp>
+
+#include "thread_pool.hpp"
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -164,7 +167,8 @@ template <typename T>
 void bench_search(::benchmark::State& state,
                   std::shared_ptr<const Dataset<T>> dataset,
                   Configuration::Index index,
-                  std::size_t search_param_ix)
+                  std::size_t search_param_ix,
+                  Objective metric_objective)
 {
   const auto& sp_json = index.search_params[search_param_ix];
   dump_parameters(state, sp_json);
@@ -221,21 +225,52 @@ void bench_search(::benchmark::State& state,
 
   std::ptrdiff_t batch_offset   = 0;
   std::size_t queries_processed = 0;
+
   cuda_timer gpu_timer;
+
   {
+    /**
+     * When the objective is throughput, we want to overlap batches
+     * as much as possible and measure the end-to-end time from start
+     * to finish.
+     *
+     * When the objective is latency, we want to measure each batch
+     * individually. Latency is better measured in single-query batches
+     * but larger batches are still allowed in this mode in order to
+     * compare against the resulting batch sizes in throughput mode.
+     */
+    bool overlap_queries = metric_objective == Objective::THROUGHPUT;
+    ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0);
+
+    // TODO: Make this configurable
+    rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 0);
+
     nvtx_case nvtx{state.name()};
+
     for (auto _ : state) {
-      // measure the GPU time using the RAII helper
-      [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      if (metric_objective == Objective::LATENCY) {
+        // measure the GPU time using the RAII helper
+        [[maybe_unused]] auto ntx_lap = nvtx.lap();
+        [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      }
       // run the search
       try {
-        algo->search(query_set + batch_offset * dataset->dim(),
-                     n_queries,
-                     k,
-                     neighbors.data + batch_offset * k,
-                     distances.data + batch_offset * k,
-                     gpu_timer.stream());
+        cudaStream_t stream =
+          overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream();
+        auto f = [=]() {
+          algo->search(query_set + batch_offset * dataset->dim(),
+                       n_queries,
+                       k,
+                       neighbors.data + batch_offset * k,
+                       distances.data + batch_offset * k,
+                       stream);
+        };
+
+        if (overlap_queries) {
+          thread_pool.run(f);
+        } else {
+          f();
+        }
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
@@ -243,7 +278,23 @@ void bench_search(::benchmark::State& state,
       batch_offset = (batch_offset + n_queries) % query_set_size;
       queries_processed += n_queries;
     }
+
+    /**
+     * If in throughput mode, wait for all results
+     */
+    if (metric_objective == Objective::THROUGHPUT) {
+      for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) {
+        cudaStreamSynchronize(stream_pool.get_stream(i));
+      }
+
+      thread_pool.synchronize();
+
+      // measure the GPU time using the RAII helper
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+    }
   }
+
   state.SetItemsProcessed(queries_processed);
   state.counters.insert({{"k", k}, {"n_queries", n_queries}});
   if (cudart.found()) {
@@ -325,14 +376,15 @@ void register_build(std::shared_ptr<const Dataset<T>> dataset,
 
 template <typename T>
 void register_search(std::shared_ptr<const Dataset<T>> dataset,
-                     std::vector<Configuration::Index> indices)
+                     std::vector<Configuration::Index> indices,
+                     Objective metric_objective)
 {
   for (auto index : indices) {
     for (std::size_t i = 0; i < index.search_params.size(); i++) {
       auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
       index.search_params[i].erase("override_suffix");
-      auto* b =
-        ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, dataset, index, i);
+      auto* b = ::benchmark::RegisterBenchmark(
+        index.name + suf, bench_search<T>, dataset, index, i, metric_objective);
       b->Unit(benchmark::kMillisecond);
       b->UseRealTime();
     }
@@ -346,7 +398,8 @@ void dispatch_benchmark(const Configuration& conf,
                         bool search_mode,
                         std::string data_prefix,
                         std::string index_prefix,
-                        kv_series override_kv)
+                        kv_series override_kv,
+                        Objective metric_objective)
 {
   if (cudart.found()) {
     for (auto [key, value] : cuda_info()) {
@@ -414,7 +467,7 @@ void dispatch_benchmark(const Configuration& conf,
       index.search_params = apply_overrides(index.search_params, override_kv);
       index.file          = combine_path(index_prefix, index.file);
     }
-    register_search<T>(dataset, indices);
+    register_search<T>(dataset, indices, metric_objective);
   }
 }
 
@@ -445,6 +498,7 @@ inline auto run_main(int argc, char** argv) -> int
   std::string data_prefix     = "data";
   std::string index_prefix    = "index";
   std::string new_override_kv = "";
+  std::string mode            = "latency";
   kv_series override_kv{};
 
   char arg0_default[] = "benchmark";  // NOLINT
@@ -467,6 +521,7 @@ inline auto run_main(int argc, char** argv) -> int
         parse_bool_flag(argv[i], "--search", search_mode) ||
         parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
         parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
+        parse_string_flag(argv[i], "--mode", mode) ||
         parse_string_flag(argv[i], "--override_kv", new_override_kv)) {
       if (!new_override_kv.empty()) {
         auto kvv = split(new_override_kv, ':');
@@ -486,6 +541,9 @@ inline auto run_main(int argc, char** argv) -> int
     }
   }
 
+  Objective metric_objective = Objective::LATENCY;
+  if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; }
+
   if (build_mode == search_mode) {
     log_error("One and only one of --build and --search should be specified");
     printf_usage();
@@ -505,14 +563,32 @@ inline auto run_main(int argc, char** argv) -> int
   std::string dtype = conf.get_dataset_conf().dtype;
 
   if (dtype == "float") {
-    dispatch_benchmark<float>(
-      conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv);
+    dispatch_benchmark<float>(conf,
+                              force_overwrite,
+                              build_mode,
+                              search_mode,
+                              data_prefix,
+                              index_prefix,
+                              override_kv,
+                              metric_objective);
   } else if (dtype == "uint8") {
-    dispatch_benchmark<std::uint8_t>(
-      conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv);
+    dispatch_benchmark<std::uint8_t>(conf,
+                                     force_overwrite,
+                                     build_mode,
+                                     search_mode,
+                                     data_prefix,
+                                     index_prefix,
+                                     override_kv,
+                                     metric_objective);
   } else if (dtype == "int8") {
-    dispatch_benchmark<std::int8_t>(
-      conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv);
+    dispatch_benchmark<std::int8_t>(conf,
+                                    force_overwrite,
+                                    build_mode,
+                                    search_mode,
+                                    data_prefix,
+                                    index_prefix,
+                                    override_kv,
+                                    metric_objective);
   } else {
     log_error("datatype '%s' is not supported", dtype.c_str());
     return -1;
diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp
index efea938d5b..def38ec0b5 100644
--- a/cpp/bench/ann/src/common/thread_pool.hpp
+++ b/cpp/bench/ann/src/common/thread_pool.hpp
@@ -16,10 +16,12 @@
 #pragma once
 
 #include <atomic>
+#include <condition_variable>
 #include <future>
 #include <memory>
 #include <mutex>
 #include <omp.h>
+#include <queue>
 #include <stdexcept>
 #include <thread>
 #include <utility>
@@ -72,6 +74,7 @@ class FixedThreadPool {
   template <typename Func, typename IdxT>
   void submit(Func f, IdxT len)
   {
+    // Run functions in main thread if thread pool has no threads
     if (threads_.empty()) {
       for (IdxT i = 0; i < len; ++i) {
         f(i);
@@ -84,6 +87,7 @@ class FixedThreadPool {
     const IdxT items_per_thread = len / (num_threads + 1);
     std::atomic<IdxT> cnt(items_per_thread * num_threads);
 
+    // Wrap function
     auto wrapped_f = [&](IdxT start, IdxT end) {
       for (IdxT i = start; i < end; ++i) {
         f(i);
@@ -129,3 +133,72 @@ class FixedThreadPool {
   std::vector<std::thread> threads_;
   std::atomic<bool> finished_{false};
 };
+
+class ThreadPool {
+ public:
+  ThreadPool(unsigned num_threads = std::thread::hardware_concurrency())
+  {
+    while (num_threads--) {
+      threads.emplace_back([this] {
+        while (true) {
+          std::unique_lock<std::mutex> lock(mutex);
+          condvar.wait(lock, [this] { return !queue.empty(); });
+          auto task = std::move(queue.front());
+          if (task.valid()) {
+            queue.pop();
+            lock.unlock();
+            // run the task - this cannot throw; any exception
+            // will be stored in the corresponding future
+            task();
+          } else {
+            // an empty task is used to signal end of stream
+            // don't pop it off the top; all threads need to see it
+            break;
+          }
+        }
+      });
+    }
+  }
+
+  template <typename F, typename R = std::result_of_t<F && ()>>
+  std::future<R> run(F&& f) const
+  {
+    auto task   = std::packaged_task<R()>(std::forward<F>(f));
+    auto future = task.get_future();
+    {
+      std::lock_guard<std::mutex> lock(mutex);
+      // conversion to packaged_task<void()> erases the return type
+      // so it can be stored in the queue. the future will still
+      // contain the correct type
+      queue.push(std::packaged_task<void()>(std::move(task)));
+    }
+    condvar.notify_one();
+    return future;
+  }
+
+  void synchronize()
+  {
+    condvar.notify_all();
+    for (auto& thread : threads) {
+      thread.join();
+    }
+  }
+
+  ~ThreadPool()
+  {
+    // push a single empty task onto the queue and notify all threads,
+    // then wait for them to terminate
+    {
+      std::lock_guard<std::mutex> lock(mutex);
+      queue.push({});
+    }
+
+    synchronize();
+  }
+
+ private:
+  std::vector<std::thread> threads;
+  mutable std::queue<std::packaged_task<void()>> queue;
+  mutable std::mutex mutex;
+  mutable std::condition_variable condvar;
+};
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index a0d4fabb77..d1f85f05db 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -78,6 +78,7 @@ def run_build_and_search(
     search,
     k,
     batch_size,
+    mode="throughput",
 ):
     for executable, ann_executable_path, algo in executables_to_run.keys():
         # Need to write temporary configuration
@@ -126,6 +127,7 @@ def run_build_and_search(
                 "--benchmark_out_format=json",
                 "--benchmark_out="
                 + f"{os.path.join(search_folder, f'{algo}.json')}",
+                "--mode=" % str(mode),
             ]
             if force:
                 cmd = cmd + ["--overwrite"]
@@ -211,6 +213,14 @@ def main():
         action="store_true",
     )
 
+    parser.add_argument(
+        "-m",
+        "--search_mode",
+        help="run search in 'latency' (measure individual batches) or "
+        "'throughput' (pipeline batches and measure end-to-end) mode",
+        default="throughput",
+    )
+
     args = parser.parse_args()
 
     # If both build and search are not provided,
@@ -222,6 +232,7 @@ def main():
         build = args.build
         search = args.search
 
+    mode = args.search_mode
     k = args.count
     batch_size = args.batch_size
 
@@ -316,6 +327,7 @@ def main():
         search,
         k,
         batch_size,
+        mode,
     )
 
 

From 1f75edecad62a2f5f41bac2699c638a8e7f16f07 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 20 Oct 2023 18:42:40 -0400
Subject: [PATCH 02/19] Adding docs

---
 cpp/bench/ann/src/common/benchmark.hpp                   | 6 +++++-
 cpp/bench/ann/src/common/thread_pool.hpp                 | 1 +
 python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 4991bcae71..3dedbdd014 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -339,6 +339,7 @@ inline void printf_usage()
     "          [--data_prefix=<prefix>]\n"
     "          [--index_prefix=<prefix>]\n"
     "          [--override_kv=<key:value1:value2:...:valueN>]\n"
+    "          [--mode=<latency|throughput>\n"
     "          <conf>.json\n"
     "\n"
     "Note the non-standard benchmark parameters:\n"
@@ -353,7 +354,10 @@ inline void printf_usage()
     "  --override_kv=<key:value1:value2:...:valueN>:"
     " override a build/search key one or more times multiplying the number of configurations;"
     " you can use this parameter multiple times to get the Cartesian product of benchmark"
-    " configs.\n");
+    " configs.\n"
+    "  --mode=<latency|throughput>"
+    "     run the benchmarks in latency (accumulate times spent in each batch) or "
+    "     throughput (pipeline batches and measure end-to-end) mode\n");
 }
 
 template <typename T>
diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp
index def38ec0b5..d904be2000 100644
--- a/cpp/bench/ann/src/common/thread_pool.hpp
+++ b/cpp/bench/ann/src/common/thread_pool.hpp
@@ -188,6 +188,7 @@ class ThreadPool {
   {
     // push a single empty task onto the queue and notify all threads,
     // then wait for them to terminate
+
     {
       std::lock_guard<std::mutex> lock(mutex);
       queue.push({});
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index d1f85f05db..02cfadf809 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -127,7 +127,7 @@ def run_build_and_search(
                 "--benchmark_out_format=json",
                 "--benchmark_out="
                 + f"{os.path.join(search_folder, f'{algo}.json')}",
-                "--mode=" % str(mode),
+                "--mode=%s" % mode,
             ]
             if force:
                 cmd = cmd + ["--overwrite"]

From c11685c3d46d8acd67d9082901eafea16dfa82ad Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 20 Oct 2023 21:35:36 -0400
Subject: [PATCH 03/19] Will probably have to integrate stream logic into the
 threads of the thread pool

---
 cpp/bench/ann/CMakeLists.txt                  | 24 +++++++-------
 cpp/bench/ann/src/common/ann_types.hpp        |  3 +-
 cpp/bench/ann/src/common/benchmark.hpp        | 33 +++++++++++++++----
 cpp/bench/ann/src/common/thread_pool.hpp      | 15 +++------
 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h   | 26 ++++++++++-----
 cpp/bench/ann/src/raft/raft_benchmark.cu      | 10 +++++-
 .../src/raft-ann-bench/run/__main__.py        |  1 +
 7 files changed, 74 insertions(+), 38 deletions(-)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 502f371a25..1f98cdbe88 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -15,18 +15,22 @@
 # ##################################################################################################
 # * benchmark options ------------------------------------------------------------------------------
 
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark"
+       OFF
+)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF)
 option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT
-       "Include faiss' cpu brute-force knn algorithm in benchmark" ON
+       "Include faiss' cpu brute-force knn algorithm in benchmark" OFF
+)
+option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark"
+       OFF
 )
-option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)
 
 option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
-       ON
+       OFF
 )
-option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" OFF)
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON)
@@ -106,10 +110,8 @@ if(RAFT_ANN_BENCH_USE_GGNN)
 endif()
 
 if(RAFT_ANN_BENCH_USE_FAISS)
-  # We need to ensure that faiss has all the conda
-  # information. So we currently use the very ugly
-  # hammer of `link_libraries` to ensure that all
-  # targets in this directory and the faiss directory
+  # We need to ensure that faiss has all the conda information. So we currently use the very ugly
+  # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory
   # will have the conda includes/link dirs
   link_libraries($<TARGET_NAME_IF_EXISTS:conda_env>)
   include(cmake/thirdparty/get_faiss.cmake)
diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index d32c8b64b3..f192d56e89 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -84,7 +84,8 @@ template <typename T>
 class ANN : public AnnBase {
  public:
   struct AnnSearchParam {
-    virtual ~AnnSearchParam() = default;
+    Objective metric_objective = Objective::LATENCY;
+    virtual ~AnnSearchParam()  = default;
     [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; };
   };
 
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 3dedbdd014..e5b2014678 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -20,6 +20,8 @@
 #include "dataset.hpp"
 #include "util.hpp"
 
+#include <raft/util/cudart_utils.hpp>
+
 #include <benchmark/benchmark.h>
 
 #include <rmm/cuda_stream_pool.hpp>
@@ -29,6 +31,7 @@
 #include <cmath>
 #include <cstdint>
 #include <fstream>
+#include <future>
 #include <limits>
 #include <memory>
 #include <numeric>
@@ -200,7 +203,8 @@ void bench_search(::benchmark::State& state,
       algo->load(index_file);
       current_algo = std::move(ualgo);
     }
-    search_param = ann::create_search_param<T>(index.algo, sp_json);
+    search_param                   = ann::create_search_param<T>(index.algo, sp_json);
+    search_param->metric_objective = metric_objective;
   } catch (const std::exception& e) {
     return state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
   }
@@ -228,7 +232,10 @@ void bench_search(::benchmark::State& state,
 
   cuda_timer gpu_timer;
 
+  uint32_t start = raft::curTimeMillis();
+
   {
+    std::vector<std::future<void>> futures;
     /**
      * When the objective is throughput, we want to overlap batches
      * as much as possible and measure the end-to-end time from start
@@ -243,10 +250,11 @@ void bench_search(::benchmark::State& state,
     ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0);
 
     // TODO: Make this configurable
-    rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 0);
+    rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 1);
 
     nvtx_case nvtx{state.name()};
 
+    printf("Running state...\n");
     for (auto _ : state) {
       if (metric_objective == Objective::LATENCY) {
         // measure the GPU time using the RAII helper
@@ -257,7 +265,7 @@ void bench_search(::benchmark::State& state,
       try {
         cudaStream_t stream =
           overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream();
-        auto f = [=]() {
+        auto f = [&]() {
           algo->search(query_set + batch_offset * dataset->dim(),
                        n_queries,
                        k,
@@ -267,7 +275,7 @@ void bench_search(::benchmark::State& state,
         };
 
         if (overlap_queries) {
-          thread_pool.run(f);
+          futures.emplace_back(thread_pool.run(f));
         } else {
           f();
         }
@@ -279,22 +287,35 @@ void bench_search(::benchmark::State& state,
       queries_processed += n_queries;
     }
 
+    printf("Done running state\n");
+
     /**
      * If in throughput mode, wait for all results
      */
     if (metric_objective == Objective::THROUGHPUT) {
+      // Synchronize all the threads.
+      for (auto& f : futures) {
+        f.wait();
+      }
+
+      std::cout << "Futures size: " << futures.size() << std::endl;
+      futures.clear();
+
       for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) {
         cudaStreamSynchronize(stream_pool.get_stream(i));
       }
 
-      thread_pool.synchronize();
-
+      printf("collecting metrics\n");
       // measure the GPU time using the RAII helper
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
       [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
     }
   }
 
+  uint32_t stop = raft::curTimeMillis();
+
+  std::cout << "Took: " << (stop - start) << std::endl;
+
   state.SetItemsProcessed(queries_processed);
   state.counters.insert({{"k", k}, {"n_queries", n_queries}});
   if (cudart.found()) {
diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp
index d904be2000..c28b315919 100644
--- a/cpp/bench/ann/src/common/thread_pool.hpp
+++ b/cpp/bench/ann/src/common/thread_pool.hpp
@@ -138,6 +138,7 @@ class ThreadPool {
  public:
   ThreadPool(unsigned num_threads = std::thread::hardware_concurrency())
   {
+    if (num_threads <= 0) { return; }
     while (num_threads--) {
       threads.emplace_back([this] {
         while (true) {
@@ -176,14 +177,6 @@ class ThreadPool {
     return future;
   }
 
-  void synchronize()
-  {
-    condvar.notify_all();
-    for (auto& thread : threads) {
-      thread.join();
-    }
-  }
-
   ~ThreadPool()
   {
     // push a single empty task onto the queue and notify all threads,
@@ -193,8 +186,10 @@ class ThreadPool {
       std::lock_guard<std::mutex> lock(mutex);
       queue.push({});
     }
-
-    synchronize();
+    condvar.notify_all();
+    for (auto& thread : threads) {
+      thread.join();
+    }
   }
 
  private:
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index df44605493..81c50f2c2e 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -103,6 +103,7 @@ class HnswLib : public ANN<T> {
   int m_;
   int num_threads_;
   std::unique_ptr<FixedThreadPool> thread_pool_;
+  Objective metric_objective_;
 };
 
 template <typename T>
@@ -159,10 +160,12 @@ void HnswLib<T>::build(const T* dataset, size_t nrow, cudaStream_t)
 template <typename T>
 void HnswLib<T>::set_search_param(const AnnSearchParam& param_)
 {
-  auto param     = dynamic_cast<const SearchParam&>(param_);
-  appr_alg_->ef_ = param.ef;
+  auto param        = dynamic_cast<const SearchParam&>(param_);
+  appr_alg_->ef_    = param.ef;
+  metric_objective_ = param.metric_objective;
 
-  if (!thread_pool_ || num_threads_ != param.num_threads) {
+  if (metric_objective_ != Objective::LATENCY &&
+      (!thread_pool_ || num_threads_ != param.num_threads)) {
     num_threads_ = param.num_threads;
     thread_pool_ = std::make_unique<FixedThreadPool>(num_threads_);
   }
@@ -172,12 +175,17 @@ template <typename T>
 void HnswLib<T>::search(
   const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const
 {
-  thread_pool_->submit(
-    [&](int i) {
-      // hnsw can only handle a single vector at a time.
-      get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k);
-    },
-    batch_size);
+  auto f = [&](int i) {
+    // hnsw can only handle a single vector at a time.
+    get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k);
+  };
+  if (metric_objective_ == Objective::LATENCY) {
+    thread_pool_->submit(f, batch_size);
+  } else {
+    for (int i = 0; i < batch_size; i++) {
+      f(i);
+    }
+  }
 }
 
 template <typename T>
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index fa20c5c223..3b9bcc7e15 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -272,5 +272,13 @@ REGISTER_ALGO_INSTANCE(std::uint8_t);
 
 #ifdef ANN_BENCH_BUILD_MAIN
 #include "../common/benchmark.hpp"
-int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); }
+int main(int argc, char** argv)
+{
+  rmm::mr::cuda_memory_resource cuda_mr;
+  // Construct a resource that uses a coalescing best-fit pool allocator
+  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
+  rmm::mr::set_current_device_resource(
+    &pool_mr);  // Updates the current device resource pointer to `pool_mr`
+  return raft::bench::ann::run_main(argc, argv);
+}
 #endif
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index 02cfadf809..05858d0ce9 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -121,6 +121,7 @@ def run_build_and_search(
                 "--search",
                 "--data_prefix=" + dataset_path,
                 "--benchmark_counters_tabular",
+                "--benchmark_min_time=1x",
                 "--override_kv=k:%s" % k,
                 "--override_kv=n_queries:%s" % batch_size,
                 "--benchmark_min_warmup_time=0.01",

From 7d8c907fe0e4e25b05789c54d14824c43b4bf022 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sun, 22 Oct 2023 19:56:45 -0400
Subject: [PATCH 04/19] Still some work to do and need to understand how
 googlebench multithreads the processing a bit better, but so far things are
 looking good.

---
 cpp/bench/ann/src/common/benchmark.hpp        | 154 +++++++-----------
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h   |   5 +-
 .../src/raft-ann-bench/run/__main__.py        |   2 +-
 3 files changed, 60 insertions(+), 101 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index e5b2014678..50d3ef00b1 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -168,13 +168,16 @@ void bench_build(::benchmark::State& state,
 
 template <typename T>
 void bench_search(::benchmark::State& state,
-                  std::shared_ptr<const Dataset<T>> dataset,
                   Configuration::Index index,
                   std::size_t search_param_ix,
-                  Objective metric_objective)
+                  std::shared_ptr<const Dataset<T>> dataset)
 {
+  std::ptrdiff_t batch_offset = 0;
+  std::atomic<std::size_t> queries_processed(0);
+
   const auto& sp_json = index.search_params[search_param_ix];
-  dump_parameters(state, sp_json);
+
+  if (state.thread_index() == 0) { dump_parameters(state, sp_json); }
 
   // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp
   const std::uint32_t k = sp_json["k"];
@@ -184,7 +187,7 @@ void bench_search(::benchmark::State& state,
   const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;
 
   if (!file_exists(index.file)) {
-    state.SkipWithError("Index file is missing. Run the benchmark in the build mode first.");
+    throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first.");
     return;
   }
   // algo is static to cache it between close search runs to save time on index loading
@@ -193,7 +196,8 @@ void bench_search(::benchmark::State& state,
     current_algo.reset();
     index_file = index.file;
   }
-  ANN<T>* algo;
+
+  ANN<T>* algo;  // TODO: Just have one thread load this.
   std::unique_ptr<typename ANN<T>::AnnSearchParam> search_param;
   try {
     if (!current_algo || (algo = dynamic_cast<ANN<T>*>(current_algo.get())) == nullptr) {
@@ -203,15 +207,17 @@ void bench_search(::benchmark::State& state,
       algo->load(index_file);
       current_algo = std::move(ualgo);
     }
-    search_param                   = ann::create_search_param<T>(index.algo, sp_json);
-    search_param->metric_objective = metric_objective;
+    search_param = ann::create_search_param<T>(index.algo, sp_json);
+    //        search_param->metric_objective = metric_objective;
   } catch (const std::exception& e) {
-    return state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+    throw std::runtime_error("Failed to create an algo: " + std::string(e.what()));
   }
   algo->set_search_param(*search_param);
 
   const auto algo_property = parse_algo_property(algo->get_preference(), sp_json);
   const T* query_set       = dataset->query_set(algo_property.query_memory_type);
+
+  // TODO: Have 1 thread create and load these.
   buf<float> distances{algo_property.query_memory_type, k * query_set_size};
   buf<std::size_t> neighbors{algo_property.query_memory_type, k * query_set_size};
 
@@ -220,22 +226,15 @@ void bench_search(::benchmark::State& state,
       algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
                                dataset->base_set_size());
     } catch (const std::exception& ex) {
-      state.SkipWithError("The algorithm '" + index.name +
-                          "' requires the base set, but it's not available. " +
-                          "Exception: " + std::string(ex.what()));
+      throw std::runtime_error("The algorithm '" + index.name +
+                               "' requires the base set, but it's not available. " +
+                               "Exception: " + std::string(ex.what()));
       return;
     }
   }
 
-  std::ptrdiff_t batch_offset   = 0;
-  std::size_t queries_processed = 0;
-
   cuda_timer gpu_timer;
-
-  uint32_t start = raft::curTimeMillis();
-
   {
-    std::vector<std::future<void>> futures;
     /**
      * When the objective is throughput, we want to overlap batches
      * as much as possible and measure the end-to-end time from start
@@ -246,39 +245,21 @@ void bench_search(::benchmark::State& state,
      * but larger batches are still allowed in this mode in order to
      * compare against the resulting batch sizes in throughput mode.
      */
-    bool overlap_queries = metric_objective == Objective::THROUGHPUT;
-    ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0);
-
-    // TODO: Make this configurable
-    rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 1);
 
     nvtx_case nvtx{state.name()};
 
-    printf("Running state...\n");
+    // Multithreading starts in the benchmark loop
     for (auto _ : state) {
-      if (metric_objective == Objective::LATENCY) {
-        // measure the GPU time using the RAII helper
-        [[maybe_unused]] auto ntx_lap = nvtx.lap();
-        [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
-      }
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
       // run the search
       try {
-        cudaStream_t stream =
-          overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream();
-        auto f = [&]() {
-          algo->search(query_set + batch_offset * dataset->dim(),
-                       n_queries,
-                       k,
-                       neighbors.data + batch_offset * k,
-                       distances.data + batch_offset * k,
-                       stream);
-        };
-
-        if (overlap_queries) {
-          futures.emplace_back(thread_pool.run(f));
-        } else {
-          f();
-        }
+        algo->search(query_set + batch_offset * dataset->dim(),
+                     n_queries,
+                     k,
+                     neighbors.data + batch_offset * k,
+                     distances.data + batch_offset * k,
+                     gpu_timer.stream());
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
@@ -286,67 +267,42 @@ void bench_search(::benchmark::State& state,
       batch_offset = (batch_offset + n_queries) % query_set_size;
       queries_processed += n_queries;
     }
-
-    printf("Done running state\n");
-
-    /**
-     * If in throughput mode, wait for all results
-     */
-    if (metric_objective == Objective::THROUGHPUT) {
-      // Synchronize all the threads.
-      for (auto& f : futures) {
-        f.wait();
-      }
-
-      std::cout << "Futures size: " << futures.size() << std::endl;
-      futures.clear();
-
-      for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) {
-        cudaStreamSynchronize(stream_pool.get_stream(i));
-      }
-
-      printf("collecting metrics\n");
-      // measure the GPU time using the RAII helper
-      [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
-    }
   }
 
-  uint32_t stop = raft::curTimeMillis();
-
-  std::cout << "Took: " << (stop - start) << std::endl;
-
+  state.counters.insert({{"total_queries", queries_processed.load()}});
   state.SetItemsProcessed(queries_processed);
-  state.counters.insert({{"k", k}, {"n_queries", n_queries}});
   if (cudart.found()) {
     state.counters.insert({{"GPU Time", gpu_timer.total_time() / state.iterations()},
-                           {"GPU QPS", queries_processed / gpu_timer.total_time()}});
+                           {"GPU QPS", queries_processed.load() / gpu_timer.total_time()}});
   }
   if (state.skipped()) { return; }
 
-  // evaluate recall
-  if (dataset->max_k() >= k) {
-    const std::int32_t* gt          = dataset->gt_set();
-    const std::uint32_t max_k       = dataset->max_k();
-    buf<std::size_t> neighbors_host = neighbors.move(MemoryType::Host);
-
-    std::size_t rows        = std::min(queries_processed, query_set_size);
-    std::size_t match_count = 0;
-    std::size_t total_count = rows * static_cast<size_t>(k);
-    for (std::size_t i = 0; i < rows; i++) {
-      for (std::uint32_t j = 0; j < k; j++) {
-        auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
-        for (std::uint32_t l = 0; l < k; l++) {
-          auto exp_idx = gt[i * max_k + l];
-          if (act_idx == exp_idx) {
-            match_count++;
-            break;
+  if (state.thread_index() == 0) {
+    state.counters.insert({{"k", k}, {"batch_size", n_queries}});
+    // evaluate recall
+    if (dataset->max_k() >= k) {
+      const std::int32_t* gt          = dataset->gt_set();
+      const std::uint32_t max_k       = dataset->max_k();
+      buf<std::size_t> neighbors_host = neighbors.move(MemoryType::Host);
+
+      std::size_t rows        = std::min(queries_processed.load(), query_set_size);
+      std::size_t match_count = 0;
+      std::size_t total_count = rows * static_cast<size_t>(k);
+      for (std::size_t i = 0; i < rows; i++) {
+        for (std::uint32_t j = 0; j < k; j++) {
+          auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
+          for (std::uint32_t l = 0; l < k; l++) {
+            auto exp_idx = gt[i * max_k + l];
+            if (act_idx == exp_idx) {
+              match_count++;
+              break;
+            }
           }
         }
       }
+      double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+      state.counters.insert({{"Recall", actual_recall}});
     }
-    double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
-    state.counters.insert({{"Recall", actual_recall}});
   }
 }
 
@@ -408,10 +364,12 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
     for (std::size_t i = 0; i < index.search_params.size(); i++) {
       auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
       index.search_params[i].erase("override_suffix");
-      auto* b = ::benchmark::RegisterBenchmark(
-        index.name + suf, bench_search<T>, dataset, index, i, metric_objective);
-      b->Unit(benchmark::kMillisecond);
-      b->UseRealTime();
+
+      auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, index, i, dataset)
+                  ->Unit(benchmark::kMillisecond)
+                  ->ThreadRange(1, 16)
+                  ->UseRealTime();
+      std::cout << "Done registering index " << i << std::endl;
     }
   }
 }
diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index 19c5151186..a22cb41d46 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -56,7 +56,8 @@ class RaftCagra : public ANN<T> {
     : ANN<T>(metric, dim),
       index_params_(param),
       dimension_(dim),
-      mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull)
+      mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull),
+      handle_(cudaStreamPerThread)
   {
     rmm::mr::set_current_device_resource(&mr_);
     index_params_.metric = parse_metric_type(metric);
@@ -170,7 +171,7 @@ void RaftCagra<T, IdxT>::search(
                           neighbors_IdxT,
                           batch_size * k,
                           raft::cast_op<size_t>(),
-                          resource::get_cuda_stream(handle_));
+                          raft::resource::get_cuda_stream(handle_));
   }
 
   handle_.sync_stream();
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index 05858d0ce9..e0583d7708 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -121,7 +121,7 @@ def run_build_and_search(
                 "--search",
                 "--data_prefix=" + dataset_path,
                 "--benchmark_counters_tabular",
-                "--benchmark_min_time=1x",
+                # "--benchmark_min_time=1x",
                 "--override_kv=k:%s" % k,
                 "--override_kv=n_queries:%s" % batch_size,
                 "--benchmark_min_warmup_time=0.01",

From 2d6e0ca3080d95a6c3c97a3b0b7ee93635efb37c Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 00:14:12 -0400
Subject: [PATCH 05/19] Successfully having one thread load the data

---
 cpp/bench/ann/src/common/ann_types.hpp      |   9 +-
 cpp/bench/ann/src/common/benchmark.hpp      | 689 ++++++++++----------
 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h |   2 +-
 3 files changed, 365 insertions(+), 335 deletions(-)

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index f192d56e89..f82afb35e9 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -64,10 +64,17 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
   }
 }
 
-struct AlgoProperty {
+class AlgoProperty {
+ public:
+  inline AlgoProperty() {}
+  inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_)
+    : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_)
+  {
+  }
   MemoryType dataset_memory_type;
   // neighbors/distances should have same memory type as queries
   MemoryType query_memory_type;
+  virtual ~AlgoProperty() = default;
 };
 
 class AnnBase {
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 50d3ef00b1..5c755108af 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -28,6 +28,7 @@
 
 #include "thread_pool.hpp"
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <fstream>
@@ -42,6 +43,7 @@
 namespace raft::bench::ann {
 
 static inline std::unique_ptr<AnnBase> current_algo{nullptr};
+static inline std::shared_ptr<AlgoProperty> current_algo_props{nullptr};
 
 using kv_series = std::vector<std::tuple<std::string, std::vector<nlohmann::json>>>;
 
@@ -170,10 +172,13 @@ template <typename T>
 void bench_search(::benchmark::State& state,
                   Configuration::Index index,
                   std::size_t search_param_ix,
-                  std::shared_ptr<const Dataset<T>> dataset)
+                  std::shared_ptr<const Dataset<T>> dataset,
+                  Objective metric_objective)
 {
-  std::ptrdiff_t batch_offset = 0;
-  std::atomic<std::size_t> queries_processed(0);
+  std::ptrdiff_t batch_offset   = 0;
+  std::size_t queries_processed = 0;
+
+  double total_time = 0;
 
   const auto& sp_json = index.search_params[search_param_ix];
 
@@ -190,381 +195,381 @@ void bench_search(::benchmark::State& state,
     throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first.");
     return;
   }
-  // algo is static to cache it between close search runs to save time on index loading
-  static std::string index_file = "";
-  if (index.file != index_file) {
-    current_algo.reset();
-    index_file = index.file;
-  }
 
-  ANN<T>* algo;  // TODO: Just have one thread load this.
-  std::unique_ptr<typename ANN<T>::AnnSearchParam> search_param;
-  try {
-    if (!current_algo || (algo = dynamic_cast<ANN<T>*>(current_algo.get())) == nullptr) {
-      auto ualgo = ann::create_algo<T>(
-        index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list);
-      algo = ualgo.get();
-      algo->load(index_file);
-      current_algo = std::move(ualgo);
+  /**
+   * Make sure the first thread loads the algo and dataset
+   */
+  if (state.thread_index() == 0) {
+    // algo is static to cache it between close search runs to save time on index loading
+    static std::string index_file = "";
+    if (index.file != index_file) {
+      current_algo.reset();
+      index_file = index.file;
     }
-    search_param = ann::create_search_param<T>(index.algo, sp_json);
-    //        search_param->metric_objective = metric_objective;
-  } catch (const std::exception& e) {
-    throw std::runtime_error("Failed to create an algo: " + std::string(e.what()));
-  }
-  algo->set_search_param(*search_param);
 
-  const auto algo_property = parse_algo_property(algo->get_preference(), sp_json);
-  const T* query_set       = dataset->query_set(algo_property.query_memory_type);
-
-  // TODO: Have 1 thread create and load these.
-  buf<float> distances{algo_property.query_memory_type, k * query_set_size};
-  buf<std::size_t> neighbors{algo_property.query_memory_type, k * query_set_size};
-
-  if (search_param->needs_dataset()) {
+    std::unique_ptr<typename ANN<T>::AnnSearchParam> search_param;
+    ANN<T>* algo;
     try {
-      algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
-                               dataset->base_set_size());
-    } catch (const std::exception& ex) {
-      throw std::runtime_error("The algorithm '" + index.name +
-                               "' requires the base set, but it's not available. " +
-                               "Exception: " + std::string(ex.what()));
-      return;
+      if (!current_algo || (algo = dynamic_cast<ANN<T>*>(current_algo.get())) == nullptr) {
+        auto ualgo = ann::create_algo<T>(
+          index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list);
+        algo = ualgo.get();
+        algo->load(index_file);
+        current_algo = std::move(ualgo);
+      }
+      search_param                   = ann::create_search_param<T>(index.algo, sp_json);
+      search_param->metric_objective = metric_objective;
+    } catch (const std::exception& e) {
+      state.SkipWithError("Failed to create an algo: " + std::string(e.what()));
+    }
+    algo->set_search_param(*search_param);
+    auto algo_property = parse_algo_property(algo->get_preference(), sp_json);
+    current_algo_props = std::make_shared<AlgoProperty>(algo_property.dataset_memory_type,
+                                                        algo_property.query_memory_type);
+    if (search_param->needs_dataset()) {
+      try {
+        algo->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type),
+                                 dataset->base_set_size());
+      } catch (const std::exception& ex) {
+        state.SkipWithError("The algorithm '" + index.name +
+                            "' requires the base set, but it's not available. " +
+                            "Exception: " + std::string(ex.what()));
+        return;
+      }
     }
   }
 
+  const auto algo_property = *current_algo_props;
+  const T* query_set       = dataset->query_set(algo_property.query_memory_type);
+
+  /**
+   * Each thread will manage its own outputs
+   */
+  std::shared_ptr<buf<float>> distances =
+    std::make_shared<buf<float>>(algo_property.query_memory_type, k * query_set_size);
+  std::shared_ptr<buf<std::size_t>> neighbors =
+    std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);
+
   cuda_timer gpu_timer;
   {
-    /**
-     * When the objective is throughput, we want to overlap batches
-     * as much as possible and measure the end-to-end time from start
-     * to finish.
-     *
-     * When the objective is latency, we want to measure each batch
-     * individually. Latency is better measured in single-query batches
-     * but larger batches are still allowed in this mode in order to
-     * compare against the resulting batch sizes in throughput mode.
-     */
-
     nvtx_case nvtx{state.name()};
 
-    // Multithreading starts in the benchmark loop
     for (auto _ : state) {
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
       [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+
+      ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
+      auto start   = std::chrono::high_resolution_clock::now();
       // run the search
       try {
         algo->search(query_set + batch_offset * dataset->dim(),
                      n_queries,
                      k,
-                     neighbors.data + batch_offset * k,
-                     distances.data + batch_offset * k,
+                     neighbors->data + batch_offset * k,
+                     distances->data + batch_offset * k,
                      gpu_timer.stream());
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
+
+      auto end             = std::chrono::high_resolution_clock::now();
+      auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
       // advance to the next batch
       batch_offset = (batch_offset + n_queries) % query_set_size;
       queries_processed += n_queries;
+      state.SetIterationTime(elapsed_seconds.count());
+      total_time += elapsed_seconds.count();
     }
   }
 
-  state.counters.insert({{"total_queries", queries_processed.load()}});
   state.SetItemsProcessed(queries_processed);
   if (cudart.found()) {
-    state.counters.insert({{"GPU Time", gpu_timer.total_time() / state.iterations()},
-                           {"GPU QPS", queries_processed.load() / gpu_timer.total_time()}});
-  }
-  if (state.skipped()) { return; }
+        state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())},
 
-  if (state.thread_index() == 0) {
-    state.counters.insert({{"k", k}, {"batch_size", n_queries}});
-    // evaluate recall
-    if (dataset->max_k() >= k) {
-      const std::int32_t* gt          = dataset->gt_set();
-      const std::uint32_t max_k       = dataset->max_k();
-      buf<std::size_t> neighbors_host = neighbors.move(MemoryType::Host);
-
-      std::size_t rows        = std::min(queries_processed.load(), query_set_size);
-      std::size_t match_count = 0;
-      std::size_t total_count = rows * static_cast<size_t>(k);
-      for (std::size_t i = 0; i < rows; i++) {
-        for (std::uint32_t j = 0; j < k; j++) {
-          auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
-          for (std::uint32_t l = 0; l < k; l++) {
-            auto exp_idx = gt[i * max_k + l];
-            if (act_idx == exp_idx) {
-              match_count++;
-              break;
+//                               // Using gpu_timer.total_time() isn't really the most fair comparison.
+//                               {"GPU QPS", queries_processed / total_time }});
+    }
+
+    // This will be the total number of queries across all threads
+    state.counters.insert({{"total_queries", queries_processed}});
+
+    if (state.skipped()) {
+      return; }
+
+    if (state.thread_index() == 0) {
+      state.counters.insert({{"k", k}, {"n_queries", n_queries}});
+
+      // evaluate recall
+      if (dataset->max_k() >= k) {
+        const std::int32_t* gt          = dataset->gt_set();
+        const std::uint32_t max_k       = dataset->max_k();
+        buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
+        std::size_t rows                = std::min(queries_processed, query_set_size);
+        std::size_t match_count         = 0;
+        std::size_t total_count         = rows * static_cast<size_t>(k);
+        for (std::size_t i = 0; i < rows; i++) {
+          for (std::uint32_t j = 0; j < k; j++) {
+            auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
+            for (std::uint32_t l = 0; l < k; l++) {
+              auto exp_idx = gt[i * max_k + l];
+              if (act_idx == exp_idx) {
+                match_count++;
+                break;
+              }
             }
           }
         }
+        double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+        state.counters.insert({{"Recall", actual_recall}});
       }
-      double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
-      state.counters.insert({{"Recall", actual_recall}});
     }
   }
-}
 
-inline void printf_usage()
-{
-  ::benchmark::PrintDefaultHelp();
-  fprintf(
-    stdout,
-    "          [--build|--search] \n"
-    "          [--overwrite]\n"
-    "          [--data_prefix=<prefix>]\n"
-    "          [--index_prefix=<prefix>]\n"
-    "          [--override_kv=<key:value1:value2:...:valueN>]\n"
-    "          [--mode=<latency|throughput>\n"
-    "          <conf>.json\n"
-    "\n"
-    "Note the non-standard benchmark parameters:\n"
-    "  --build: build mode, will build index\n"
-    "  --search: search mode, will search using the built index\n"
-    "            one and only one of --build and --search should be specified\n"
-    "  --overwrite: force overwriting existing index files\n"
-    "  --data_prefix=<prefix>:"
-    " prepend <prefix> to dataset file paths specified in the <conf>.json (default = 'data/').\n"
-    "  --index_prefix=<prefix>:"
-    " prepend <prefix> to index file paths specified in the <conf>.json (default = 'index/').\n"
-    "  --override_kv=<key:value1:value2:...:valueN>:"
-    " override a build/search key one or more times multiplying the number of configurations;"
-    " you can use this parameter multiple times to get the Cartesian product of benchmark"
-    " configs.\n"
-    "  --mode=<latency|throughput>"
-    "     run the benchmarks in latency (accumulate times spent in each batch) or "
-    "     throughput (pipeline batches and measure end-to-end) mode\n");
-}
-
-template <typename T>
-void register_build(std::shared_ptr<const Dataset<T>> dataset,
-                    std::vector<Configuration::Index> indices,
-                    bool force_overwrite)
-{
-  for (auto index : indices) {
-    auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
-    auto file_suf = suf;
-    index.build_param.erase("override_suffix");
-    std::replace(file_suf.begin(), file_suf.end(), '/', '-');
-    index.file += file_suf;
-    auto* b = ::benchmark::RegisterBenchmark(
-      index.name + suf, bench_build<T>, dataset, index, force_overwrite);
-    b->Unit(benchmark::kSecond);
-    b->UseRealTime();
+  inline void printf_usage()
+  {
+        ::benchmark::PrintDefaultHelp();
+        fprintf(
+          stdout,
+          "          [--build|--search] \n"
+          "          [--overwrite]\n"
+          "          [--data_prefix=<prefix>]\n"
+          "          [--index_prefix=<prefix>]\n"
+          "          [--override_kv=<key:value1:value2:...:valueN>]\n"
+          "          [--mode=<latency|throughput>\n"
+          "          <conf>.json\n"
+          "\n"
+          "Note the non-standard benchmark parameters:\n"
+          "  --build: build mode, will build index\n"
+          "  --search: search mode, will search using the built index\n"
+          "            one and only one of --build and --search should be specified\n"
+          "  --overwrite: force overwriting existing index files\n"
+          "  --data_prefix=<prefix>:"
+          " prepend <prefix> to dataset file paths specified in the <conf>.json (default = "
+          "'data/').\n"
+          "  --index_prefix=<prefix>:"
+          " prepend <prefix> to index file paths specified in the <conf>.json (default = "
+          "'index/').\n"
+          "  --override_kv=<key:value1:value2:...:valueN>:"
+          " override a build/search key one or more times multiplying the number of configurations;"
+          " you can use this parameter multiple times to get the Cartesian product of benchmark"
+          " configs.\n"
+          "  --mode=<latency|throughput>"
+          "     run the benchmarks in latency (accumulate times spent in each batch) or "
+          "     throughput (pipeline batches and measure end-to-end) mode\n");
   }
-}
 
-template <typename T>
-void register_search(std::shared_ptr<const Dataset<T>> dataset,
-                     std::vector<Configuration::Index> indices,
-                     Objective metric_objective)
-{
-  for (auto index : indices) {
-    for (std::size_t i = 0; i < index.search_params.size(); i++) {
-      auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
-      index.search_params[i].erase("override_suffix");
-
-      auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search<T>, index, i, dataset)
-                  ->Unit(benchmark::kMillisecond)
-                  ->ThreadRange(1, 16)
-                  ->UseRealTime();
-      std::cout << "Done registering index " << i << std::endl;
-    }
+  template <typename T>
+  void register_build(std::shared_ptr<const Dataset<T>> dataset,
+                      std::vector<Configuration::Index> indices,
+                      bool force_overwrite)
+  {
+        for (auto index : indices) {
+          auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
+          auto file_suf = suf;
+          index.build_param.erase("override_suffix");
+          std::replace(file_suf.begin(), file_suf.end(), '/', '-');
+          index.file += file_suf;
+          auto* b = ::benchmark::RegisterBenchmark(
+            index.name + suf, bench_build<T>, dataset, index, force_overwrite);
+          b->Unit(benchmark::kSecond);
+          b->UseRealTime();
+        }
   }
-}
 
-template <typename T>
-void dispatch_benchmark(const Configuration& conf,
-                        bool force_overwrite,
-                        bool build_mode,
-                        bool search_mode,
-                        std::string data_prefix,
-                        std::string index_prefix,
-                        kv_series override_kv,
-                        Objective metric_objective)
-{
-  if (cudart.found()) {
-    for (auto [key, value] : cuda_info()) {
-      ::benchmark::AddCustomContext(key, value);
-    }
-  }
-  const auto dataset_conf = conf.get_dataset_conf();
-  auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
-  auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
-  auto gt_file            = dataset_conf.groundtruth_neighbors_file;
-  if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
-  auto dataset = std::make_shared<BinDataset<T>>(dataset_conf.name,
-                                                 base_file,
-                                                 dataset_conf.subset_first_row,
-                                                 dataset_conf.subset_size,
-                                                 query_file,
-                                                 dataset_conf.distance,
-                                                 gt_file);
-  ::benchmark::AddCustomContext("dataset", dataset_conf.name);
-  ::benchmark::AddCustomContext("distance", dataset_conf.distance);
-  std::vector<Configuration::Index> indices = conf.get_indices();
-  if (build_mode) {
-    if (file_exists(base_file)) {
-      log_info("Using the dataset file '%s'", base_file.c_str());
-      ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
-      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
-    } else {
-      log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.",
-               base_file.c_str());
-    }
-    std::vector<Configuration::Index> more_indices{};
-    for (auto& index : indices) {
-      for (auto param : apply_overrides(index.build_param, override_kv)) {
-        auto modified_index        = index;
-        modified_index.build_param = param;
-        modified_index.file        = combine_path(index_prefix, modified_index.file);
-        more_indices.push_back(modified_index);
-      }
-    }
-    register_build<T>(dataset, more_indices, force_overwrite);
-  } else if (search_mode) {
-    if (file_exists(query_file)) {
-      log_info("Using the query file '%s'", query_file.c_str());
-      ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size()));
-      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
-      if (gt_file.has_value()) {
-        if (file_exists(*gt_file)) {
-          log_info("Using the ground truth file '%s'", gt_file->c_str());
-          ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
-        } else {
-          log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
-                   gt_file->c_str());
+  template <typename T>
+  void register_search(std::shared_ptr<const Dataset<T>> dataset,
+                       std::vector<Configuration::Index> indices,
+                       Objective metric_objective)
+  {
+        for (auto index : indices) {
+          for (std::size_t i = 0; i < index.search_params.size(); i++) {
+            auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
+            index.search_params[i].erase("override_suffix");
+
+            auto* b = ::benchmark::RegisterBenchmark(
+                        index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
+                        ->Unit(benchmark::kMillisecond)
+                        ->ThreadRange(1, 32)
+                        ->UseManualTime();
+          }
         }
-      } else {
-        log_warn(
-          "Ground truth file is not provided; the recall won't be reported. NB: use "
-          "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the path to "
-          "the ground truth in your conf.json.");
-      }
-    } else {
-      log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
-               query_file.c_str());
-    }
-    for (auto& index : indices) {
-      index.search_params = apply_overrides(index.search_params, override_kv);
-      index.file          = combine_path(index_prefix, index.file);
-    }
-    register_search<T>(dataset, indices, metric_objective);
   }
-}
 
-inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool
-{
-  if (strcmp(arg, pat) == 0) {
-    result = true;
-    return true;
+  template <typename T>
+  void dispatch_benchmark(const Configuration& conf,
+                          bool force_overwrite,
+                          bool build_mode,
+                          bool search_mode,
+                          std::string data_prefix,
+                          std::string index_prefix,
+                          kv_series override_kv,
+                          Objective metric_objective)
+  {
+        if (cudart.found()) {
+          for (auto [key, value] : cuda_info()) {
+            ::benchmark::AddCustomContext(key, value);
+          }
+        }
+        const auto dataset_conf = conf.get_dataset_conf();
+        auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
+        auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
+        auto gt_file            = dataset_conf.groundtruth_neighbors_file;
+        if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
+        auto dataset = std::make_shared<BinDataset<T>>(dataset_conf.name,
+                                                       base_file,
+                                                       dataset_conf.subset_first_row,
+                                                       dataset_conf.subset_size,
+                                                       query_file,
+                                                       dataset_conf.distance,
+                                                       gt_file);
+        ::benchmark::AddCustomContext("dataset", dataset_conf.name);
+        ::benchmark::AddCustomContext("distance", dataset_conf.distance);
+        std::vector<Configuration::Index> indices = conf.get_indices();
+        if (build_mode) {
+          if (file_exists(base_file)) {
+            log_info("Using the dataset file '%s'", base_file.c_str());
+            ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
+            ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+          } else {
+            log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.",
+                     base_file.c_str());
+          }
+          std::vector<Configuration::Index> more_indices{};
+          for (auto& index : indices) {
+            for (auto param : apply_overrides(index.build_param, override_kv)) {
+              auto modified_index        = index;
+              modified_index.build_param = param;
+              modified_index.file        = combine_path(index_prefix, modified_index.file);
+              more_indices.push_back(modified_index);
+            }
+          }
+          register_build<T>(dataset, more_indices, force_overwrite);
+        } else if (search_mode) {
+          if (file_exists(query_file)) {
+            log_info("Using the query file '%s'", query_file.c_str());
+            ::benchmark::AddCustomContext("max_n_queries",
+                                          std::to_string(dataset->query_set_size()));
+            ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+            if (gt_file.has_value()) {
+              if (file_exists(*gt_file)) {
+                log_info("Using the ground truth file '%s'", gt_file->c_str());
+                ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
+              } else {
+                log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
+                         gt_file->c_str());
+              }
+            } else {
+              log_warn(
+                "Ground truth file is not provided; the recall won't be reported. NB: use "
+                "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the "
+                "path to "
+                "the ground truth in your conf.json.");
+            }
+          } else {
+            log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
+                     query_file.c_str());
+          }
+          for (auto& index : indices) {
+            index.search_params = apply_overrides(index.search_params, override_kv);
+            index.file          = combine_path(index_prefix, index.file);
+          }
+          register_search<T>(dataset, indices, metric_objective);
+        }
   }
-  return false;
-}
 
-inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool
-{
-  auto n = strlen(pat);
-  if (strncmp(pat, arg, strlen(pat)) == 0) {
-    result = arg + n + 1;
-    return true;
+  inline auto parse_bool_flag(const char* arg, const char* pat, bool& result)->bool
+  {
+        if (strcmp(arg, pat) == 0) {
+          result = true;
+          return true;
+        }
+        return false;
   }
-  return false;
-}
 
-inline auto run_main(int argc, char** argv) -> int
-{
-  bool force_overwrite        = false;
-  bool build_mode             = false;
-  bool search_mode            = false;
-  std::string data_prefix     = "data";
-  std::string index_prefix    = "index";
-  std::string new_override_kv = "";
-  std::string mode            = "latency";
-  kv_series override_kv{};
-
-  char arg0_default[] = "benchmark";  // NOLINT
-  char* args_default  = arg0_default;
-  if (!argv) {
-    argc = 1;
-    argv = &args_default;
-  }
-  if (argc == 1) {
-    printf_usage();
-    return -1;
+  inline auto parse_string_flag(const char* arg, const char* pat, std::string& result)->bool
+  {
+        auto n = strlen(pat);
+        if (strncmp(pat, arg, strlen(pat)) == 0) {
+          result = arg + n + 1;
+          return true;
+        }
+        return false;
   }
 
-  char* conf_path = argv[--argc];
-  std::ifstream conf_stream(conf_path);
-
-  for (int i = 1; i < argc; i++) {
-    if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) ||
-        parse_bool_flag(argv[i], "--build", build_mode) ||
-        parse_bool_flag(argv[i], "--search", search_mode) ||
-        parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
-        parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
-        parse_string_flag(argv[i], "--mode", mode) ||
-        parse_string_flag(argv[i], "--override_kv", new_override_kv)) {
-      if (!new_override_kv.empty()) {
-        auto kvv = split(new_override_kv, ':');
-        auto key = kvv[0];
-        std::vector<nlohmann::json> vals{};
-        for (std::size_t j = 1; j < kvv.size(); j++) {
-          vals.push_back(nlohmann::json::parse(kvv[j]));
+  inline auto run_main(int argc, char** argv)->int
+  {
+        bool force_overwrite        = false;
+        bool build_mode             = false;
+        bool search_mode            = false;
+        std::string data_prefix     = "data";
+        std::string index_prefix    = "index";
+        std::string new_override_kv = "";
+        std::string mode            = "latency";
+        kv_series override_kv{};
+
+        char arg0_default[] = "benchmark";  // NOLINT
+        char* args_default  = arg0_default;
+        if (!argv) {
+          argc = 1;
+          argv = &args_default;
+        }
+        if (argc == 1) {
+          printf_usage();
+          return -1;
         }
-        override_kv.emplace_back(key, vals);
-        new_override_kv = "";
-      }
-      for (int j = i; j < argc - 1; j++) {
-        argv[j] = argv[j + 1];
-      }
-      argc--;
-      i--;
-    }
-  }
 
-  Objective metric_objective = Objective::LATENCY;
-  if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; }
+        char* conf_path = argv[--argc];
+        std::ifstream conf_stream(conf_path);
+
+        for (int i = 1; i < argc; i++) {
+          if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) ||
+              parse_bool_flag(argv[i], "--build", build_mode) ||
+              parse_bool_flag(argv[i], "--search", search_mode) ||
+              parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
+              parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
+              parse_string_flag(argv[i], "--mode", mode) ||
+              parse_string_flag(argv[i], "--override_kv", new_override_kv)) {
+            if (!new_override_kv.empty()) {
+              auto kvv = split(new_override_kv, ':');
+              auto key = kvv[0];
+              std::vector<nlohmann::json> vals{};
+              for (std::size_t j = 1; j < kvv.size(); j++) {
+                vals.push_back(nlohmann::json::parse(kvv[j]));
+              }
+              override_kv.emplace_back(key, vals);
+              new_override_kv = "";
+            }
+            for (int j = i; j < argc - 1; j++) {
+              argv[j] = argv[j + 1];
+            }
+            argc--;
+            i--;
+          }
+        }
 
-  if (build_mode == search_mode) {
-    log_error("One and only one of --build and --search should be specified");
-    printf_usage();
-    return -1;
-  }
+        Objective metric_objective = Objective::LATENCY;
+        if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; }
 
-  if (!conf_stream) {
-    log_error("Can't open configuration file: %s", conf_path);
-    return -1;
-  }
+        if (build_mode == search_mode) {
+          log_error("One and only one of --build and --search should be specified");
+          printf_usage();
+          return -1;
+        }
 
-  if (cudart.needed() && !cudart.found()) {
-    log_warn("cudart library is not found, GPU-based indices won't work.");
-  }
+        if (!conf_stream) {
+          log_error("Can't open configuration file: %s", conf_path);
+          return -1;
+        }
+
+        if (cudart.needed() && !cudart.found()) {
+          log_warn("cudart library is not found, GPU-based indices won't work.");
+        }
 
-  Configuration conf(conf_stream);
-  std::string dtype = conf.get_dataset_conf().dtype;
-
-  if (dtype == "float") {
-    dispatch_benchmark<float>(conf,
-                              force_overwrite,
-                              build_mode,
-                              search_mode,
-                              data_prefix,
-                              index_prefix,
-                              override_kv,
-                              metric_objective);
-  } else if (dtype == "uint8") {
-    dispatch_benchmark<std::uint8_t>(conf,
-                                     force_overwrite,
-                                     build_mode,
-                                     search_mode,
-                                     data_prefix,
-                                     index_prefix,
-                                     override_kv,
-                                     metric_objective);
-  } else if (dtype == "int8") {
-    dispatch_benchmark<std::int8_t>(conf,
+        Configuration conf(conf_stream);
+        std::string dtype = conf.get_dataset_conf().dtype;
+
+        if (dtype == "float") {
+          dispatch_benchmark<float>(conf,
                                     force_overwrite,
                                     build_mode,
                                     search_mode,
@@ -572,19 +577,37 @@ inline auto run_main(int argc, char** argv) -> int
                                     index_prefix,
                                     override_kv,
                                     metric_objective);
-  } else {
-    log_error("datatype '%s' is not supported", dtype.c_str());
-    return -1;
-  }
+        } else if (dtype == "uint8") {
+          dispatch_benchmark<std::uint8_t>(conf,
+                                           force_overwrite,
+                                           build_mode,
+                                           search_mode,
+                                           data_prefix,
+                                           index_prefix,
+                                           override_kv,
+                                           metric_objective);
+        } else if (dtype == "int8") {
+          dispatch_benchmark<std::int8_t>(conf,
+                                          force_overwrite,
+                                          build_mode,
+                                          search_mode,
+                                          data_prefix,
+                                          index_prefix,
+                                          override_kv,
+                                          metric_objective);
+        } else {
+          log_error("datatype '%s' is not supported", dtype.c_str());
+          return -1;
+        }
 
-  ::benchmark::Initialize(&argc, argv, printf_usage);
-  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1;
-  ::benchmark::RunSpecifiedBenchmarks();
-  ::benchmark::Shutdown();
-  // Release a possibly cached ANN object, so that it cannot be alive longer than the handle to a
-  // shared library it depends on (dynamic benchmark executable).
-  current_algo.reset();
-  return 0;
-}
+        ::benchmark::Initialize(&argc, argv, printf_usage);
+        if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1;
+        ::benchmark::RunSpecifiedBenchmarks();
+        ::benchmark::Shutdown();
+        // Release a possibly cached ANN object, so that it cannot be alive longer than the handle
+        // to a shared library it depends on (dynamic benchmark executable).
+        current_algo.reset();
+        return 0;
+  }
 
 };  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index 81c50f2c2e..66776ee3bc 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -65,7 +65,7 @@ class HnswLib : public ANN<T> {
   using typename ANN<T>::AnnSearchParam;
   struct SearchParam : public AnnSearchParam {
     int ef;
-    int num_threads = omp_get_num_procs();
+    int num_threads = 1;
   };
 
   HnswLib(Metric metric, int dim, const BuildParam& param);

From 7c80266c9c5483a0fe64a0ac3ce2c9e96b3a9896 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 00:16:25 -0400
Subject: [PATCH 06/19] Fixing syntax error

---
 cpp/bench/ann/src/common/benchmark.hpp | 520 ++++++++++++-------------
 1 file changed, 257 insertions(+), 263 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 5c755108af..bd987d4f23 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -284,52 +284,47 @@ void bench_search(::benchmark::State& state,
 
   state.SetItemsProcessed(queries_processed);
   if (cudart.found()) {
-        state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())},
+    state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
+  }
 
-//                               // Using gpu_timer.total_time() isn't really the most fair comparison.
-//                               {"GPU QPS", queries_processed / total_time }});
-    }
+  // This will be the total number of queries across all threads
+  state.counters.insert({{"total_queries", queries_processed}});
 
-    // This will be the total number of queries across all threads
-    state.counters.insert({{"total_queries", queries_processed}});
-
-    if (state.skipped()) {
-      return; }
-
-    if (state.thread_index() == 0) {
-      state.counters.insert({{"k", k}, {"n_queries", n_queries}});
-
-      // evaluate recall
-      if (dataset->max_k() >= k) {
-        const std::int32_t* gt          = dataset->gt_set();
-        const std::uint32_t max_k       = dataset->max_k();
-        buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
-        std::size_t rows                = std::min(queries_processed, query_set_size);
-        std::size_t match_count         = 0;
-        std::size_t total_count         = rows * static_cast<size_t>(k);
-        for (std::size_t i = 0; i < rows; i++) {
-          for (std::uint32_t j = 0; j < k; j++) {
-            auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
-            for (std::uint32_t l = 0; l < k; l++) {
-              auto exp_idx = gt[i * max_k + l];
-              if (act_idx == exp_idx) {
-                match_count++;
-                break;
-              }
+  if (state.skipped()) { return; }
+
+  if (state.thread_index() == 0) {
+    state.counters.insert({{"k", k}, {"n_queries", n_queries}});
+
+    // evaluate recall
+    if (dataset->max_k() >= k) {
+      const std::int32_t* gt          = dataset->gt_set();
+      const std::uint32_t max_k       = dataset->max_k();
+      buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
+      std::size_t rows                = std::min(queries_processed, query_set_size);
+      std::size_t match_count         = 0;
+      std::size_t total_count         = rows * static_cast<size_t>(k);
+      for (std::size_t i = 0; i < rows; i++) {
+        for (std::uint32_t j = 0; j < k; j++) {
+          auto act_idx = std::int32_t(neighbors_host.data[i * k + j]);
+          for (std::uint32_t l = 0; l < k; l++) {
+            auto exp_idx = gt[i * max_k + l];
+            if (act_idx == exp_idx) {
+              match_count++;
+              break;
             }
           }
         }
-        double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
-        state.counters.insert({{"Recall", actual_recall}});
       }
+      double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+      state.counters.insert({{"Recall", actual_recall}});
     }
   }
+}
 
-  inline void printf_usage()
-  {
-        ::benchmark::PrintDefaultHelp();
-        fprintf(
-          stdout,
+inline void printf_usage()
+{
+  ::benchmark::PrintDefaultHelp();
+  fprintf(stdout,
           "          [--build|--search] \n"
           "          [--overwrite]\n"
           "          [--data_prefix=<prefix>]\n"
@@ -356,220 +351,237 @@ void bench_search(::benchmark::State& state,
           "  --mode=<latency|throughput>"
           "     run the benchmarks in latency (accumulate times spent in each batch) or "
           "     throughput (pipeline batches and measure end-to-end) mode\n");
-  }
+}
 
-  template <typename T>
-  void register_build(std::shared_ptr<const Dataset<T>> dataset,
-                      std::vector<Configuration::Index> indices,
-                      bool force_overwrite)
-  {
-        for (auto index : indices) {
-          auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
-          auto file_suf = suf;
-          index.build_param.erase("override_suffix");
-          std::replace(file_suf.begin(), file_suf.end(), '/', '-');
-          index.file += file_suf;
-          auto* b = ::benchmark::RegisterBenchmark(
-            index.name + suf, bench_build<T>, dataset, index, force_overwrite);
-          b->Unit(benchmark::kSecond);
-          b->UseRealTime();
-        }
+template <typename T>
+void register_build(std::shared_ptr<const Dataset<T>> dataset,
+                    std::vector<Configuration::Index> indices,
+                    bool force_overwrite)
+{
+  for (auto index : indices) {
+    auto suf      = static_cast<std::string>(index.build_param["override_suffix"]);
+    auto file_suf = suf;
+    index.build_param.erase("override_suffix");
+    std::replace(file_suf.begin(), file_suf.end(), '/', '-');
+    index.file += file_suf;
+    auto* b = ::benchmark::RegisterBenchmark(
+      index.name + suf, bench_build<T>, dataset, index, force_overwrite);
+    b->Unit(benchmark::kSecond);
+    b->UseRealTime();
   }
+}
 
-  template <typename T>
-  void register_search(std::shared_ptr<const Dataset<T>> dataset,
-                       std::vector<Configuration::Index> indices,
-                       Objective metric_objective)
-  {
-        for (auto index : indices) {
-          for (std::size_t i = 0; i < index.search_params.size(); i++) {
-            auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
-            index.search_params[i].erase("override_suffix");
-
-            auto* b = ::benchmark::RegisterBenchmark(
-                        index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
-                        ->Unit(benchmark::kMillisecond)
-                        ->ThreadRange(1, 32)
-                        ->UseManualTime();
-          }
-        }
+template <typename T>
+void register_search(std::shared_ptr<const Dataset<T>> dataset,
+                     std::vector<Configuration::Index> indices,
+                     Objective metric_objective)
+{
+  for (auto index : indices) {
+    for (std::size_t i = 0; i < index.search_params.size(); i++) {
+      auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
+      index.search_params[i].erase("override_suffix");
+
+      auto* b = ::benchmark::RegisterBenchmark(
+                  index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
+                  ->Unit(benchmark::kMillisecond)
+                  ->ThreadRange(1, 32)
+                  ->UseManualTime();
+    }
   }
+}
 
-  template <typename T>
-  void dispatch_benchmark(const Configuration& conf,
-                          bool force_overwrite,
-                          bool build_mode,
-                          bool search_mode,
-                          std::string data_prefix,
-                          std::string index_prefix,
-                          kv_series override_kv,
-                          Objective metric_objective)
-  {
-        if (cudart.found()) {
-          for (auto [key, value] : cuda_info()) {
-            ::benchmark::AddCustomContext(key, value);
-          }
-        }
-        const auto dataset_conf = conf.get_dataset_conf();
-        auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
-        auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
-        auto gt_file            = dataset_conf.groundtruth_neighbors_file;
-        if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
-        auto dataset = std::make_shared<BinDataset<T>>(dataset_conf.name,
-                                                       base_file,
-                                                       dataset_conf.subset_first_row,
-                                                       dataset_conf.subset_size,
-                                                       query_file,
-                                                       dataset_conf.distance,
-                                                       gt_file);
-        ::benchmark::AddCustomContext("dataset", dataset_conf.name);
-        ::benchmark::AddCustomContext("distance", dataset_conf.distance);
-        std::vector<Configuration::Index> indices = conf.get_indices();
-        if (build_mode) {
-          if (file_exists(base_file)) {
-            log_info("Using the dataset file '%s'", base_file.c_str());
-            ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
-            ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
-          } else {
-            log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.",
-                     base_file.c_str());
-          }
-          std::vector<Configuration::Index> more_indices{};
-          for (auto& index : indices) {
-            for (auto param : apply_overrides(index.build_param, override_kv)) {
-              auto modified_index        = index;
-              modified_index.build_param = param;
-              modified_index.file        = combine_path(index_prefix, modified_index.file);
-              more_indices.push_back(modified_index);
-            }
-          }
-          register_build<T>(dataset, more_indices, force_overwrite);
-        } else if (search_mode) {
-          if (file_exists(query_file)) {
-            log_info("Using the query file '%s'", query_file.c_str());
-            ::benchmark::AddCustomContext("max_n_queries",
-                                          std::to_string(dataset->query_set_size()));
-            ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
-            if (gt_file.has_value()) {
-              if (file_exists(*gt_file)) {
-                log_info("Using the ground truth file '%s'", gt_file->c_str());
-                ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
-              } else {
-                log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
-                         gt_file->c_str());
-              }
-            } else {
-              log_warn(
-                "Ground truth file is not provided; the recall won't be reported. NB: use "
-                "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the "
-                "path to "
-                "the ground truth in your conf.json.");
-            }
-          } else {
-            log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
-                     query_file.c_str());
-          }
-          for (auto& index : indices) {
-            index.search_params = apply_overrides(index.search_params, override_kv);
-            index.file          = combine_path(index_prefix, index.file);
-          }
-          register_search<T>(dataset, indices, metric_objective);
+template <typename T>
+void dispatch_benchmark(const Configuration& conf,
+                        bool force_overwrite,
+                        bool build_mode,
+                        bool search_mode,
+                        std::string data_prefix,
+                        std::string index_prefix,
+                        kv_series override_kv,
+                        Objective metric_objective)
+{
+  if (cudart.found()) {
+    for (auto [key, value] : cuda_info()) {
+      ::benchmark::AddCustomContext(key, value);
+    }
+  }
+  const auto dataset_conf = conf.get_dataset_conf();
+  auto base_file          = combine_path(data_prefix, dataset_conf.base_file);
+  auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
+  auto gt_file            = dataset_conf.groundtruth_neighbors_file;
+  if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
+  auto dataset = std::make_shared<BinDataset<T>>(dataset_conf.name,
+                                                 base_file,
+                                                 dataset_conf.subset_first_row,
+                                                 dataset_conf.subset_size,
+                                                 query_file,
+                                                 dataset_conf.distance,
+                                                 gt_file);
+  ::benchmark::AddCustomContext("dataset", dataset_conf.name);
+  ::benchmark::AddCustomContext("distance", dataset_conf.distance);
+  std::vector<Configuration::Index> indices = conf.get_indices();
+  if (build_mode) {
+    if (file_exists(base_file)) {
+      log_info("Using the dataset file '%s'", base_file.c_str());
+      ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+    } else {
+      log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.",
+               base_file.c_str());
+    }
+    std::vector<Configuration::Index> more_indices{};
+    for (auto& index : indices) {
+      for (auto param : apply_overrides(index.build_param, override_kv)) {
+        auto modified_index        = index;
+        modified_index.build_param = param;
+        modified_index.file        = combine_path(index_prefix, modified_index.file);
+        more_indices.push_back(modified_index);
+      }
+    }
+    register_build<T>(dataset, more_indices, force_overwrite);
+  } else if (search_mode) {
+    if (file_exists(query_file)) {
+      log_info("Using the query file '%s'", query_file.c_str());
+      ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size()));
+      ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim()));
+      if (gt_file.has_value()) {
+        if (file_exists(*gt_file)) {
+          log_info("Using the ground truth file '%s'", gt_file->c_str());
+          ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k()));
+        } else {
+          log_warn("Ground truth file '%s' does not exist; the recall won't be reported.",
+                   gt_file->c_str());
         }
+      } else {
+        log_warn(
+          "Ground truth file is not provided; the recall won't be reported. NB: use "
+          "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the "
+          "path to "
+          "the ground truth in your conf.json.");
+      }
+    } else {
+      log_warn("Query file '%s' does not exist; benchmarking search is impossible.",
+               query_file.c_str());
+    }
+    for (auto& index : indices) {
+      index.search_params = apply_overrides(index.search_params, override_kv);
+      index.file          = combine_path(index_prefix, index.file);
+    }
+    register_search<T>(dataset, indices, metric_objective);
   }
+}
 
-  inline auto parse_bool_flag(const char* arg, const char* pat, bool& result)->bool
-  {
-        if (strcmp(arg, pat) == 0) {
-          result = true;
-          return true;
-        }
-        return false;
+inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool
+{
+  if (strcmp(arg, pat) == 0) {
+    result = true;
+    return true;
   }
+  return false;
+}
 
-  inline auto parse_string_flag(const char* arg, const char* pat, std::string& result)->bool
-  {
-        auto n = strlen(pat);
-        if (strncmp(pat, arg, strlen(pat)) == 0) {
-          result = arg + n + 1;
-          return true;
-        }
-        return false;
+inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool
+{
+  auto n = strlen(pat);
+  if (strncmp(pat, arg, strlen(pat)) == 0) {
+    result = arg + n + 1;
+    return true;
   }
+  return false;
+}
 
-  inline auto run_main(int argc, char** argv)->int
-  {
-        bool force_overwrite        = false;
-        bool build_mode             = false;
-        bool search_mode            = false;
-        std::string data_prefix     = "data";
-        std::string index_prefix    = "index";
-        std::string new_override_kv = "";
-        std::string mode            = "latency";
-        kv_series override_kv{};
-
-        char arg0_default[] = "benchmark";  // NOLINT
-        char* args_default  = arg0_default;
-        if (!argv) {
-          argc = 1;
-          argv = &args_default;
-        }
-        if (argc == 1) {
-          printf_usage();
-          return -1;
-        }
+inline auto run_main(int argc, char** argv) -> int
+{
+  bool force_overwrite        = false;
+  bool build_mode             = false;
+  bool search_mode            = false;
+  std::string data_prefix     = "data";
+  std::string index_prefix    = "index";
+  std::string new_override_kv = "";
+  std::string mode            = "latency";
+  kv_series override_kv{};
+
+  char arg0_default[] = "benchmark";  // NOLINT
+  char* args_default  = arg0_default;
+  if (!argv) {
+    argc = 1;
+    argv = &args_default;
+  }
+  if (argc == 1) {
+    printf_usage();
+    return -1;
+  }
 
-        char* conf_path = argv[--argc];
-        std::ifstream conf_stream(conf_path);
-
-        for (int i = 1; i < argc; i++) {
-          if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) ||
-              parse_bool_flag(argv[i], "--build", build_mode) ||
-              parse_bool_flag(argv[i], "--search", search_mode) ||
-              parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
-              parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
-              parse_string_flag(argv[i], "--mode", mode) ||
-              parse_string_flag(argv[i], "--override_kv", new_override_kv)) {
-            if (!new_override_kv.empty()) {
-              auto kvv = split(new_override_kv, ':');
-              auto key = kvv[0];
-              std::vector<nlohmann::json> vals{};
-              for (std::size_t j = 1; j < kvv.size(); j++) {
-                vals.push_back(nlohmann::json::parse(kvv[j]));
-              }
-              override_kv.emplace_back(key, vals);
-              new_override_kv = "";
-            }
-            for (int j = i; j < argc - 1; j++) {
-              argv[j] = argv[j + 1];
-            }
-            argc--;
-            i--;
-          }
+  char* conf_path = argv[--argc];
+  std::ifstream conf_stream(conf_path);
+
+  for (int i = 1; i < argc; i++) {
+    if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) ||
+        parse_bool_flag(argv[i], "--build", build_mode) ||
+        parse_bool_flag(argv[i], "--search", search_mode) ||
+        parse_string_flag(argv[i], "--data_prefix", data_prefix) ||
+        parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
+        parse_string_flag(argv[i], "--mode", mode) ||
+        parse_string_flag(argv[i], "--override_kv", new_override_kv)) {
+      if (!new_override_kv.empty()) {
+        auto kvv = split(new_override_kv, ':');
+        auto key = kvv[0];
+        std::vector<nlohmann::json> vals{};
+        for (std::size_t j = 1; j < kvv.size(); j++) {
+          vals.push_back(nlohmann::json::parse(kvv[j]));
         }
+        override_kv.emplace_back(key, vals);
+        new_override_kv = "";
+      }
+      for (int j = i; j < argc - 1; j++) {
+        argv[j] = argv[j + 1];
+      }
+      argc--;
+      i--;
+    }
+  }
 
-        Objective metric_objective = Objective::LATENCY;
-        if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; }
-
-        if (build_mode == search_mode) {
-          log_error("One and only one of --build and --search should be specified");
-          printf_usage();
-          return -1;
-        }
+  Objective metric_objective = Objective::LATENCY;
+  if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; }
 
-        if (!conf_stream) {
-          log_error("Can't open configuration file: %s", conf_path);
-          return -1;
-        }
+  if (build_mode == search_mode) {
+    log_error("One and only one of --build and --search should be specified");
+    printf_usage();
+    return -1;
+  }
 
-        if (cudart.needed() && !cudart.found()) {
-          log_warn("cudart library is not found, GPU-based indices won't work.");
-        }
+  if (!conf_stream) {
+    log_error("Can't open configuration file: %s", conf_path);
+    return -1;
+  }
 
-        Configuration conf(conf_stream);
-        std::string dtype = conf.get_dataset_conf().dtype;
+  if (cudart.needed() && !cudart.found()) {
+    log_warn("cudart library is not found, GPU-based indices won't work.");
+  }
 
-        if (dtype == "float") {
-          dispatch_benchmark<float>(conf,
+  Configuration conf(conf_stream);
+  std::string dtype = conf.get_dataset_conf().dtype;
+
+  if (dtype == "float") {
+    dispatch_benchmark<float>(conf,
+                              force_overwrite,
+                              build_mode,
+                              search_mode,
+                              data_prefix,
+                              index_prefix,
+                              override_kv,
+                              metric_objective);
+  } else if (dtype == "uint8") {
+    dispatch_benchmark<std::uint8_t>(conf,
+                                     force_overwrite,
+                                     build_mode,
+                                     search_mode,
+                                     data_prefix,
+                                     index_prefix,
+                                     override_kv,
+                                     metric_objective);
+  } else if (dtype == "int8") {
+    dispatch_benchmark<std::int8_t>(conf,
                                     force_overwrite,
                                     build_mode,
                                     search_mode,
@@ -577,37 +589,19 @@ void bench_search(::benchmark::State& state,
                                     index_prefix,
                                     override_kv,
                                     metric_objective);
-        } else if (dtype == "uint8") {
-          dispatch_benchmark<std::uint8_t>(conf,
-                                           force_overwrite,
-                                           build_mode,
-                                           search_mode,
-                                           data_prefix,
-                                           index_prefix,
-                                           override_kv,
-                                           metric_objective);
-        } else if (dtype == "int8") {
-          dispatch_benchmark<std::int8_t>(conf,
-                                          force_overwrite,
-                                          build_mode,
-                                          search_mode,
-                                          data_prefix,
-                                          index_prefix,
-                                          override_kv,
-                                          metric_objective);
-        } else {
-          log_error("datatype '%s' is not supported", dtype.c_str());
-          return -1;
-        }
-
-        ::benchmark::Initialize(&argc, argv, printf_usage);
-        if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1;
-        ::benchmark::RunSpecifiedBenchmarks();
-        ::benchmark::Shutdown();
-        // Release a possibly cached ANN object, so that it cannot be alive longer than the handle
-        // to a shared library it depends on (dynamic benchmark executable).
-        current_algo.reset();
-        return 0;
+  } else {
+    log_error("datatype '%s' is not supported", dtype.c_str());
+    return -1;
   }
 
+  ::benchmark::Initialize(&argc, argv, printf_usage);
+  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1;
+  ::benchmark::RunSpecifiedBenchmarks();
+  ::benchmark::Shutdown();
+  // Release a possibly cached ANN object, so that it cannot be alive longer than the handle
+  // to a shared library it depends on (dynamic benchmark executable).
+  current_algo.reset();
+  return 0;
+}
+
 };  // namespace raft::bench::ann

From adf2f44653bc3ea9e156956fa792353b89bdb6ac Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 01:02:58 -0400
Subject: [PATCH 07/19] Minor things

---
 cpp/bench/ann/src/common/benchmark.hpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index bd987d4f23..9a58a5f6eb 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -254,12 +254,13 @@ void bench_search(::benchmark::State& state,
   {
     nvtx_case nvtx{state.name()};
 
+    // TODO: Have the odd threads load the queries backwards just to rule out caching.
+    ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
     for (auto _ : state) {
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
       [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
 
-      ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
-      auto start   = std::chrono::high_resolution_clock::now();
+      auto start = std::chrono::high_resolution_clock::now();
       // run the search
       try {
         algo->search(query_set + batch_offset * dataset->dim(),
@@ -272,7 +273,8 @@ void bench_search(::benchmark::State& state,
         state.SkipWithError(std::string(e.what()));
       }
 
-      auto end             = std::chrono::high_resolution_clock::now();
+      auto end = std::chrono::high_resolution_clock::now();
+
       auto elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
       // advance to the next batch
       batch_offset = (batch_offset + n_queries) % query_set_size;
@@ -292,7 +294,8 @@ void bench_search(::benchmark::State& state,
 
   if (state.skipped()) { return; }
 
-  if (state.thread_index() == 0) {
+  // Use the last thread as a sanity check that all the threads are working.
+  if (state.thread_index() == state.threads() - 1) {
     state.counters.insert({{"k", k}, {"n_queries", n_queries}});
 
     // evaluate recall

From 89786ff02f22ba26bb583d93e3479f51cfe488c2 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 09:38:30 -0400
Subject: [PATCH 08/19] Removing cudart_utils from benchmark.hpp

---
 cpp/bench/ann/src/common/benchmark.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 9a58a5f6eb..4059e723bf 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -20,8 +20,6 @@
 #include "dataset.hpp"
 #include "util.hpp"
 
-#include <raft/util/cudart_utils.hpp>
-
 #include <benchmark/benchmark.h>
 
 #include <rmm/cuda_stream_pool.hpp>

From 4573050c3b087b7584e936fced154ade64aaa4d4 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 11:39:33 -0400
Subject: [PATCH 09/19] Removing all cuda includes from benchmark.hpp

---
 cpp/bench/ann/src/common/benchmark.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 4059e723bf..a83adffdef 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -22,9 +22,6 @@
 
 #include <benchmark/benchmark.h>
 
-#include <rmm/cuda_stream_pool.hpp>
-
-#include "thread_pool.hpp"
 #include <algorithm>
 #include <chrono>
 #include <cmath>
@@ -382,11 +379,14 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
       auto suf = static_cast<std::string>(index.search_params[i]["override_suffix"]);
       index.search_params[i].erase("override_suffix");
 
+      int max_threads =
+        metric_objective == Objective::THROUGHPUT ? std::thread::hardware_concurrency() : 1;
+
       auto* b = ::benchmark::RegisterBenchmark(
                   index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
                   ->Unit(benchmark::kMillisecond)
-                  ->ThreadRange(1, 32)
-                  ->UseManualTime();
+                  ->UseManualTime()
+                  ->ThreadRange(1, max_threads);
     }
   }
 }

From 392f6a19452bf20d448b7f2c0ce3b571ebb46a96 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 16:19:04 -0400
Subject: [PATCH 10/19] Adding `end_to_end` time and verifying the
 total_queries / end_to_end ~ items_per_second.:qqx

---
 cpp/bench/ann/src/common/benchmark.hpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index a83adffdef..7b9f8e4208 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -245,6 +245,7 @@ void bench_search(::benchmark::State& state,
   std::shared_ptr<buf<std::size_t>> neighbors =
     std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);
 
+  auto start = std::chrono::high_resolution_clock::now();
   cuda_timer gpu_timer;
   {
     nvtx_case nvtx{state.name()};
@@ -278,7 +279,11 @@ void bench_search(::benchmark::State& state,
       total_time += elapsed_seconds.count();
     }
   }
-
+  auto end = std::chrono::high_resolution_clock::now();
+  if (state.thread_index() == 0) {
+    auto duration = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
+    state.counters.insert({{"end_to_end", duration}});
+  }
   state.SetItemsProcessed(queries_processed);
   if (cudart.found()) {
     state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
@@ -291,7 +296,7 @@ void bench_search(::benchmark::State& state,
 
   // Use the last thread as a sanity check that all the threads are working.
   if (state.thread_index() == state.threads() - 1) {
-    state.counters.insert({{"k", k}, {"n_queries", n_queries}});
+    //    state.counters.insert({{"k", k}, {"n_queries", n_queries}});
 
     // evaluate recall
     if (dataset->max_k() >= k) {
@@ -385,8 +390,17 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
       auto* b = ::benchmark::RegisterBenchmark(
                   index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
                   ->Unit(benchmark::kMillisecond)
-                  ->UseManualTime()
-                  ->ThreadRange(1, max_threads);
+                  ->ThreadRange(1, max_threads)
+
+                  /**
+                   * The following are important for getting accuracy QPS measurements on both CPU
+                   * and GPU These make sure that
+                   *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
+                   *   - `end_to_end` ~ (`Time` * `Iterations`)
+                   *   -
+                   */
+                  ->MeasureProcessCPUTime()
+                  ->UseRealTime()
     }
   }
 }

From 18b57ccd8f02da0bf49ed269c6a111a5abf03ff6 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 23 Oct 2023 18:05:35 -0400
Subject: [PATCH 11/19] More updates

---
 cpp/bench/ann/src/common/benchmark.hpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 7b9f8e4208..4feb2ab849 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -246,15 +246,15 @@ void bench_search(::benchmark::State& state,
     std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);
 
   auto start = std::chrono::high_resolution_clock::now();
-  cuda_timer gpu_timer;
+  //  cuda_timer gpu_timer;
   {
-    nvtx_case nvtx{state.name()};
+    //    nvtx_case nvtx{state.name()};
 
     // TODO: Have the odd threads load the queries backwards just to rule out caching.
     ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
     for (auto _ : state) {
-      [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      //      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      //      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
 
       auto start = std::chrono::high_resolution_clock::now();
       // run the search
@@ -264,7 +264,7 @@ void bench_search(::benchmark::State& state,
                      k,
                      neighbors->data + batch_offset * k,
                      distances->data + batch_offset * k,
-                     gpu_timer.stream());
+                     cudaStreamPerThread);
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
@@ -285,9 +285,9 @@ void bench_search(::benchmark::State& state,
     state.counters.insert({{"end_to_end", duration}});
   }
   state.SetItemsProcessed(queries_processed);
-  if (cudart.found()) {
-    state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
-  }
+  //  if (cudart.found()) {
+  //    state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
+  //  }
 
   // This will be the total number of queries across all threads
   state.counters.insert({{"total_queries", queries_processed}});
@@ -395,12 +395,12 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
                   /**
                    * The following are important for getting accuracy QPS measurements on both CPU
                    * and GPU These make sure that
-                   *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
                    *   - `end_to_end` ~ (`Time` * `Iterations`)
-                   *   -
+                   *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
+                   *   - `Time` = `end_to_end` / `Iterations`
                    */
                   ->MeasureProcessCPUTime()
-                  ->UseRealTime()
+                  ->UseRealTime();
     }
   }
 }

From bda392f7bce440de9c121eeae2a0fa0d4c3c0328 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 25 Oct 2023 19:53:34 -0400
Subject: [PATCH 12/19] Removing cudaStreamPerThread from benchmark.hpp

---
 cpp/bench/ann/CMakeLists.txt           | 18 +++++++-----------
 cpp/bench/ann/src/common/benchmark.hpp | 23 ++++++++++++-----------
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 1f98cdbe88..d6a5fddb98 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -15,22 +15,18 @@
 # ##################################################################################################
 # * benchmark options ------------------------------------------------------------------------------
 
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark"
-       OFF
-)
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF)
-option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT
-       "Include faiss' cpu brute-force knn algorithm in benchmark" OFF
-)
-option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark"
-       OFF
+       "Include faiss' cpu brute-force knn algorithm in benchmark" ON
 )
+option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)
 
 option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
-       OFF
+       ON
 )
-option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" OFF)
+option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON)
 option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON)
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 4feb2ab849..b1c63b26d6 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -187,7 +187,7 @@ void bench_search(::benchmark::State& state,
   const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries;
 
   if (!file_exists(index.file)) {
-    throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first.");
+    state.SkipWithError("Index file is missing. Run the benchmark in the build mode first.");
     return;
   }
 
@@ -246,15 +246,15 @@ void bench_search(::benchmark::State& state,
     std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);
 
   auto start = std::chrono::high_resolution_clock::now();
-  //  cuda_timer gpu_timer;
+  cuda_timer gpu_timer;
   {
-    //    nvtx_case nvtx{state.name()};
+    nvtx_case nvtx{state.name()};
 
     // TODO: Have the odd threads load the queries backwards just to rule out caching.
     ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
     for (auto _ : state) {
-      //      [[maybe_unused]] auto ntx_lap = nvtx.lap();
-      //      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
+      [[maybe_unused]] auto ntx_lap = nvtx.lap();
+      [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
 
       auto start = std::chrono::high_resolution_clock::now();
       // run the search
@@ -264,7 +264,7 @@ void bench_search(::benchmark::State& state,
                      k,
                      neighbors->data + batch_offset * k,
                      distances->data + batch_offset * k,
-                     cudaStreamPerThread);
+                     gpu_timer.stream());
       } catch (const std::exception& e) {
         state.SkipWithError(std::string(e.what()));
       }
@@ -285,9 +285,9 @@ void bench_search(::benchmark::State& state,
     state.counters.insert({{"end_to_end", duration}});
   }
   state.SetItemsProcessed(queries_processed);
-  //  if (cudart.found()) {
-  //    state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
-  //  }
+  if (cudart.found()) {
+    state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}});
+  }
 
   // This will be the total number of queries across all threads
   state.counters.insert({{"total_queries", queries_processed}});
@@ -296,8 +296,6 @@ void bench_search(::benchmark::State& state,
 
   // Use the last thread as a sanity check that all the threads are working.
   if (state.thread_index() == state.threads() - 1) {
-    //    state.counters.insert({{"k", k}, {"n_queries", n_queries}});
-
     // evaluate recall
     if (dataset->max_k() >= k) {
       const std::int32_t* gt          = dataset->gt_set();
@@ -398,6 +396,9 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
                    *   - `end_to_end` ~ (`Time` * `Iterations`)
                    *   - `items_per_second` ~ (`total_queries` / `end_to_end`)
                    *   - `Time` = `end_to_end` / `Iterations`
+                   *
+                   *   - Latency = `Time`
+                   *   - Throughput = `items_per_second`
                    */
                   ->MeasureProcessCPUTime()
                   ->UseRealTime();

From a21f66876e5bcf0bfe82d1a8e29504890bbf1a70 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 11:57:02 -0400
Subject: [PATCH 13/19] Removing thread pool. It's not being used

---
 cpp/bench/ann/src/common/thread_pool.hpp | 65 ------------------------
 1 file changed, 65 deletions(-)

diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp
index c28b315919..7eb6d88b96 100644
--- a/cpp/bench/ann/src/common/thread_pool.hpp
+++ b/cpp/bench/ann/src/common/thread_pool.hpp
@@ -133,68 +133,3 @@ class FixedThreadPool {
   std::vector<std::thread> threads_;
   std::atomic<bool> finished_{false};
 };
-
-class ThreadPool {
- public:
-  ThreadPool(unsigned num_threads = std::thread::hardware_concurrency())
-  {
-    if (num_threads <= 0) { return; }
-    while (num_threads--) {
-      threads.emplace_back([this] {
-        while (true) {
-          std::unique_lock<std::mutex> lock(mutex);
-          condvar.wait(lock, [this] { return !queue.empty(); });
-          auto task = std::move(queue.front());
-          if (task.valid()) {
-            queue.pop();
-            lock.unlock();
-            // run the task - this cannot throw; any exception
-            // will be stored in the corresponding future
-            task();
-          } else {
-            // an empty task is used to signal end of stream
-            // don't pop it off the top; all threads need to see it
-            break;
-          }
-        }
-      });
-    }
-  }
-
-  template <typename F, typename R = std::result_of_t<F && ()>>
-  std::future<R> run(F&& f) const
-  {
-    auto task   = std::packaged_task<R()>(std::forward<F>(f));
-    auto future = task.get_future();
-    {
-      std::lock_guard<std::mutex> lock(mutex);
-      // conversion to packaged_task<void()> erases the return type
-      // so it can be stored in the queue. the future will still
-      // contain the correct type
-      queue.push(std::packaged_task<void()>(std::move(task)));
-    }
-    condvar.notify_one();
-    return future;
-  }
-
-  ~ThreadPool()
-  {
-    // push a single empty task onto the queue and notify all threads,
-    // then wait for them to terminate
-
-    {
-      std::lock_guard<std::mutex> lock(mutex);
-      queue.push({});
-    }
-    condvar.notify_all();
-    for (auto& thread : threads) {
-      thread.join();
-    }
-  }
-
- private:
-  std::vector<std::thread> threads;
-  mutable std::queue<std::packaged_task<void()>> queue;
-  mutable std::mutex mutex;
-  mutable std::condition_variable condvar;
-};

From f592eb60da5c2563e399766f45a9d884faea911e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 12:42:25 -0400
Subject: [PATCH 14/19] Adding new mode to docs

---
 cpp/bench/ann/src/common/benchmark.hpp   | 1 -
 cpp/bench/ann/src/common/thread_pool.hpp | 2 --
 docs/source/raft_ann_benchmarks.md       | 4 ++++
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index b1c63b26d6..ec52013919 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -27,7 +27,6 @@
 #include <cmath>
 #include <cstdint>
 #include <fstream>
-#include <future>
 #include <limits>
 #include <memory>
 #include <numeric>
diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp
index 7eb6d88b96..c01fa2c32c 100644
--- a/cpp/bench/ann/src/common/thread_pool.hpp
+++ b/cpp/bench/ann/src/common/thread_pool.hpp
@@ -16,12 +16,10 @@
 #pragma once
 
 #include <atomic>
-#include <condition_variable>
 #include <future>
 #include <memory>
 #include <mutex>
 #include <omp.h>
-#include <queue>
 #include <stdexcept>
 #include <thread>
 #include <utility>
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index fadca595fb..4cbdef2652 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -141,6 +141,10 @@ options:
                         run only comma separated list of named algorithms (default: None)
   --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
   -f, --force           re-run algorithms even if their results already exist (default: False)
+  -m MODE, --search_mode MODE
+                        run search in 'latency' (measure individual batches) or 
+                        'throughput' (pipeline batches and measure end-to-end) mode.
+                        (default: 'latency')
 ```
 
 `configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset.

From ab6e0d03fd569a877e4c1beae5fc25fec27a4480 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 12:43:01 -0400
Subject: [PATCH 15/19] Fixing "search-mode" option

---
 docs/source/raft_ann_benchmarks.md                       | 2 +-
 python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 4cbdef2652..1e3958b30f 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -141,7 +141,7 @@ options:
                         run only comma separated list of named algorithms (default: None)
   --indices INDICES     run only comma separated list of named indices. parameter `algorithms` is ignored (default: None)
   -f, --force           re-run algorithms even if their results already exist (default: False)
-  -m MODE, --search_mode MODE
+  -m MODE, --search-mode MODE
                         run search in 'latency' (measure individual batches) or 
                         'throughput' (pipeline batches and measure end-to-end) mode.
                         (default: 'latency')
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index e0583d7708..75b7734349 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -216,7 +216,7 @@ def main():
 
     parser.add_argument(
         "-m",
-        "--search_mode",
+        "--search-mode",
         help="run search in 'latency' (measure individual batches) or "
         "'throughput' (pipeline batches and measure end-to-end) mode",
         default="throughput",

From aec2664d48fb3b045f888541bf64a5e1e63f9de1 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 13:28:01 -0400
Subject: [PATCH 16/19] Updates based on review feedback

---
 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h    |  5 +++--
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h    | 11 ++---------
 cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h | 13 +++----------
 cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h   | 14 ++------------
 4 files changed, 10 insertions(+), 33 deletions(-)

diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index 66776ee3bc..23cae6352c 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -164,8 +164,9 @@ void HnswLib<T>::set_search_param(const AnnSearchParam& param_)
   appr_alg_->ef_    = param.ef;
   metric_objective_ = param.metric_objective;
 
-  if (metric_objective_ != Objective::LATENCY &&
-      (!thread_pool_ || num_threads_ != param.num_threads)) {
+  bool use_pool = (metric_objective_ == Objective::LATENCY && param.num_threads > 1) &&
+                  (!thread_pool_ || num_threads_ != param.num_threads);
+  if (use_pool) {
     num_threads_ = param.num_threads;
     thread_pool_ = std::make_unique<FixedThreadPool>(num_threads_);
   }
diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index a22cb41d46..f1c8154b7c 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -53,18 +53,13 @@ class RaftCagra : public ANN<T> {
   using BuildParam = raft::neighbors::cagra::index_params;
 
   RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1)
-    : ANN<T>(metric, dim),
-      index_params_(param),
-      dimension_(dim),
-      mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull),
-      handle_(cudaStreamPerThread)
+    : ANN<T>(metric, dim), index_params_(param), dimension_(dim), handle_(cudaStreamPerThread)
   {
-    rmm::mr::set_current_device_resource(&mr_);
     index_params_.metric = parse_metric_type(metric);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
   }
 
-  ~RaftCagra() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); }
+  ~RaftCagra() noexcept {}
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
 
@@ -93,8 +88,6 @@ class RaftCagra : public ANN<T> {
   void load(const std::string&) override;
 
  private:
-  // `mr_` must go first to make sure it dies last
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr_;
   raft::device_resources handle_;
   BuildParam index_params_;
   raft::neighbors::cagra::search_params search_params_;
diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
index b6df7de068..24b3c69bb6 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
@@ -52,18 +52,14 @@ class RaftIvfFlatGpu : public ANN<T> {
   using BuildParam = raft::neighbors::ivf_flat::index_params;
 
   RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param)
-    : ANN<T>(metric, dim),
-      index_params_(param),
-      dimension_(dim),
-      mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull)
+    : ANN<T>(metric, dim), index_params_(param), dimension_(dim)
   {
     index_params_.metric                         = parse_metric_type(metric);
     index_params_.conservative_memory_allocation = true;
-    rmm::mr::set_current_device_resource(&mr_);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
   }
 
-  ~RaftIvfFlatGpu() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); }
+  ~RaftIvfFlatGpu() noexcept {}
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
 
@@ -90,8 +86,6 @@ class RaftIvfFlatGpu : public ANN<T> {
   void load(const std::string&) override;
 
  private:
-  // `mr_` must go first to make sure it dies last
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr_;
   raft::device_resources handle_;
   BuildParam index_params_;
   raft::neighbors::ivf_flat::search_params search_params_;
@@ -134,10 +128,9 @@ template <typename T, typename IdxT>
 void RaftIvfFlatGpu<T, IdxT>::search(
   const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const
 {
-  rmm::mr::device_memory_resource* mr_ptr = &const_cast<RaftIvfFlatGpu*>(this)->mr_;
   static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t");
   raft::neighbors::ivf_flat::search(
-    handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr);
+    handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances);
   resource::sync_stream(handle_);
   return;
 }
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
index 1b74dcf975..e4004b0007 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
@@ -54,22 +54,14 @@ class RaftIvfPQ : public ANN<T> {
   using BuildParam = raft::neighbors::ivf_pq::index_params;
 
   RaftIvfPQ(Metric metric, int dim, const BuildParam& param)
-    : ANN<T>(metric, dim),
-      index_params_(param),
-      dimension_(dim),
-      mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull)
+    : ANN<T>(metric, dim), index_params_(param), dimension_(dim)
   {
-    rmm::mr::set_current_device_resource(&mr_);
     index_params_.metric = parse_metric_type(metric);
     RAFT_CUDA_TRY(cudaGetDevice(&device_));
     RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming));
   }
 
-  ~RaftIvfPQ() noexcept
-  {
-    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_));
-    rmm::mr::set_current_device_resource(mr_.get_upstream());
-  }
+  ~RaftIvfPQ() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); }
 
   void build(const T* dataset, size_t nrow, cudaStream_t stream) final;
 
@@ -97,8 +89,6 @@ class RaftIvfPQ : public ANN<T> {
   void load(const std::string&) override;
 
  private:
-  // `mr_` must go first to make sure it dies last
-  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr_;
   raft::device_resources handle_;
   cudaEvent_t sync_{nullptr};
   BuildParam index_params_;

From 3b8eb2262e2f4dcc175b959152464db9af26b345 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 13:28:33 -0400
Subject: [PATCH 17/19] Apply suggestions from code review

Co-authored-by: Ben Frederickson <github@benfrederickson.com>
---
 cpp/bench/ann/src/common/ann_types.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index f82afb35e9..2c1105a272 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -26,7 +26,7 @@ namespace raft::bench::ann {
 
 enum Objective {
   THROUGHPUT,  // See how many vectors we can push through
-  LATENCY      // See how many
+  LATENCY      // See how fast we can push a vector through
 };
 
 enum class MemoryType {

From 92f140320121427d3d51f921863fd8a96e5330cd Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 14:06:00 -0400
Subject: [PATCH 18/19] Update
 python/raft-ann-bench/src/raft-ann-bench/run/__main__.py

Co-authored-by: Ben Frederickson <github@benfrederickson.com>
---
 python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index 75b7734349..32517494cb 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -121,7 +121,6 @@ def run_build_and_search(
                 "--search",
                 "--data_prefix=" + dataset_path,
                 "--benchmark_counters_tabular",
-                # "--benchmark_min_time=1x",
                 "--override_kv=k:%s" % k,
                 "--override_kv=n_queries:%s" % batch_size,
                 "--benchmark_min_warmup_time=0.01",

From b813d19a9c11a6cd923323327dd7fb22ff5bdf20 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 27 Oct 2023 16:18:01 -0400
Subject: [PATCH 19/19] Updating docs

---
 cpp/bench/ann/src/common/benchmark.hpp        |  3 +-
 docs/source/raft_ann_benchmarks.md            | 33 +++++++++++++++++++
 .../src/raft-ann-bench/run/__main__.py        |  2 +-
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index ec52013919..10a256bd63 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -155,7 +155,7 @@ void bench_build(::benchmark::State& state,
     }
   }
   state.counters.insert(
-    {{"GPU Time", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}});
+    {{"GPU", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}});
 
   if (state.skipped()) { return; }
   make_sure_parent_dir_exists(index.file);
@@ -367,6 +367,7 @@ void register_build(std::shared_ptr<const Dataset<T>> dataset,
     auto* b = ::benchmark::RegisterBenchmark(
       index.name + suf, bench_build<T>, dataset, index, force_overwrite);
     b->Unit(benchmark::kSecond);
+    b->MeasureProcessCPUTime();
     b->UseRealTime();
   }
 }
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 1e3958b30f..6a436a7213 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -16,6 +16,7 @@ This project provides a benchmark program for various ANN search implementations
   - [End to end: small-scale (<1M to 10M)](#end-to-end-small-scale-benchmarks-1m-to-10m)
   - [End to end: large-scale (>10M)](#end-to-end-large-scale-benchmarks-10m-vectors)
   - [Running with Docker containers](#running-with-docker-containers)
+  - [Evaluating the results](#evaluating-the-results)
 - [Creating and customizing dataset configurations](#creating-and-customizing-dataset-configurations)
 - [Adding a new ANN algorithm](#adding-a-new-ann-algorithm)
 - [Parameter tuning guide](https://docs.rapids.ai/api/raft/nightly/ann_benchmarks_param_tuning/)
@@ -359,6 +360,38 @@ This will drop you into a command line in the container, with the `raft-ann-benc
 
 Additionally, the containers can be run in detached mode without any issue.
 
+
+### Evaluating the results
+
+The benchmarks capture several different measurements. The table below describes each of the measurements for index build benchmarks:
+
+| Name       | Description                                            | 
+|------------|--------------------------------------------------------|
+| Benchmark  | A name that uniquely identifies the benchmark instance | 
+| Time       | Wall-time spent training the index                     | 
+| CPU        | CPU time spent training the index                      |
+| Iterations | Number of iterations (this is usually 1)               |
+| GPU        | GPU time spent building                                |
+| index_size | Number of vectors used to train index |
+
+
+The table below describes each of the measurements for the index search benchmarks:
+
+| Name | Description                                                                                                                                           |
+|------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Benchmark | A name that uniquely identifies the benchmark instance                                                                                                |
+| Time | The average runtime for each batch. This is approximately `end_to_end` / `Iterations`                                                                 |
+| CPU | The average `wall-time`. In `throughput` mode, this is the average `wall-time` spent in each thread.                                                  |
+| Iterations | Total number of batches. This is going to be `total_queres` / `n_queries`                                                                             | 
+| Recall | Proportion of correct neighbors to ground truth neighbors. Note this column is only present if groundtruth file is specified in dataset configuration |
+| items_per_second | Total throughput. This is approximately `total_queries` / `end_to_end`.                                                                               |
+| k | Number of neighbors being queried in each iteration                                                                                                   |
+| end_to_end | Total time taken to run all batches for all iterations                                                                                                | 
+| n_queries | Total number of query vectors in each batch                                                                                                           |
+| total_queries | Total number of vectors queries across all iterations                                                                                                 |
+
+Note that the actual table displayed on the screen may differ slightly as the hyper-parameters will also be displayed for each different combination being benchmarked.
+
 ## Creating and customizing dataset configurations
 
 A single configuration file will often define a set of algorithms, with associated index and search parameters, for a specific dataset. A configuration file uses json format with 4 major parts:
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index 75b7734349..30d642f3ac 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -105,6 +105,7 @@ def run_build_and_search(
                 "--build",
                 "--data_prefix=" + dataset_path,
                 "--benchmark_out_format=json",
+                "--benchmark_counters_tabular=true",
                 "--benchmark_out="
                 + f"{os.path.join(build_folder, f'{algo}.json')}",
             ]
@@ -121,7 +122,6 @@ def run_build_and_search(
                 "--search",
                 "--data_prefix=" + dataset_path,
                 "--benchmark_counters_tabular",
-                # "--benchmark_min_time=1x",
                 "--override_kv=k:%s" % k,
                 "--override_kv=n_queries:%s" % batch_size,
                 "--benchmark_min_warmup_time=0.01",