From 5f740abfe425a020b0d59c2dc1d4141c0ee224df Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 20 Oct 2023 18:24:18 -0400 Subject: [PATCH 01/19] Adding throughput mode to benhcmarksy --- cpp/bench/ann/src/common/ann_types.hpp | 5 + cpp/bench/ann/src/common/benchmark.hpp | 118 ++++++++++++++---- cpp/bench/ann/src/common/thread_pool.hpp | 73 +++++++++++ .../src/raft-ann-bench/run/__main__.py | 12 ++ 4 files changed, 187 insertions(+), 21 deletions(-) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index 33716bd45a..d32c8b64b3 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -24,6 +24,11 @@ namespace raft::bench::ann { +enum Objective { + THROUGHPUT, // See how many vectors we can push through + LATENCY // See how many +}; + enum class MemoryType { Host, HostMmap, diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 4ec977700d..4991bcae71 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -22,6 +22,9 @@ #include +#include + +#include "thread_pool.hpp" #include #include #include @@ -164,7 +167,8 @@ template void bench_search(::benchmark::State& state, std::shared_ptr> dataset, Configuration::Index index, - std::size_t search_param_ix) + std::size_t search_param_ix, + Objective metric_objective) { const auto& sp_json = index.search_params[search_param_ix]; dump_parameters(state, sp_json); @@ -221,21 +225,52 @@ void bench_search(::benchmark::State& state, std::ptrdiff_t batch_offset = 0; std::size_t queries_processed = 0; + cuda_timer gpu_timer; + { + /** + * When the objective is throughput, we want to overlap batches + * as much as possible and measure the end-to-end time from start + * to finish. + * + * When the objective is latency, we want to measure each batch + * individually. Latency is better measured in single-query batches + * but larger batches are still allowed in this mode in order to + * compare against the resulting batch sizes in throughput mode. + */ + bool overlap_queries = metric_objective == Objective::THROUGHPUT; + ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0); + + // TODO: Make this configurable + rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 0); + nvtx_case nvtx{state.name()}; + for (auto _ : state) { - // measure the GPU time using the RAII helper - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + if (metric_objective == Objective::LATENCY) { + // measure the GPU time using the RAII helper + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + } // run the search try { - algo->search(query_set + batch_offset * dataset->dim(), - n_queries, - k, - neighbors.data + batch_offset * k, - distances.data + batch_offset * k, - gpu_timer.stream()); + cudaStream_t stream = + overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream(); + auto f = [=]() { + algo->search(query_set + batch_offset * dataset->dim(), + n_queries, + k, + neighbors.data + batch_offset * k, + distances.data + batch_offset * k, + stream); + }; + + if (overlap_queries) { + thread_pool.run(f); + } else { + f(); + } } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } @@ -243,7 +278,23 @@ void bench_search(::benchmark::State& state, batch_offset = (batch_offset + n_queries) % query_set_size; queries_processed += n_queries; } + + /** + * If in throughput mode, wait for all results + */ + if (metric_objective == Objective::THROUGHPUT) { + for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) { + cudaStreamSynchronize(stream_pool.get_stream(i)); + } + + thread_pool.synchronize(); + + // measure the GPU time using the RAII helper + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + } } + state.SetItemsProcessed(queries_processed); state.counters.insert({{"k", k}, {"n_queries", n_queries}}); if (cudart.found()) { @@ -325,14 +376,15 @@ void register_build(std::shared_ptr> dataset, template void register_search(std::shared_ptr> dataset, - std::vector indices) + std::vector indices, + Objective metric_objective) { for (auto index : indices) { for (std::size_t i = 0; i < index.search_params.size(); i++) { auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); - auto* b = - ::benchmark::RegisterBenchmark(index.name + suf, bench_search, dataset, index, i); + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, dataset, index, i, metric_objective); b->Unit(benchmark::kMillisecond); b->UseRealTime(); } @@ -346,7 +398,8 @@ void dispatch_benchmark(const Configuration& conf, bool search_mode, std::string data_prefix, std::string index_prefix, - kv_series override_kv) + kv_series override_kv, + Objective metric_objective) { if (cudart.found()) { for (auto [key, value] : cuda_info()) { @@ -414,7 +467,7 @@ void dispatch_benchmark(const Configuration& conf, index.search_params = apply_overrides(index.search_params, override_kv); index.file = combine_path(index_prefix, index.file); } - register_search(dataset, indices); + register_search(dataset, indices, metric_objective); } } @@ -445,6 +498,7 @@ inline auto run_main(int argc, char** argv) -> int std::string data_prefix = "data"; std::string index_prefix = "index"; std::string new_override_kv = ""; + std::string mode = "latency"; kv_series override_kv{}; char arg0_default[] = "benchmark"; // NOLINT @@ -467,6 +521,7 @@ inline auto run_main(int argc, char** argv) -> int parse_bool_flag(argv[i], "--search", search_mode) || parse_string_flag(argv[i], "--data_prefix", data_prefix) || parse_string_flag(argv[i], "--index_prefix", index_prefix) || + parse_string_flag(argv[i], "--mode", mode) || parse_string_flag(argv[i], "--override_kv", new_override_kv)) { if (!new_override_kv.empty()) { auto kvv = split(new_override_kv, ':'); @@ -486,6 +541,9 @@ inline auto run_main(int argc, char** argv) -> int } } + Objective metric_objective = Objective::LATENCY; + if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } + if (build_mode == search_mode) { log_error("One and only one of --build and --search should be specified"); printf_usage(); @@ -505,14 +563,32 @@ inline auto run_main(int argc, char** argv) -> int std::string dtype = conf.get_dataset_conf().dtype; if (dtype == "float") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); } else if (dtype == "uint8") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); } else if (dtype == "int8") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); } else { log_error("datatype '%s' is not supported", dtype.c_str()); return -1; diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp index efea938d5b..def38ec0b5 100644 --- a/cpp/bench/ann/src/common/thread_pool.hpp +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -16,10 +16,12 @@ #pragma once #include +#include #include #include #include #include +#include #include #include #include @@ -72,6 +74,7 @@ class FixedThreadPool { template void submit(Func f, IdxT len) { + // Run functions in main thread if thread pool has no threads if (threads_.empty()) { for (IdxT i = 0; i < len; ++i) { f(i); @@ -84,6 +87,7 @@ class FixedThreadPool { const IdxT items_per_thread = len / (num_threads + 1); std::atomic cnt(items_per_thread * num_threads); + // Wrap function auto wrapped_f = [&](IdxT start, IdxT end) { for (IdxT i = start; i < end; ++i) { f(i); @@ -129,3 +133,72 @@ class FixedThreadPool { std::vector threads_; std::atomic finished_{false}; }; + +class ThreadPool { + public: + ThreadPool(unsigned num_threads = std::thread::hardware_concurrency()) + { + while (num_threads--) { + threads.emplace_back([this] { + while (true) { + std::unique_lock lock(mutex); + condvar.wait(lock, [this] { return !queue.empty(); }); + auto task = std::move(queue.front()); + if (task.valid()) { + queue.pop(); + lock.unlock(); + // run the task - this cannot throw; any exception + // will be stored in the corresponding future + task(); + } else { + // an empty task is used to signal end of stream + // don't pop it off the top; all threads need to see it + break; + } + } + }); + } + } + + template > + std::future run(F&& f) const + { + auto task = std::packaged_task(std::forward(f)); + auto future = task.get_future(); + { + std::lock_guard lock(mutex); + // conversion to packaged_task erases the return type + // so it can be stored in the queue. the future will still + // contain the correct type + queue.push(std::packaged_task(std::move(task))); + } + condvar.notify_one(); + return future; + } + + void synchronize() + { + condvar.notify_all(); + for (auto& thread : threads) { + thread.join(); + } + } + + ~ThreadPool() + { + // push a single empty task onto the queue and notify all threads, + // then wait for them to terminate + { + std::lock_guard lock(mutex); + queue.push({}); + } + + synchronize(); + } + + private: + std::vector threads; + mutable std::queue> queue; + mutable std::mutex mutex; + mutable std::condition_variable condvar; +}; diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index a0d4fabb77..d1f85f05db 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -78,6 +78,7 @@ def run_build_and_search( search, k, batch_size, + mode="throughput", ): for executable, ann_executable_path, algo in executables_to_run.keys(): # Need to write temporary configuration @@ -126,6 +127,7 @@ def run_build_and_search( "--benchmark_out_format=json", "--benchmark_out=" + f"{os.path.join(search_folder, f'{algo}.json')}", + "--mode=" % str(mode), ] if force: cmd = cmd + ["--overwrite"] @@ -211,6 +213,14 @@ def main(): action="store_true", ) + parser.add_argument( + "-m", + "--search_mode", + help="run search in 'latency' (measure individual batches) or " + "'throughput' (pipeline batches and measure end-to-end) mode", + default="throughput", + ) + args = parser.parse_args() # If both build and search are not provided, @@ -222,6 +232,7 @@ def main(): build = args.build search = args.search + mode = args.search_mode k = args.count batch_size = args.batch_size @@ -316,6 +327,7 @@ def main(): search, k, batch_size, + mode, ) From 1f75edecad62a2f5f41bac2699c638a8e7f16f07 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 20 Oct 2023 18:42:40 -0400 Subject: [PATCH 02/19] Adding docs --- cpp/bench/ann/src/common/benchmark.hpp | 6 +++++- cpp/bench/ann/src/common/thread_pool.hpp | 1 + python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 4991bcae71..3dedbdd014 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -339,6 +339,7 @@ inline void printf_usage() " [--data_prefix=]\n" " [--index_prefix=]\n" " [--override_kv=]\n" + " [--mode=\n" " .json\n" "\n" "Note the non-standard benchmark parameters:\n" @@ -353,7 +354,10 @@ inline void printf_usage() " --override_kv=:" " override a build/search key one or more times multiplying the number of configurations;" " you can use this parameter multiple times to get the Cartesian product of benchmark" - " configs.\n"); + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n"); } template diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp index def38ec0b5..d904be2000 100644 --- a/cpp/bench/ann/src/common/thread_pool.hpp +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -188,6 +188,7 @@ class ThreadPool { { // push a single empty task onto the queue and notify all threads, // then wait for them to terminate + { std::lock_guard lock(mutex); queue.push({}); diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index d1f85f05db..02cfadf809 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -127,7 +127,7 @@ def run_build_and_search( "--benchmark_out_format=json", "--benchmark_out=" + f"{os.path.join(search_folder, f'{algo}.json')}", - "--mode=" % str(mode), + "--mode=%s" % mode, ] if force: cmd = cmd + ["--overwrite"] From c11685c3d46d8acd67d9082901eafea16dfa82ad Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 20 Oct 2023 21:35:36 -0400 Subject: [PATCH 03/19] Will probably have to integrate stream logic into the threads of the thread pool --- cpp/bench/ann/CMakeLists.txt | 24 +++++++------- cpp/bench/ann/src/common/ann_types.hpp | 3 +- cpp/bench/ann/src/common/benchmark.hpp | 33 +++++++++++++++---- cpp/bench/ann/src/common/thread_pool.hpp | 15 +++------ cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 26 ++++++++++----- cpp/bench/ann/src/raft/raft_benchmark.cu | 10 +++++- .../src/raft-ann-bench/run/__main__.py | 1 + 7 files changed, 74 insertions(+), 38 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 502f371a25..1f98cdbe88 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,18 +15,22 @@ # ################################################################################################## # * benchmark options ------------------------------------------------------------------------------ -option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" + OFF +) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT - "Include faiss' cpu brute-force knn algorithm in benchmark" ON + "Include faiss' cpu brute-force knn algorithm in benchmark" OFF +) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" + OFF ) -option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark" - ON + OFF ) -option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" OFF) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) @@ -106,10 +110,8 @@ if(RAFT_ANN_BENCH_USE_GGNN) endif() if(RAFT_ANN_BENCH_USE_FAISS) - # We need to ensure that faiss has all the conda - # information. So we currently use the very ugly - # hammer of `link_libraries` to ensure that all - # targets in this directory and the faiss directory + # We need to ensure that faiss has all the conda information. So we currently use the very ugly + # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory # will have the conda includes/link dirs link_libraries($) include(cmake/thirdparty/get_faiss.cmake) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index d32c8b64b3..f192d56e89 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -84,7 +84,8 @@ template class ANN : public AnnBase { public: struct AnnSearchParam { - virtual ~AnnSearchParam() = default; + Objective metric_objective = Objective::LATENCY; + virtual ~AnnSearchParam() = default; [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; }; }; diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 3dedbdd014..e5b2014678 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -20,6 +20,8 @@ #include "dataset.hpp" #include "util.hpp" +#include + #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -200,7 +203,8 @@ void bench_search(::benchmark::State& state, algo->load(index_file); current_algo = std::move(ualgo); } - search_param = ann::create_search_param(index.algo, sp_json); + search_param = ann::create_search_param(index.algo, sp_json); + search_param->metric_objective = metric_objective; } catch (const std::exception& e) { return state.SkipWithError("Failed to create an algo: " + std::string(e.what())); } @@ -228,7 +232,10 @@ void bench_search(::benchmark::State& state, cuda_timer gpu_timer; + uint32_t start = raft::curTimeMillis(); + { + std::vector> futures; /** * When the objective is throughput, we want to overlap batches * as much as possible and measure the end-to-end time from start @@ -243,10 +250,11 @@ void bench_search(::benchmark::State& state, ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0); // TODO: Make this configurable - rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 0); + rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 1); nvtx_case nvtx{state.name()}; + printf("Running state...\n"); for (auto _ : state) { if (metric_objective == Objective::LATENCY) { // measure the GPU time using the RAII helper @@ -257,7 +265,7 @@ void bench_search(::benchmark::State& state, try { cudaStream_t stream = overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream(); - auto f = [=]() { + auto f = [&]() { algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, @@ -267,7 +275,7 @@ void bench_search(::benchmark::State& state, }; if (overlap_queries) { - thread_pool.run(f); + futures.emplace_back(thread_pool.run(f)); } else { f(); } @@ -279,22 +287,35 @@ void bench_search(::benchmark::State& state, queries_processed += n_queries; } + printf("Done running state\n"); + /** * If in throughput mode, wait for all results */ if (metric_objective == Objective::THROUGHPUT) { + // Synchronize all the threads. + for (auto& f : futures) { + f.wait(); + } + + std::cout << "Futures size: " << futures.size() << std::endl; + futures.clear(); + for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) { cudaStreamSynchronize(stream_pool.get_stream(i)); } - thread_pool.synchronize(); - + printf("collecting metrics\n"); // measure the GPU time using the RAII helper [[maybe_unused]] auto ntx_lap = nvtx.lap(); [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); } } + uint32_t stop = raft::curTimeMillis(); + + std::cout << "Took: " << (stop - start) << std::endl; + state.SetItemsProcessed(queries_processed); state.counters.insert({{"k", k}, {"n_queries", n_queries}}); if (cudart.found()) { diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp index d904be2000..c28b315919 100644 --- a/cpp/bench/ann/src/common/thread_pool.hpp +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -138,6 +138,7 @@ class ThreadPool { public: ThreadPool(unsigned num_threads = std::thread::hardware_concurrency()) { + if (num_threads <= 0) { return; } while (num_threads--) { threads.emplace_back([this] { while (true) { @@ -176,14 +177,6 @@ class ThreadPool { return future; } - void synchronize() - { - condvar.notify_all(); - for (auto& thread : threads) { - thread.join(); - } - } - ~ThreadPool() { // push a single empty task onto the queue and notify all threads, @@ -193,8 +186,10 @@ class ThreadPool { std::lock_guard lock(mutex); queue.push({}); } - - synchronize(); + condvar.notify_all(); + for (auto& thread : threads) { + thread.join(); + } } private: diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index df44605493..81c50f2c2e 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -103,6 +103,7 @@ class HnswLib : public ANN { int m_; int num_threads_; std::unique_ptr thread_pool_; + Objective metric_objective_; }; template @@ -159,10 +160,12 @@ void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) template void HnswLib::set_search_param(const AnnSearchParam& param_) { - auto param = dynamic_cast(param_); - appr_alg_->ef_ = param.ef; + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + metric_objective_ = param.metric_objective; - if (!thread_pool_ || num_threads_ != param.num_threads) { + if (metric_objective_ != Objective::LATENCY && + (!thread_pool_ || num_threads_ != param.num_threads)) { num_threads_ = param.num_threads; thread_pool_ = std::make_unique(num_threads_); } @@ -172,12 +175,17 @@ template void HnswLib::search( const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const { - thread_pool_->submit( - [&](int i) { - // hnsw can only handle a single vector at a time. - get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); - }, - batch_size); + auto f = [&](int i) { + // hnsw can only handle a single vector at a time. + get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }; + if (metric_objective_ == Objective::LATENCY) { + thread_pool_->submit(f, batch_size); + } else { + for (int i = 0; i < batch_size; i++) { + f(i); + } + } } template diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index fa20c5c223..3b9bcc7e15 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -272,5 +272,13 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; + rmm::mr::set_current_device_resource( + &pool_mr); // Updates the current device resource pointer to `pool_mr` + return raft::bench::ann::run_main(argc, argv); +} #endif diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 02cfadf809..05858d0ce9 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -121,6 +121,7 @@ def run_build_and_search( "--search", "--data_prefix=" + dataset_path, "--benchmark_counters_tabular", + "--benchmark_min_time=1x", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, "--benchmark_min_warmup_time=0.01", From 7d8c907fe0e4e25b05789c54d14824c43b4bf022 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sun, 22 Oct 2023 19:56:45 -0400 Subject: [PATCH 04/19] Still some work to do and need to understand how googlebench multithreads the processing a bit better, but so far things are looking good. --- cpp/bench/ann/src/common/benchmark.hpp | 154 +++++++----------- cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 5 +- .../src/raft-ann-bench/run/__main__.py | 2 +- 3 files changed, 60 insertions(+), 101 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index e5b2014678..50d3ef00b1 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -168,13 +168,16 @@ void bench_build(::benchmark::State& state, template void bench_search(::benchmark::State& state, - std::shared_ptr> dataset, Configuration::Index index, std::size_t search_param_ix, - Objective metric_objective) + std::shared_ptr> dataset) { + std::ptrdiff_t batch_offset = 0; + std::atomic queries_processed(0); + const auto& sp_json = index.search_params[search_param_ix]; - dump_parameters(state, sp_json); + + if (state.thread_index() == 0) { dump_parameters(state, sp_json); } // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp const std::uint32_t k = sp_json["k"]; @@ -184,7 +187,7 @@ void bench_search(::benchmark::State& state, const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries; if (!file_exists(index.file)) { - state.SkipWithError("Index file is missing. Run the benchmark in the build mode first."); + throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first."); return; } // algo is static to cache it between close search runs to save time on index loading @@ -193,7 +196,8 @@ void bench_search(::benchmark::State& state, current_algo.reset(); index_file = index.file; } - ANN* algo; + + ANN* algo; // TODO: Just have one thread load this. std::unique_ptr::AnnSearchParam> search_param; try { if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { @@ -203,15 +207,17 @@ void bench_search(::benchmark::State& state, algo->load(index_file); current_algo = std::move(ualgo); } - search_param = ann::create_search_param(index.algo, sp_json); - search_param->metric_objective = metric_objective; + search_param = ann::create_search_param(index.algo, sp_json); + // search_param->metric_objective = metric_objective; } catch (const std::exception& e) { - return state.SkipWithError("Failed to create an algo: " + std::string(e.what())); + throw std::runtime_error("Failed to create an algo: " + std::string(e.what())); } algo->set_search_param(*search_param); const auto algo_property = parse_algo_property(algo->get_preference(), sp_json); const T* query_set = dataset->query_set(algo_property.query_memory_type); + + // TODO: Have 1 thread create and load these. buf distances{algo_property.query_memory_type, k * query_set_size}; buf neighbors{algo_property.query_memory_type, k * query_set_size}; @@ -220,22 +226,15 @@ void bench_search(::benchmark::State& state, algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type), dataset->base_set_size()); } catch (const std::exception& ex) { - state.SkipWithError("The algorithm '" + index.name + - "' requires the base set, but it's not available. " + - "Exception: " + std::string(ex.what())); + throw std::runtime_error("The algorithm '" + index.name + + "' requires the base set, but it's not available. " + + "Exception: " + std::string(ex.what())); return; } } - std::ptrdiff_t batch_offset = 0; - std::size_t queries_processed = 0; - cuda_timer gpu_timer; - - uint32_t start = raft::curTimeMillis(); - { - std::vector> futures; /** * When the objective is throughput, we want to overlap batches * as much as possible and measure the end-to-end time from start @@ -246,39 +245,21 @@ void bench_search(::benchmark::State& state, * but larger batches are still allowed in this mode in order to * compare against the resulting batch sizes in throughput mode. */ - bool overlap_queries = metric_objective == Objective::THROUGHPUT; - ThreadPool thread_pool(overlap_queries ? std::thread::hardware_concurrency() : 0); - - // TODO: Make this configurable - rmm::cuda_stream_pool stream_pool(overlap_queries ? std::thread::hardware_concurrency() : 1); nvtx_case nvtx{state.name()}; - printf("Running state...\n"); + // Multithreading starts in the benchmark loop for (auto _ : state) { - if (metric_objective == Objective::LATENCY) { - // measure the GPU time using the RAII helper - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); - } + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); // run the search try { - cudaStream_t stream = - overlap_queries ? stream_pool.get_stream().value() : gpu_timer.stream(); - auto f = [&]() { - algo->search(query_set + batch_offset * dataset->dim(), - n_queries, - k, - neighbors.data + batch_offset * k, - distances.data + batch_offset * k, - stream); - }; - - if (overlap_queries) { - futures.emplace_back(thread_pool.run(f)); - } else { - f(); - } + algo->search(query_set + batch_offset * dataset->dim(), + n_queries, + k, + neighbors.data + batch_offset * k, + distances.data + batch_offset * k, + gpu_timer.stream()); } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } @@ -286,67 +267,42 @@ void bench_search(::benchmark::State& state, batch_offset = (batch_offset + n_queries) % query_set_size; queries_processed += n_queries; } - - printf("Done running state\n"); - - /** - * If in throughput mode, wait for all results - */ - if (metric_objective == Objective::THROUGHPUT) { - // Synchronize all the threads. - for (auto& f : futures) { - f.wait(); - } - - std::cout << "Futures size: " << futures.size() << std::endl; - futures.clear(); - - for (std::size_t i = 0; i < stream_pool.get_pool_size(); i++) { - cudaStreamSynchronize(stream_pool.get_stream(i)); - } - - printf("collecting metrics\n"); - // measure the GPU time using the RAII helper - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); - } } - uint32_t stop = raft::curTimeMillis(); - - std::cout << "Took: " << (stop - start) << std::endl; - + state.counters.insert({{"total_queries", queries_processed.load()}}); state.SetItemsProcessed(queries_processed); - state.counters.insert({{"k", k}, {"n_queries", n_queries}}); if (cudart.found()) { state.counters.insert({{"GPU Time", gpu_timer.total_time() / state.iterations()}, - {"GPU QPS", queries_processed / gpu_timer.total_time()}}); + {"GPU QPS", queries_processed.load() / gpu_timer.total_time()}}); } if (state.skipped()) { return; } - // evaluate recall - if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors.move(MemoryType::Host); - - std::size_t rows = std::min(queries_processed, query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); - for (std::size_t i = 0; i < rows; i++) { - for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); - for (std::uint32_t l = 0; l < k; l++) { - auto exp_idx = gt[i * max_k + l]; - if (act_idx == exp_idx) { - match_count++; - break; + if (state.thread_index() == 0) { + state.counters.insert({{"k", k}, {"batch_size", n_queries}}); + // evaluate recall + if (dataset->max_k() >= k) { + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + buf neighbors_host = neighbors.move(MemoryType::Host); + + std::size_t rows = std::min(queries_processed.load(), query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); + for (std::size_t i = 0; i < rows; i++) { + for (std::uint32_t j = 0; j < k; j++) { + auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); + for (std::uint32_t l = 0; l < k; l++) { + auto exp_idx = gt[i * max_k + l]; + if (act_idx == exp_idx) { + match_count++; + break; + } } } } + double actual_recall = static_cast(match_count) / static_cast(total_count); + state.counters.insert({{"Recall", actual_recall}}); } - double actual_recall = static_cast(match_count) / static_cast(total_count); - state.counters.insert({{"Recall", actual_recall}}); } } @@ -408,10 +364,12 @@ void register_search(std::shared_ptr> dataset, for (std::size_t i = 0; i < index.search_params.size(); i++) { auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); - auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_search, dataset, index, i, metric_objective); - b->Unit(benchmark::kMillisecond); - b->UseRealTime(); + + auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search, index, i, dataset) + ->Unit(benchmark::kMillisecond) + ->ThreadRange(1, 16) + ->UseRealTime(); + std::cout << "Done registering index " << i << std::endl; } } } diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 19c5151186..a22cb41d46 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -56,7 +56,8 @@ class RaftCagra : public ANN { : ANN(metric, dim), index_params_(param), dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull), + handle_(cudaStreamPerThread) { rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); @@ -170,7 +171,7 @@ void RaftCagra::search( neighbors_IdxT, batch_size * k, raft::cast_op(), - resource::get_cuda_stream(handle_)); + raft::resource::get_cuda_stream(handle_)); } handle_.sync_stream(); diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 05858d0ce9..e0583d7708 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -121,7 +121,7 @@ def run_build_and_search( "--search", "--data_prefix=" + dataset_path, "--benchmark_counters_tabular", - "--benchmark_min_time=1x", + # "--benchmark_min_time=1x", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, "--benchmark_min_warmup_time=0.01", From 2d6e0ca3080d95a6c3c97a3b0b7ee93635efb37c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 00:14:12 -0400 Subject: [PATCH 05/19] Successfully having one thread load the data --- cpp/bench/ann/src/common/ann_types.hpp | 9 +- cpp/bench/ann/src/common/benchmark.hpp | 689 ++++++++++---------- cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 2 +- 3 files changed, 365 insertions(+), 335 deletions(-) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index f192d56e89..f82afb35e9 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -64,10 +64,17 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType } } -struct AlgoProperty { +class AlgoProperty { + public: + inline AlgoProperty() {} + inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_) + : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_) + { + } MemoryType dataset_memory_type; // neighbors/distances should have same memory type as queries MemoryType query_memory_type; + virtual ~AlgoProperty() = default; }; class AnnBase { diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 50d3ef00b1..5c755108af 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -28,6 +28,7 @@ #include "thread_pool.hpp" #include +#include #include #include #include @@ -42,6 +43,7 @@ namespace raft::bench::ann { static inline std::unique_ptr current_algo{nullptr}; +static inline std::shared_ptr current_algo_props{nullptr}; using kv_series = std::vector>>; @@ -170,10 +172,13 @@ template void bench_search(::benchmark::State& state, Configuration::Index index, std::size_t search_param_ix, - std::shared_ptr> dataset) + std::shared_ptr> dataset, + Objective metric_objective) { - std::ptrdiff_t batch_offset = 0; - std::atomic queries_processed(0); + std::ptrdiff_t batch_offset = 0; + std::size_t queries_processed = 0; + + double total_time = 0; const auto& sp_json = index.search_params[search_param_ix]; @@ -190,381 +195,381 @@ void bench_search(::benchmark::State& state, throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first."); return; } - // algo is static to cache it between close search runs to save time on index loading - static std::string index_file = ""; - if (index.file != index_file) { - current_algo.reset(); - index_file = index.file; - } - ANN* algo; // TODO: Just have one thread load this. - std::unique_ptr::AnnSearchParam> search_param; - try { - if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { - auto ualgo = ann::create_algo( - index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); - algo = ualgo.get(); - algo->load(index_file); - current_algo = std::move(ualgo); + /** + * Make sure the first thread loads the algo and dataset + */ + if (state.thread_index() == 0) { + // algo is static to cache it between close search runs to save time on index loading + static std::string index_file = ""; + if (index.file != index_file) { + current_algo.reset(); + index_file = index.file; } - search_param = ann::create_search_param(index.algo, sp_json); - // search_param->metric_objective = metric_objective; - } catch (const std::exception& e) { - throw std::runtime_error("Failed to create an algo: " + std::string(e.what())); - } - algo->set_search_param(*search_param); - const auto algo_property = parse_algo_property(algo->get_preference(), sp_json); - const T* query_set = dataset->query_set(algo_property.query_memory_type); - - // TODO: Have 1 thread create and load these. - buf distances{algo_property.query_memory_type, k * query_set_size}; - buf neighbors{algo_property.query_memory_type, k * query_set_size}; - - if (search_param->needs_dataset()) { + std::unique_ptr::AnnSearchParam> search_param; + ANN* algo; try { - algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type), - dataset->base_set_size()); - } catch (const std::exception& ex) { - throw std::runtime_error("The algorithm '" + index.name + - "' requires the base set, but it's not available. " + - "Exception: " + std::string(ex.what())); - return; + if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { + auto ualgo = ann::create_algo( + index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); + algo = ualgo.get(); + algo->load(index_file); + current_algo = std::move(ualgo); + } + search_param = ann::create_search_param(index.algo, sp_json); + search_param->metric_objective = metric_objective; + } catch (const std::exception& e) { + state.SkipWithError("Failed to create an algo: " + std::string(e.what())); + } + algo->set_search_param(*search_param); + auto algo_property = parse_algo_property(algo->get_preference(), sp_json); + current_algo_props = std::make_shared(algo_property.dataset_memory_type, + algo_property.query_memory_type); + if (search_param->needs_dataset()) { + try { + algo->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type), + dataset->base_set_size()); + } catch (const std::exception& ex) { + state.SkipWithError("The algorithm '" + index.name + + "' requires the base set, but it's not available. " + + "Exception: " + std::string(ex.what())); + return; + } } } + const auto algo_property = *current_algo_props; + const T* query_set = dataset->query_set(algo_property.query_memory_type); + + /** + * Each thread will manage its own outputs + */ + std::shared_ptr> distances = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); + std::shared_ptr> neighbors = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); + cuda_timer gpu_timer; { - /** - * When the objective is throughput, we want to overlap batches - * as much as possible and measure the end-to-end time from start - * to finish. - * - * When the objective is latency, we want to measure each batch - * individually. Latency is better measured in single-query batches - * but larger batches are still allowed in this mode in order to - * compare against the resulting batch sizes in throughput mode. - */ - nvtx_case nvtx{state.name()}; - // Multithreading starts in the benchmark loop for (auto _ : state) { [[maybe_unused]] auto ntx_lap = nvtx.lap(); [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + + ANN* algo = dynamic_cast*>(current_algo.get()); + auto start = std::chrono::high_resolution_clock::now(); // run the search try { algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, - neighbors.data + batch_offset * k, - distances.data + batch_offset * k, + neighbors->data + batch_offset * k, + distances->data + batch_offset * k, gpu_timer.stream()); } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = std::chrono::duration_cast>(end - start); // advance to the next batch batch_offset = (batch_offset + n_queries) % query_set_size; queries_processed += n_queries; + state.SetIterationTime(elapsed_seconds.count()); + total_time += elapsed_seconds.count(); } } - state.counters.insert({{"total_queries", queries_processed.load()}}); state.SetItemsProcessed(queries_processed); if (cudart.found()) { - state.counters.insert({{"GPU Time", gpu_timer.total_time() / state.iterations()}, - {"GPU QPS", queries_processed.load() / gpu_timer.total_time()}}); - } - if (state.skipped()) { return; } + state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}, - if (state.thread_index() == 0) { - state.counters.insert({{"k", k}, {"batch_size", n_queries}}); - // evaluate recall - if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors.move(MemoryType::Host); - - std::size_t rows = std::min(queries_processed.load(), query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); - for (std::size_t i = 0; i < rows; i++) { - for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); - for (std::uint32_t l = 0; l < k; l++) { - auto exp_idx = gt[i * max_k + l]; - if (act_idx == exp_idx) { - match_count++; - break; +// // Using gpu_timer.total_time() isn't really the most fair comparison. +// {"GPU QPS", queries_processed / total_time }}); + } + + // This will be the total number of queries across all threads + state.counters.insert({{"total_queries", queries_processed}}); + + if (state.skipped()) { + return; } + + if (state.thread_index() == 0) { + state.counters.insert({{"k", k}, {"n_queries", n_queries}}); + + // evaluate recall + if (dataset->max_k() >= k) { + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + buf neighbors_host = neighbors->move(MemoryType::Host); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); + for (std::size_t i = 0; i < rows; i++) { + for (std::uint32_t j = 0; j < k; j++) { + auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); + for (std::uint32_t l = 0; l < k; l++) { + auto exp_idx = gt[i * max_k + l]; + if (act_idx == exp_idx) { + match_count++; + break; + } } } } + double actual_recall = static_cast(match_count) / static_cast(total_count); + state.counters.insert({{"Recall", actual_recall}}); } - double actual_recall = static_cast(match_count) / static_cast(total_count); - state.counters.insert({{"Recall", actual_recall}}); } } -} -inline void printf_usage() -{ - ::benchmark::PrintDefaultHelp(); - fprintf( - stdout, - " [--build|--search] \n" - " [--overwrite]\n" - " [--data_prefix=]\n" - " [--index_prefix=]\n" - " [--override_kv=]\n" - " [--mode=\n" - " .json\n" - "\n" - "Note the non-standard benchmark parameters:\n" - " --build: build mode, will build index\n" - " --search: search mode, will search using the built index\n" - " one and only one of --build and --search should be specified\n" - " --overwrite: force overwriting existing index files\n" - " --data_prefix=:" - " prepend to dataset file paths specified in the .json (default = 'data/').\n" - " --index_prefix=:" - " prepend to index file paths specified in the .json (default = 'index/').\n" - " --override_kv=:" - " override a build/search key one or more times multiplying the number of configurations;" - " you can use this parameter multiple times to get the Cartesian product of benchmark" - " configs.\n" - " --mode=" - " run the benchmarks in latency (accumulate times spent in each batch) or " - " throughput (pipeline batches and measure end-to-end) mode\n"); -} - -template -void register_build(std::shared_ptr> dataset, - std::vector indices, - bool force_overwrite) -{ - for (auto index : indices) { - auto suf = static_cast(index.build_param["override_suffix"]); - auto file_suf = suf; - index.build_param.erase("override_suffix"); - std::replace(file_suf.begin(), file_suf.end(), '/', '-'); - index.file += file_suf; - auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_build, dataset, index, force_overwrite); - b->Unit(benchmark::kSecond); - b->UseRealTime(); + inline void printf_usage() + { + ::benchmark::PrintDefaultHelp(); + fprintf( + stdout, + " [--build|--search] \n" + " [--overwrite]\n" + " [--data_prefix=]\n" + " [--index_prefix=]\n" + " [--override_kv=]\n" + " [--mode=\n" + " .json\n" + "\n" + "Note the non-standard benchmark parameters:\n" + " --build: build mode, will build index\n" + " --search: search mode, will search using the built index\n" + " one and only one of --build and --search should be specified\n" + " --overwrite: force overwriting existing index files\n" + " --data_prefix=:" + " prepend to dataset file paths specified in the .json (default = " + "'data/').\n" + " --index_prefix=:" + " prepend to index file paths specified in the .json (default = " + "'index/').\n" + " --override_kv=:" + " override a build/search key one or more times multiplying the number of configurations;" + " you can use this parameter multiple times to get the Cartesian product of benchmark" + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n"); } -} -template -void register_search(std::shared_ptr> dataset, - std::vector indices, - Objective metric_objective) -{ - for (auto index : indices) { - for (std::size_t i = 0; i < index.search_params.size(); i++) { - auto suf = static_cast(index.search_params[i]["override_suffix"]); - index.search_params[i].erase("override_suffix"); - - auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search, index, i, dataset) - ->Unit(benchmark::kMillisecond) - ->ThreadRange(1, 16) - ->UseRealTime(); - std::cout << "Done registering index " << i << std::endl; - } + template + void register_build(std::shared_ptr> dataset, + std::vector indices, + bool force_overwrite) + { + for (auto index : indices) { + auto suf = static_cast(index.build_param["override_suffix"]); + auto file_suf = suf; + index.build_param.erase("override_suffix"); + std::replace(file_suf.begin(), file_suf.end(), '/', '-'); + index.file += file_suf; + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_build, dataset, index, force_overwrite); + b->Unit(benchmark::kSecond); + b->UseRealTime(); + } } -} -template -void dispatch_benchmark(const Configuration& conf, - bool force_overwrite, - bool build_mode, - bool search_mode, - std::string data_prefix, - std::string index_prefix, - kv_series override_kv, - Objective metric_objective) -{ - if (cudart.found()) { - for (auto [key, value] : cuda_info()) { - ::benchmark::AddCustomContext(key, value); - } - } - const auto dataset_conf = conf.get_dataset_conf(); - auto base_file = combine_path(data_prefix, dataset_conf.base_file); - auto query_file = combine_path(data_prefix, dataset_conf.query_file); - auto gt_file = dataset_conf.groundtruth_neighbors_file; - if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); } - auto dataset = std::make_shared>(dataset_conf.name, - base_file, - dataset_conf.subset_first_row, - dataset_conf.subset_size, - query_file, - dataset_conf.distance, - gt_file); - ::benchmark::AddCustomContext("dataset", dataset_conf.name); - ::benchmark::AddCustomContext("distance", dataset_conf.distance); - std::vector indices = conf.get_indices(); - if (build_mode) { - if (file_exists(base_file)) { - log_info("Using the dataset file '%s'", base_file.c_str()); - ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size())); - ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); - } else { - log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.", - base_file.c_str()); - } - std::vector more_indices{}; - for (auto& index : indices) { - for (auto param : apply_overrides(index.build_param, override_kv)) { - auto modified_index = index; - modified_index.build_param = param; - modified_index.file = combine_path(index_prefix, modified_index.file); - more_indices.push_back(modified_index); - } - } - register_build(dataset, more_indices, force_overwrite); - } else if (search_mode) { - if (file_exists(query_file)) { - log_info("Using the query file '%s'", query_file.c_str()); - ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size())); - ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); - if (gt_file.has_value()) { - if (file_exists(*gt_file)) { - log_info("Using the ground truth file '%s'", gt_file->c_str()); - ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k())); - } else { - log_warn("Ground truth file '%s' does not exist; the recall won't be reported.", - gt_file->c_str()); + template + void register_search(std::shared_ptr> dataset, + std::vector indices, + Objective metric_objective) + { + for (auto index : indices) { + for (std::size_t i = 0; i < index.search_params.size(); i++) { + auto suf = static_cast(index.search_params[i]["override_suffix"]); + index.search_params[i].erase("override_suffix"); + + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, metric_objective) + ->Unit(benchmark::kMillisecond) + ->ThreadRange(1, 32) + ->UseManualTime(); + } } - } else { - log_warn( - "Ground truth file is not provided; the recall won't be reported. NB: use " - "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the path to " - "the ground truth in your conf.json."); - } - } else { - log_warn("Query file '%s' does not exist; benchmarking search is impossible.", - query_file.c_str()); - } - for (auto& index : indices) { - index.search_params = apply_overrides(index.search_params, override_kv); - index.file = combine_path(index_prefix, index.file); - } - register_search(dataset, indices, metric_objective); } -} -inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool -{ - if (strcmp(arg, pat) == 0) { - result = true; - return true; + template + void dispatch_benchmark(const Configuration& conf, + bool force_overwrite, + bool build_mode, + bool search_mode, + std::string data_prefix, + std::string index_prefix, + kv_series override_kv, + Objective metric_objective) + { + if (cudart.found()) { + for (auto [key, value] : cuda_info()) { + ::benchmark::AddCustomContext(key, value); + } + } + const auto dataset_conf = conf.get_dataset_conf(); + auto base_file = combine_path(data_prefix, dataset_conf.base_file); + auto query_file = combine_path(data_prefix, dataset_conf.query_file); + auto gt_file = dataset_conf.groundtruth_neighbors_file; + if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); } + auto dataset = std::make_shared>(dataset_conf.name, + base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + query_file, + dataset_conf.distance, + gt_file); + ::benchmark::AddCustomContext("dataset", dataset_conf.name); + ::benchmark::AddCustomContext("distance", dataset_conf.distance); + std::vector indices = conf.get_indices(); + if (build_mode) { + if (file_exists(base_file)) { + log_info("Using the dataset file '%s'", base_file.c_str()); + ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + } else { + log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.", + base_file.c_str()); + } + std::vector more_indices{}; + for (auto& index : indices) { + for (auto param : apply_overrides(index.build_param, override_kv)) { + auto modified_index = index; + modified_index.build_param = param; + modified_index.file = combine_path(index_prefix, modified_index.file); + more_indices.push_back(modified_index); + } + } + register_build(dataset, more_indices, force_overwrite); + } else if (search_mode) { + if (file_exists(query_file)) { + log_info("Using the query file '%s'", query_file.c_str()); + ::benchmark::AddCustomContext("max_n_queries", + std::to_string(dataset->query_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + if (gt_file.has_value()) { + if (file_exists(*gt_file)) { + log_info("Using the ground truth file '%s'", gt_file->c_str()); + ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k())); + } else { + log_warn("Ground truth file '%s' does not exist; the recall won't be reported.", + gt_file->c_str()); + } + } else { + log_warn( + "Ground truth file is not provided; the recall won't be reported. NB: use " + "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the " + "path to " + "the ground truth in your conf.json."); + } + } else { + log_warn("Query file '%s' does not exist; benchmarking search is impossible.", + query_file.c_str()); + } + for (auto& index : indices) { + index.search_params = apply_overrides(index.search_params, override_kv); + index.file = combine_path(index_prefix, index.file); + } + register_search(dataset, indices, metric_objective); + } } - return false; -} -inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool -{ - auto n = strlen(pat); - if (strncmp(pat, arg, strlen(pat)) == 0) { - result = arg + n + 1; - return true; + inline auto parse_bool_flag(const char* arg, const char* pat, bool& result)->bool + { + if (strcmp(arg, pat) == 0) { + result = true; + return true; + } + return false; } - return false; -} -inline auto run_main(int argc, char** argv) -> int -{ - bool force_overwrite = false; - bool build_mode = false; - bool search_mode = false; - std::string data_prefix = "data"; - std::string index_prefix = "index"; - std::string new_override_kv = ""; - std::string mode = "latency"; - kv_series override_kv{}; - - char arg0_default[] = "benchmark"; // NOLINT - char* args_default = arg0_default; - if (!argv) { - argc = 1; - argv = &args_default; - } - if (argc == 1) { - printf_usage(); - return -1; + inline auto parse_string_flag(const char* arg, const char* pat, std::string& result)->bool + { + auto n = strlen(pat); + if (strncmp(pat, arg, strlen(pat)) == 0) { + result = arg + n + 1; + return true; + } + return false; } - char* conf_path = argv[--argc]; - std::ifstream conf_stream(conf_path); - - for (int i = 1; i < argc; i++) { - if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) || - parse_bool_flag(argv[i], "--build", build_mode) || - parse_bool_flag(argv[i], "--search", search_mode) || - parse_string_flag(argv[i], "--data_prefix", data_prefix) || - parse_string_flag(argv[i], "--index_prefix", index_prefix) || - parse_string_flag(argv[i], "--mode", mode) || - parse_string_flag(argv[i], "--override_kv", new_override_kv)) { - if (!new_override_kv.empty()) { - auto kvv = split(new_override_kv, ':'); - auto key = kvv[0]; - std::vector vals{}; - for (std::size_t j = 1; j < kvv.size(); j++) { - vals.push_back(nlohmann::json::parse(kvv[j])); + inline auto run_main(int argc, char** argv)->int + { + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + std::string data_prefix = "data"; + std::string index_prefix = "index"; + std::string new_override_kv = ""; + std::string mode = "latency"; + kv_series override_kv{}; + + char arg0_default[] = "benchmark"; // NOLINT + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + if (argc == 1) { + printf_usage(); + return -1; } - override_kv.emplace_back(key, vals); - new_override_kv = ""; - } - for (int j = i; j < argc - 1; j++) { - argv[j] = argv[j + 1]; - } - argc--; - i--; - } - } - Objective metric_objective = Objective::LATENCY; - if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } + char* conf_path = argv[--argc]; + std::ifstream conf_stream(conf_path); + + for (int i = 1; i < argc; i++) { + if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) || + parse_bool_flag(argv[i], "--build", build_mode) || + parse_bool_flag(argv[i], "--search", search_mode) || + parse_string_flag(argv[i], "--data_prefix", data_prefix) || + parse_string_flag(argv[i], "--index_prefix", index_prefix) || + parse_string_flag(argv[i], "--mode", mode) || + parse_string_flag(argv[i], "--override_kv", new_override_kv)) { + if (!new_override_kv.empty()) { + auto kvv = split(new_override_kv, ':'); + auto key = kvv[0]; + std::vector vals{}; + for (std::size_t j = 1; j < kvv.size(); j++) { + vals.push_back(nlohmann::json::parse(kvv[j])); + } + override_kv.emplace_back(key, vals); + new_override_kv = ""; + } + for (int j = i; j < argc - 1; j++) { + argv[j] = argv[j + 1]; + } + argc--; + i--; + } + } - if (build_mode == search_mode) { - log_error("One and only one of --build and --search should be specified"); - printf_usage(); - return -1; - } + Objective metric_objective = Objective::LATENCY; + if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } - if (!conf_stream) { - log_error("Can't open configuration file: %s", conf_path); - return -1; - } + if (build_mode == search_mode) { + log_error("One and only one of --build and --search should be specified"); + printf_usage(); + return -1; + } - if (cudart.needed() && !cudart.found()) { - log_warn("cudart library is not found, GPU-based indices won't work."); - } + if (!conf_stream) { + log_error("Can't open configuration file: %s", conf_path); + return -1; + } + + if (cudart.needed() && !cudart.found()) { + log_warn("cudart library is not found, GPU-based indices won't work."); + } - Configuration conf(conf_stream); - std::string dtype = conf.get_dataset_conf().dtype; - - if (dtype == "float") { - dispatch_benchmark(conf, - force_overwrite, - build_mode, - search_mode, - data_prefix, - index_prefix, - override_kv, - metric_objective); - } else if (dtype == "uint8") { - dispatch_benchmark(conf, - force_overwrite, - build_mode, - search_mode, - data_prefix, - index_prefix, - override_kv, - metric_objective); - } else if (dtype == "int8") { - dispatch_benchmark(conf, + Configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark(conf, force_overwrite, build_mode, search_mode, @@ -572,19 +577,37 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective); - } else { - log_error("datatype '%s' is not supported", dtype.c_str()); - return -1; - } + } else if (dtype == "uint8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); + } else if (dtype == "int8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); + } else { + log_error("datatype '%s' is not supported", dtype.c_str()); + return -1; + } - ::benchmark::Initialize(&argc, argv, printf_usage); - if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; - ::benchmark::RunSpecifiedBenchmarks(); - ::benchmark::Shutdown(); - // Release a possibly cached ANN object, so that it cannot be alive longer than the handle to a - // shared library it depends on (dynamic benchmark executable). - current_algo.reset(); - return 0; -} + ::benchmark::Initialize(&argc, argv, printf_usage); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + // Release a possibly cached ANN object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). + current_algo.reset(); + return 0; + } }; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 81c50f2c2e..66776ee3bc 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -65,7 +65,7 @@ class HnswLib : public ANN { using typename ANN::AnnSearchParam; struct SearchParam : public AnnSearchParam { int ef; - int num_threads = omp_get_num_procs(); + int num_threads = 1; }; HnswLib(Metric metric, int dim, const BuildParam& param); From 7c80266c9c5483a0fe64a0ac3ce2c9e96b3a9896 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 00:16:25 -0400 Subject: [PATCH 06/19] Fixing syntax error --- cpp/bench/ann/src/common/benchmark.hpp | 520 ++++++++++++------------- 1 file changed, 257 insertions(+), 263 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 5c755108af..bd987d4f23 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -284,52 +284,47 @@ void bench_search(::benchmark::State& state, state.SetItemsProcessed(queries_processed); if (cudart.found()) { - state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}, + state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); + } -// // Using gpu_timer.total_time() isn't really the most fair comparison. -// {"GPU QPS", queries_processed / total_time }}); - } + // This will be the total number of queries across all threads + state.counters.insert({{"total_queries", queries_processed}}); - // This will be the total number of queries across all threads - state.counters.insert({{"total_queries", queries_processed}}); - - if (state.skipped()) { - return; } - - if (state.thread_index() == 0) { - state.counters.insert({{"k", k}, {"n_queries", n_queries}}); - - // evaluate recall - if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors->move(MemoryType::Host); - std::size_t rows = std::min(queries_processed, query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); - for (std::size_t i = 0; i < rows; i++) { - for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); - for (std::uint32_t l = 0; l < k; l++) { - auto exp_idx = gt[i * max_k + l]; - if (act_idx == exp_idx) { - match_count++; - break; - } + if (state.skipped()) { return; } + + if (state.thread_index() == 0) { + state.counters.insert({{"k", k}, {"n_queries", n_queries}}); + + // evaluate recall + if (dataset->max_k() >= k) { + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + buf neighbors_host = neighbors->move(MemoryType::Host); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); + for (std::size_t i = 0; i < rows; i++) { + for (std::uint32_t j = 0; j < k; j++) { + auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); + for (std::uint32_t l = 0; l < k; l++) { + auto exp_idx = gt[i * max_k + l]; + if (act_idx == exp_idx) { + match_count++; + break; } } } - double actual_recall = static_cast(match_count) / static_cast(total_count); - state.counters.insert({{"Recall", actual_recall}}); } + double actual_recall = static_cast(match_count) / static_cast(total_count); + state.counters.insert({{"Recall", actual_recall}}); } } +} - inline void printf_usage() - { - ::benchmark::PrintDefaultHelp(); - fprintf( - stdout, +inline void printf_usage() +{ + ::benchmark::PrintDefaultHelp(); + fprintf(stdout, " [--build|--search] \n" " [--overwrite]\n" " [--data_prefix=]\n" @@ -356,220 +351,237 @@ void bench_search(::benchmark::State& state, " --mode=" " run the benchmarks in latency (accumulate times spent in each batch) or " " throughput (pipeline batches and measure end-to-end) mode\n"); - } +} - template - void register_build(std::shared_ptr> dataset, - std::vector indices, - bool force_overwrite) - { - for (auto index : indices) { - auto suf = static_cast(index.build_param["override_suffix"]); - auto file_suf = suf; - index.build_param.erase("override_suffix"); - std::replace(file_suf.begin(), file_suf.end(), '/', '-'); - index.file += file_suf; - auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_build, dataset, index, force_overwrite); - b->Unit(benchmark::kSecond); - b->UseRealTime(); - } +template +void register_build(std::shared_ptr> dataset, + std::vector indices, + bool force_overwrite) +{ + for (auto index : indices) { + auto suf = static_cast(index.build_param["override_suffix"]); + auto file_suf = suf; + index.build_param.erase("override_suffix"); + std::replace(file_suf.begin(), file_suf.end(), '/', '-'); + index.file += file_suf; + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_build, dataset, index, force_overwrite); + b->Unit(benchmark::kSecond); + b->UseRealTime(); } +} - template - void register_search(std::shared_ptr> dataset, - std::vector indices, - Objective metric_objective) - { - for (auto index : indices) { - for (std::size_t i = 0; i < index.search_params.size(); i++) { - auto suf = static_cast(index.search_params[i]["override_suffix"]); - index.search_params[i].erase("override_suffix"); - - auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_search, index, i, dataset, metric_objective) - ->Unit(benchmark::kMillisecond) - ->ThreadRange(1, 32) - ->UseManualTime(); - } - } +template +void register_search(std::shared_ptr> dataset, + std::vector indices, + Objective metric_objective) +{ + for (auto index : indices) { + for (std::size_t i = 0; i < index.search_params.size(); i++) { + auto suf = static_cast(index.search_params[i]["override_suffix"]); + index.search_params[i].erase("override_suffix"); + + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, metric_objective) + ->Unit(benchmark::kMillisecond) + ->ThreadRange(1, 32) + ->UseManualTime(); + } } +} - template - void dispatch_benchmark(const Configuration& conf, - bool force_overwrite, - bool build_mode, - bool search_mode, - std::string data_prefix, - std::string index_prefix, - kv_series override_kv, - Objective metric_objective) - { - if (cudart.found()) { - for (auto [key, value] : cuda_info()) { - ::benchmark::AddCustomContext(key, value); - } - } - const auto dataset_conf = conf.get_dataset_conf(); - auto base_file = combine_path(data_prefix, dataset_conf.base_file); - auto query_file = combine_path(data_prefix, dataset_conf.query_file); - auto gt_file = dataset_conf.groundtruth_neighbors_file; - if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); } - auto dataset = std::make_shared>(dataset_conf.name, - base_file, - dataset_conf.subset_first_row, - dataset_conf.subset_size, - query_file, - dataset_conf.distance, - gt_file); - ::benchmark::AddCustomContext("dataset", dataset_conf.name); - ::benchmark::AddCustomContext("distance", dataset_conf.distance); - std::vector indices = conf.get_indices(); - if (build_mode) { - if (file_exists(base_file)) { - log_info("Using the dataset file '%s'", base_file.c_str()); - ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size())); - ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); - } else { - log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.", - base_file.c_str()); - } - std::vector more_indices{}; - for (auto& index : indices) { - for (auto param : apply_overrides(index.build_param, override_kv)) { - auto modified_index = index; - modified_index.build_param = param; - modified_index.file = combine_path(index_prefix, modified_index.file); - more_indices.push_back(modified_index); - } - } - register_build(dataset, more_indices, force_overwrite); - } else if (search_mode) { - if (file_exists(query_file)) { - log_info("Using the query file '%s'", query_file.c_str()); - ::benchmark::AddCustomContext("max_n_queries", - std::to_string(dataset->query_set_size())); - ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); - if (gt_file.has_value()) { - if (file_exists(*gt_file)) { - log_info("Using the ground truth file '%s'", gt_file->c_str()); - ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k())); - } else { - log_warn("Ground truth file '%s' does not exist; the recall won't be reported.", - gt_file->c_str()); - } - } else { - log_warn( - "Ground truth file is not provided; the recall won't be reported. NB: use " - "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the " - "path to " - "the ground truth in your conf.json."); - } - } else { - log_warn("Query file '%s' does not exist; benchmarking search is impossible.", - query_file.c_str()); - } - for (auto& index : indices) { - index.search_params = apply_overrides(index.search_params, override_kv); - index.file = combine_path(index_prefix, index.file); - } - register_search(dataset, indices, metric_objective); +template +void dispatch_benchmark(const Configuration& conf, + bool force_overwrite, + bool build_mode, + bool search_mode, + std::string data_prefix, + std::string index_prefix, + kv_series override_kv, + Objective metric_objective) +{ + if (cudart.found()) { + for (auto [key, value] : cuda_info()) { + ::benchmark::AddCustomContext(key, value); + } + } + const auto dataset_conf = conf.get_dataset_conf(); + auto base_file = combine_path(data_prefix, dataset_conf.base_file); + auto query_file = combine_path(data_prefix, dataset_conf.query_file); + auto gt_file = dataset_conf.groundtruth_neighbors_file; + if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); } + auto dataset = std::make_shared>(dataset_conf.name, + base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + query_file, + dataset_conf.distance, + gt_file); + ::benchmark::AddCustomContext("dataset", dataset_conf.name); + ::benchmark::AddCustomContext("distance", dataset_conf.distance); + std::vector indices = conf.get_indices(); + if (build_mode) { + if (file_exists(base_file)) { + log_info("Using the dataset file '%s'", base_file.c_str()); + ::benchmark::AddCustomContext("n_records", std::to_string(dataset->base_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + } else { + log_warn("Dataset file '%s' does not exist; benchmarking index building is impossible.", + base_file.c_str()); + } + std::vector more_indices{}; + for (auto& index : indices) { + for (auto param : apply_overrides(index.build_param, override_kv)) { + auto modified_index = index; + modified_index.build_param = param; + modified_index.file = combine_path(index_prefix, modified_index.file); + more_indices.push_back(modified_index); + } + } + register_build(dataset, more_indices, force_overwrite); + } else if (search_mode) { + if (file_exists(query_file)) { + log_info("Using the query file '%s'", query_file.c_str()); + ::benchmark::AddCustomContext("max_n_queries", std::to_string(dataset->query_set_size())); + ::benchmark::AddCustomContext("dim", std::to_string(dataset->dim())); + if (gt_file.has_value()) { + if (file_exists(*gt_file)) { + log_info("Using the ground truth file '%s'", gt_file->c_str()); + ::benchmark::AddCustomContext("max_k", std::to_string(dataset->max_k())); + } else { + log_warn("Ground truth file '%s' does not exist; the recall won't be reported.", + gt_file->c_str()); } + } else { + log_warn( + "Ground truth file is not provided; the recall won't be reported. NB: use " + "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the " + "path to " + "the ground truth in your conf.json."); + } + } else { + log_warn("Query file '%s' does not exist; benchmarking search is impossible.", + query_file.c_str()); + } + for (auto& index : indices) { + index.search_params = apply_overrides(index.search_params, override_kv); + index.file = combine_path(index_prefix, index.file); + } + register_search(dataset, indices, metric_objective); } +} - inline auto parse_bool_flag(const char* arg, const char* pat, bool& result)->bool - { - if (strcmp(arg, pat) == 0) { - result = true; - return true; - } - return false; +inline auto parse_bool_flag(const char* arg, const char* pat, bool& result) -> bool +{ + if (strcmp(arg, pat) == 0) { + result = true; + return true; } + return false; +} - inline auto parse_string_flag(const char* arg, const char* pat, std::string& result)->bool - { - auto n = strlen(pat); - if (strncmp(pat, arg, strlen(pat)) == 0) { - result = arg + n + 1; - return true; - } - return false; +inline auto parse_string_flag(const char* arg, const char* pat, std::string& result) -> bool +{ + auto n = strlen(pat); + if (strncmp(pat, arg, strlen(pat)) == 0) { + result = arg + n + 1; + return true; } + return false; +} - inline auto run_main(int argc, char** argv)->int - { - bool force_overwrite = false; - bool build_mode = false; - bool search_mode = false; - std::string data_prefix = "data"; - std::string index_prefix = "index"; - std::string new_override_kv = ""; - std::string mode = "latency"; - kv_series override_kv{}; - - char arg0_default[] = "benchmark"; // NOLINT - char* args_default = arg0_default; - if (!argv) { - argc = 1; - argv = &args_default; - } - if (argc == 1) { - printf_usage(); - return -1; - } +inline auto run_main(int argc, char** argv) -> int +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + std::string data_prefix = "data"; + std::string index_prefix = "index"; + std::string new_override_kv = ""; + std::string mode = "latency"; + kv_series override_kv{}; + + char arg0_default[] = "benchmark"; // NOLINT + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + if (argc == 1) { + printf_usage(); + return -1; + } - char* conf_path = argv[--argc]; - std::ifstream conf_stream(conf_path); - - for (int i = 1; i < argc; i++) { - if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) || - parse_bool_flag(argv[i], "--build", build_mode) || - parse_bool_flag(argv[i], "--search", search_mode) || - parse_string_flag(argv[i], "--data_prefix", data_prefix) || - parse_string_flag(argv[i], "--index_prefix", index_prefix) || - parse_string_flag(argv[i], "--mode", mode) || - parse_string_flag(argv[i], "--override_kv", new_override_kv)) { - if (!new_override_kv.empty()) { - auto kvv = split(new_override_kv, ':'); - auto key = kvv[0]; - std::vector vals{}; - for (std::size_t j = 1; j < kvv.size(); j++) { - vals.push_back(nlohmann::json::parse(kvv[j])); - } - override_kv.emplace_back(key, vals); - new_override_kv = ""; - } - for (int j = i; j < argc - 1; j++) { - argv[j] = argv[j + 1]; - } - argc--; - i--; - } + char* conf_path = argv[--argc]; + std::ifstream conf_stream(conf_path); + + for (int i = 1; i < argc; i++) { + if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) || + parse_bool_flag(argv[i], "--build", build_mode) || + parse_bool_flag(argv[i], "--search", search_mode) || + parse_string_flag(argv[i], "--data_prefix", data_prefix) || + parse_string_flag(argv[i], "--index_prefix", index_prefix) || + parse_string_flag(argv[i], "--mode", mode) || + parse_string_flag(argv[i], "--override_kv", new_override_kv)) { + if (!new_override_kv.empty()) { + auto kvv = split(new_override_kv, ':'); + auto key = kvv[0]; + std::vector vals{}; + for (std::size_t j = 1; j < kvv.size(); j++) { + vals.push_back(nlohmann::json::parse(kvv[j])); } + override_kv.emplace_back(key, vals); + new_override_kv = ""; + } + for (int j = i; j < argc - 1; j++) { + argv[j] = argv[j + 1]; + } + argc--; + i--; + } + } - Objective metric_objective = Objective::LATENCY; - if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } - - if (build_mode == search_mode) { - log_error("One and only one of --build and --search should be specified"); - printf_usage(); - return -1; - } + Objective metric_objective = Objective::LATENCY; + if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } - if (!conf_stream) { - log_error("Can't open configuration file: %s", conf_path); - return -1; - } + if (build_mode == search_mode) { + log_error("One and only one of --build and --search should be specified"); + printf_usage(); + return -1; + } - if (cudart.needed() && !cudart.found()) { - log_warn("cudart library is not found, GPU-based indices won't work."); - } + if (!conf_stream) { + log_error("Can't open configuration file: %s", conf_path); + return -1; + } - Configuration conf(conf_stream); - std::string dtype = conf.get_dataset_conf().dtype; + if (cudart.needed() && !cudart.found()) { + log_warn("cudart library is not found, GPU-based indices won't work."); + } - if (dtype == "float") { - dispatch_benchmark(conf, + Configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); + } else if (dtype == "uint8") { + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective); + } else if (dtype == "int8") { + dispatch_benchmark(conf, force_overwrite, build_mode, search_mode, @@ -577,37 +589,19 @@ void bench_search(::benchmark::State& state, index_prefix, override_kv, metric_objective); - } else if (dtype == "uint8") { - dispatch_benchmark(conf, - force_overwrite, - build_mode, - search_mode, - data_prefix, - index_prefix, - override_kv, - metric_objective); - } else if (dtype == "int8") { - dispatch_benchmark(conf, - force_overwrite, - build_mode, - search_mode, - data_prefix, - index_prefix, - override_kv, - metric_objective); - } else { - log_error("datatype '%s' is not supported", dtype.c_str()); - return -1; - } - - ::benchmark::Initialize(&argc, argv, printf_usage); - if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; - ::benchmark::RunSpecifiedBenchmarks(); - ::benchmark::Shutdown(); - // Release a possibly cached ANN object, so that it cannot be alive longer than the handle - // to a shared library it depends on (dynamic benchmark executable). - current_algo.reset(); - return 0; + } else { + log_error("datatype '%s' is not supported", dtype.c_str()); + return -1; } + ::benchmark::Initialize(&argc, argv, printf_usage); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + // Release a possibly cached ANN object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). + current_algo.reset(); + return 0; +} + }; // namespace raft::bench::ann From adf2f44653bc3ea9e156956fa792353b89bdb6ac Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 01:02:58 -0400 Subject: [PATCH 07/19] Minor things --- cpp/bench/ann/src/common/benchmark.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index bd987d4f23..9a58a5f6eb 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -254,12 +254,13 @@ void bench_search(::benchmark::State& state, { nvtx_case nvtx{state.name()}; + // TODO: Have the odd threads load the queries backwards just to rule out caching. + ANN* algo = dynamic_cast*>(current_algo.get()); for (auto _ : state) { [[maybe_unused]] auto ntx_lap = nvtx.lap(); [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); - ANN* algo = dynamic_cast*>(current_algo.get()); - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::high_resolution_clock::now(); // run the search try { algo->search(query_set + batch_offset * dataset->dim(), @@ -272,7 +273,8 @@ void bench_search(::benchmark::State& state, state.SkipWithError(std::string(e.what())); } - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = std::chrono::duration_cast>(end - start); // advance to the next batch batch_offset = (batch_offset + n_queries) % query_set_size; @@ -292,7 +294,8 @@ void bench_search(::benchmark::State& state, if (state.skipped()) { return; } - if (state.thread_index() == 0) { + // Use the last thread as a sanity check that all the threads are working. + if (state.thread_index() == state.threads() - 1) { state.counters.insert({{"k", k}, {"n_queries", n_queries}}); // evaluate recall From 89786ff02f22ba26bb583d93e3479f51cfe488c2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 09:38:30 -0400 Subject: [PATCH 08/19] Removing cudart_utils from benchmark.hpp --- cpp/bench/ann/src/common/benchmark.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 9a58a5f6eb..4059e723bf 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -20,8 +20,6 @@ #include "dataset.hpp" #include "util.hpp" -#include - #include #include From 4573050c3b087b7584e936fced154ade64aaa4d4 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 11:39:33 -0400 Subject: [PATCH 09/19] Removing all cuda includes from benchmark.hpp --- cpp/bench/ann/src/common/benchmark.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 4059e723bf..a83adffdef 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -22,9 +22,6 @@ #include -#include - -#include "thread_pool.hpp" #include #include #include @@ -382,11 +379,14 @@ void register_search(std::shared_ptr> dataset, auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); + int max_threads = + metric_objective == Objective::THROUGHPUT ? std::thread::hardware_concurrency() : 1; + auto* b = ::benchmark::RegisterBenchmark( index.name + suf, bench_search, index, i, dataset, metric_objective) ->Unit(benchmark::kMillisecond) - ->ThreadRange(1, 32) - ->UseManualTime(); + ->UseManualTime() + ->ThreadRange(1, max_threads); } } } From 392f6a19452bf20d448b7f2c0ce3b571ebb46a96 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 16:19:04 -0400 Subject: [PATCH 10/19] Adding `end_to_end` time and verifying the total_queries / end_to_end ~ items_per_second.:qqx --- cpp/bench/ann/src/common/benchmark.hpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index a83adffdef..7b9f8e4208 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -245,6 +245,7 @@ void bench_search(::benchmark::State& state, std::shared_ptr> neighbors = std::make_shared>(algo_property.query_memory_type, k * query_set_size); + auto start = std::chrono::high_resolution_clock::now(); cuda_timer gpu_timer; { nvtx_case nvtx{state.name()}; @@ -278,7 +279,11 @@ void bench_search(::benchmark::State& state, total_time += elapsed_seconds.count(); } } - + auto end = std::chrono::high_resolution_clock::now(); + if (state.thread_index() == 0) { + auto duration = std::chrono::duration_cast>(end - start).count(); + state.counters.insert({{"end_to_end", duration}}); + } state.SetItemsProcessed(queries_processed); if (cudart.found()) { state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); @@ -291,7 +296,7 @@ void bench_search(::benchmark::State& state, // Use the last thread as a sanity check that all the threads are working. if (state.thread_index() == state.threads() - 1) { - state.counters.insert({{"k", k}, {"n_queries", n_queries}}); + // state.counters.insert({{"k", k}, {"n_queries", n_queries}}); // evaluate recall if (dataset->max_k() >= k) { @@ -385,8 +390,17 @@ void register_search(std::shared_ptr> dataset, auto* b = ::benchmark::RegisterBenchmark( index.name + suf, bench_search, index, i, dataset, metric_objective) ->Unit(benchmark::kMillisecond) - ->UseManualTime() - ->ThreadRange(1, max_threads); + ->ThreadRange(1, max_threads) + + /** + * The following are important for getting accuracy QPS measurements on both CPU + * and GPU These make sure that + * - `items_per_second` ~ (`total_queries` / `end_to_end`) + * - `end_to_end` ~ (`Time` * `Iterations`) + * - + */ + ->MeasureProcessCPUTime() + ->UseRealTime() } } } From 18b57ccd8f02da0bf49ed269c6a111a5abf03ff6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 23 Oct 2023 18:05:35 -0400 Subject: [PATCH 11/19] More updates --- cpp/bench/ann/src/common/benchmark.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 7b9f8e4208..4feb2ab849 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -246,15 +246,15 @@ void bench_search(::benchmark::State& state, std::make_shared>(algo_property.query_memory_type, k * query_set_size); auto start = std::chrono::high_resolution_clock::now(); - cuda_timer gpu_timer; + // cuda_timer gpu_timer; { - nvtx_case nvtx{state.name()}; + // nvtx_case nvtx{state.name()}; // TODO: Have the odd threads load the queries backwards just to rule out caching. ANN* algo = dynamic_cast*>(current_algo.get()); for (auto _ : state) { - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + // [[maybe_unused]] auto ntx_lap = nvtx.lap(); + // [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); auto start = std::chrono::high_resolution_clock::now(); // run the search @@ -264,7 +264,7 @@ void bench_search(::benchmark::State& state, k, neighbors->data + batch_offset * k, distances->data + batch_offset * k, - gpu_timer.stream()); + cudaStreamPerThread); } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } @@ -285,9 +285,9 @@ void bench_search(::benchmark::State& state, state.counters.insert({{"end_to_end", duration}}); } state.SetItemsProcessed(queries_processed); - if (cudart.found()) { - state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); - } + // if (cudart.found()) { + // state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); + // } // This will be the total number of queries across all threads state.counters.insert({{"total_queries", queries_processed}}); @@ -395,12 +395,12 @@ void register_search(std::shared_ptr> dataset, /** * The following are important for getting accuracy QPS measurements on both CPU * and GPU These make sure that - * - `items_per_second` ~ (`total_queries` / `end_to_end`) * - `end_to_end` ~ (`Time` * `Iterations`) - * - + * - `items_per_second` ~ (`total_queries` / `end_to_end`) + * - `Time` = `end_to_end` / `Iterations` */ ->MeasureProcessCPUTime() - ->UseRealTime() + ->UseRealTime(); } } } From bda392f7bce440de9c121eeae2a0fa0d4c3c0328 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 25 Oct 2023 19:53:34 -0400 Subject: [PATCH 12/19] Removing cudaStreamPerThread from benchmark.hpp --- cpp/bench/ann/CMakeLists.txt | 18 +++++++----------- cpp/bench/ann/src/common/benchmark.hpp | 23 ++++++++++++----------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 1f98cdbe88..d6a5fddb98 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,22 +15,18 @@ # ################################################################################################## # * benchmark options ------------------------------------------------------------------------------ -option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" - OFF -) -option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" OFF) -option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT - "Include faiss' cpu brute-force knn algorithm in benchmark" OFF -) -option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" - OFF + "Include faiss' cpu brute-force knn algorithm in benchmark" ON ) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark" - OFF + ON ) -option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" OFF) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 4feb2ab849..b1c63b26d6 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -187,7 +187,7 @@ void bench_search(::benchmark::State& state, const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries; if (!file_exists(index.file)) { - throw std::runtime_error("Index file is missing. Run the benchmark in the build mode first."); + state.SkipWithError("Index file is missing. Run the benchmark in the build mode first."); return; } @@ -246,15 +246,15 @@ void bench_search(::benchmark::State& state, std::make_shared>(algo_property.query_memory_type, k * query_set_size); auto start = std::chrono::high_resolution_clock::now(); - // cuda_timer gpu_timer; + cuda_timer gpu_timer; { - // nvtx_case nvtx{state.name()}; + nvtx_case nvtx{state.name()}; // TODO: Have the odd threads load the queries backwards just to rule out caching. ANN* algo = dynamic_cast*>(current_algo.get()); for (auto _ : state) { - // [[maybe_unused]] auto ntx_lap = nvtx.lap(); - // [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); auto start = std::chrono::high_resolution_clock::now(); // run the search @@ -264,7 +264,7 @@ void bench_search(::benchmark::State& state, k, neighbors->data + batch_offset * k, distances->data + batch_offset * k, - cudaStreamPerThread); + gpu_timer.stream()); } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } @@ -285,9 +285,9 @@ void bench_search(::benchmark::State& state, state.counters.insert({{"end_to_end", duration}}); } state.SetItemsProcessed(queries_processed); - // if (cudart.found()) { - // state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); - // } + if (cudart.found()) { + state.counters.insert({{"GPU", gpu_timer.total_time() / double(state.iterations())}}); + } // This will be the total number of queries across all threads state.counters.insert({{"total_queries", queries_processed}}); @@ -296,8 +296,6 @@ void bench_search(::benchmark::State& state, // Use the last thread as a sanity check that all the threads are working. if (state.thread_index() == state.threads() - 1) { - // state.counters.insert({{"k", k}, {"n_queries", n_queries}}); - // evaluate recall if (dataset->max_k() >= k) { const std::int32_t* gt = dataset->gt_set(); @@ -398,6 +396,9 @@ void register_search(std::shared_ptr> dataset, * - `end_to_end` ~ (`Time` * `Iterations`) * - `items_per_second` ~ (`total_queries` / `end_to_end`) * - `Time` = `end_to_end` / `Iterations` + * + * - Latency = `Time` + * - Throughput = `items_per_second` */ ->MeasureProcessCPUTime() ->UseRealTime(); From a21f66876e5bcf0bfe82d1a8e29504890bbf1a70 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 11:57:02 -0400 Subject: [PATCH 13/19] Removing thread pool. It's not being used --- cpp/bench/ann/src/common/thread_pool.hpp | 65 ------------------------ 1 file changed, 65 deletions(-) diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp index c28b315919..7eb6d88b96 100644 --- a/cpp/bench/ann/src/common/thread_pool.hpp +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -133,68 +133,3 @@ class FixedThreadPool { std::vector threads_; std::atomic finished_{false}; }; - -class ThreadPool { - public: - ThreadPool(unsigned num_threads = std::thread::hardware_concurrency()) - { - if (num_threads <= 0) { return; } - while (num_threads--) { - threads.emplace_back([this] { - while (true) { - std::unique_lock lock(mutex); - condvar.wait(lock, [this] { return !queue.empty(); }); - auto task = std::move(queue.front()); - if (task.valid()) { - queue.pop(); - lock.unlock(); - // run the task - this cannot throw; any exception - // will be stored in the corresponding future - task(); - } else { - // an empty task is used to signal end of stream - // don't pop it off the top; all threads need to see it - break; - } - } - }); - } - } - - template > - std::future run(F&& f) const - { - auto task = std::packaged_task(std::forward(f)); - auto future = task.get_future(); - { - std::lock_guard lock(mutex); - // conversion to packaged_task erases the return type - // so it can be stored in the queue. the future will still - // contain the correct type - queue.push(std::packaged_task(std::move(task))); - } - condvar.notify_one(); - return future; - } - - ~ThreadPool() - { - // push a single empty task onto the queue and notify all threads, - // then wait for them to terminate - - { - std::lock_guard lock(mutex); - queue.push({}); - } - condvar.notify_all(); - for (auto& thread : threads) { - thread.join(); - } - } - - private: - std::vector threads; - mutable std::queue> queue; - mutable std::mutex mutex; - mutable std::condition_variable condvar; -}; From f592eb60da5c2563e399766f45a9d884faea911e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 12:42:25 -0400 Subject: [PATCH 14/19] Adding new mode to docs --- cpp/bench/ann/src/common/benchmark.hpp | 1 - cpp/bench/ann/src/common/thread_pool.hpp | 2 -- docs/source/raft_ann_benchmarks.md | 4 ++++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index b1c63b26d6..ec52013919 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp index 7eb6d88b96..c01fa2c32c 100644 --- a/cpp/bench/ann/src/common/thread_pool.hpp +++ b/cpp/bench/ann/src/common/thread_pool.hpp @@ -16,12 +16,10 @@ #pragma once #include -#include #include #include #include #include -#include #include #include #include diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index fadca595fb..4cbdef2652 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -141,6 +141,10 @@ options: run only comma separated list of named algorithms (default: None) --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) -f, --force re-run algorithms even if their results already exist (default: False) + -m MODE, --search_mode MODE + run search in 'latency' (measure individual batches) or + 'throughput' (pipeline batches and measure end-to-end) mode. + (default: 'latency') ``` `configuration` and `dataset` : `configuration` is a path to a configuration file for a given dataset. From ab6e0d03fd569a877e4c1beae5fc25fec27a4480 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 12:43:01 -0400 Subject: [PATCH 15/19] Fixing "search-mode" option --- docs/source/raft_ann_benchmarks.md | 2 +- python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 4cbdef2652..1e3958b30f 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -141,7 +141,7 @@ options: run only comma separated list of named algorithms (default: None) --indices INDICES run only comma separated list of named indices. parameter `algorithms` is ignored (default: None) -f, --force re-run algorithms even if their results already exist (default: False) - -m MODE, --search_mode MODE + -m MODE, --search-mode MODE run search in 'latency' (measure individual batches) or 'throughput' (pipeline batches and measure end-to-end) mode. (default: 'latency') diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index e0583d7708..75b7734349 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -216,7 +216,7 @@ def main(): parser.add_argument( "-m", - "--search_mode", + "--search-mode", help="run search in 'latency' (measure individual batches) or " "'throughput' (pipeline batches and measure end-to-end) mode", default="throughput", From aec2664d48fb3b045f888541bf64a5e1e63f9de1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 13:28:01 -0400 Subject: [PATCH 16/19] Updates based on review feedback --- cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 5 +++-- cpp/bench/ann/src/raft/raft_cagra_wrapper.h | 11 ++--------- cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h | 13 +++---------- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 14 ++------------ 4 files changed, 10 insertions(+), 33 deletions(-) diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 66776ee3bc..23cae6352c 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -164,8 +164,9 @@ void HnswLib::set_search_param(const AnnSearchParam& param_) appr_alg_->ef_ = param.ef; metric_objective_ = param.metric_objective; - if (metric_objective_ != Objective::LATENCY && - (!thread_pool_ || num_threads_ != param.num_threads)) { + bool use_pool = (metric_objective_ == Objective::LATENCY && param.num_threads > 1) && + (!thread_pool_ || num_threads_ != param.num_threads); + if (use_pool) { num_threads_ = param.num_threads; thread_pool_ = std::make_unique(num_threads_); } diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index a22cb41d46..f1c8154b7c 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -53,18 +53,13 @@ class RaftCagra : public ANN { using BuildParam = raft::neighbors::cagra::index_params; RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) - : ANN(metric, dim), - index_params_(param), - dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull), - handle_(cudaStreamPerThread) + : ANN(metric, dim), index_params_(param), dimension_(dim), handle_(cudaStreamPerThread) { - rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftCagra() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + ~RaftCagra() noexcept {} void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -93,8 +88,6 @@ class RaftCagra : public ANN { void load(const std::string&) override; private: - // `mr_` must go first to make sure it dies last - rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; BuildParam index_params_; raft::neighbors::cagra::search_params search_params_; diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index b6df7de068..24b3c69bb6 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -52,18 +52,14 @@ class RaftIvfFlatGpu : public ANN { using BuildParam = raft::neighbors::ivf_flat::index_params; RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) - : ANN(metric, dim), - index_params_(param), - dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + : ANN(metric, dim), index_params_(param), dimension_(dim) { index_params_.metric = parse_metric_type(metric); index_params_.conservative_memory_allocation = true; - rmm::mr::set_current_device_resource(&mr_); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftIvfFlatGpu() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + ~RaftIvfFlatGpu() noexcept {} void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -90,8 +86,6 @@ class RaftIvfFlatGpu : public ANN { void load(const std::string&) override; private: - // `mr_` must go first to make sure it dies last - rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; BuildParam index_params_; raft::neighbors::ivf_flat::search_params search_params_; @@ -134,10 +128,9 @@ template void RaftIvfFlatGpu::search( const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const { - rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); raft::neighbors::ivf_flat::search( - handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr); + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); resource::sync_stream(handle_); return; } diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 1b74dcf975..e4004b0007 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -54,22 +54,14 @@ class RaftIvfPQ : public ANN { using BuildParam = raft::neighbors::ivf_pq::index_params; RaftIvfPQ(Metric metric, int dim, const BuildParam& param) - : ANN(metric, dim), - index_params_(param), - dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + : ANN(metric, dim), index_params_(param), dimension_(dim) { - rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); } - ~RaftIvfPQ() noexcept - { - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); - rmm::mr::set_current_device_resource(mr_.get_upstream()); - } + ~RaftIvfPQ() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -97,8 +89,6 @@ class RaftIvfPQ : public ANN { void load(const std::string&) override; private: - // `mr_` must go first to make sure it dies last - rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; cudaEvent_t sync_{nullptr}; BuildParam index_params_; From 3b8eb2262e2f4dcc175b959152464db9af26b345 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 13:28:33 -0400 Subject: [PATCH 17/19] Apply suggestions from code review Co-authored-by: Ben Frederickson --- cpp/bench/ann/src/common/ann_types.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index f82afb35e9..2c1105a272 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -26,7 +26,7 @@ namespace raft::bench::ann { enum Objective { THROUGHPUT, // See how many vectors we can push through - LATENCY // See how many + LATENCY // See how fast we can push a vector through }; enum class MemoryType { From 92f140320121427d3d51f921863fd8a96e5330cd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 14:06:00 -0400 Subject: [PATCH 18/19] Update python/raft-ann-bench/src/raft-ann-bench/run/__main__.py Co-authored-by: Ben Frederickson --- python/raft-ann-bench/src/raft-ann-bench/run/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 75b7734349..32517494cb 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -121,7 +121,6 @@ def run_build_and_search( "--search", "--data_prefix=" + dataset_path, "--benchmark_counters_tabular", - # "--benchmark_min_time=1x", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, "--benchmark_min_warmup_time=0.01", From b813d19a9c11a6cd923323327dd7fb22ff5bdf20 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Oct 2023 16:18:01 -0400 Subject: [PATCH 19/19] Updating docs --- cpp/bench/ann/src/common/benchmark.hpp | 3 +- docs/source/raft_ann_benchmarks.md | 33 +++++++++++++++++++ .../src/raft-ann-bench/run/__main__.py | 2 +- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index ec52013919..10a256bd63 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -155,7 +155,7 @@ void bench_build(::benchmark::State& state, } } state.counters.insert( - {{"GPU Time", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}}); + {{"GPU", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}}); if (state.skipped()) { return; } make_sure_parent_dir_exists(index.file); @@ -367,6 +367,7 @@ void register_build(std::shared_ptr> dataset, auto* b = ::benchmark::RegisterBenchmark( index.name + suf, bench_build, dataset, index, force_overwrite); b->Unit(benchmark::kSecond); + b->MeasureProcessCPUTime(); b->UseRealTime(); } } diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 1e3958b30f..6a436a7213 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -16,6 +16,7 @@ This project provides a benchmark program for various ANN search implementations - [End to end: small-scale (<1M to 10M)](#end-to-end-small-scale-benchmarks-1m-to-10m) - [End to end: large-scale (>10M)](#end-to-end-large-scale-benchmarks-10m-vectors) - [Running with Docker containers](#running-with-docker-containers) + - [Evaluating the results](#evaluating-the-results) - [Creating and customizing dataset configurations](#creating-and-customizing-dataset-configurations) - [Adding a new ANN algorithm](#adding-a-new-ann-algorithm) - [Parameter tuning guide](https://docs.rapids.ai/api/raft/nightly/ann_benchmarks_param_tuning/) @@ -359,6 +360,38 @@ This will drop you into a command line in the container, with the `raft-ann-benc Additionally, the containers can be run in detached mode without any issue. + +### Evaluating the results + +The benchmarks capture several different measurements. The table below describes each of the measurements for index build benchmarks: + +| Name | Description | +|------------|--------------------------------------------------------| +| Benchmark | A name that uniquely identifies the benchmark instance | +| Time | Wall-time spent training the index | +| CPU | CPU time spent training the index | +| Iterations | Number of iterations (this is usually 1) | +| GPU | GPU time spent building | +| index_size | Number of vectors used to train index | + + +The table below describes each of the measurements for the index search benchmarks: + +| Name | Description | +|------|-------------------------------------------------------------------------------------------------------------------------------------------------------| +| Benchmark | A name that uniquely identifies the benchmark instance | +| Time | The average runtime for each batch. This is approximately `end_to_end` / `Iterations` | +| CPU | The average `wall-time`. In `throughput` mode, this is the average `wall-time` spent in each thread. | +| Iterations | Total number of batches. This is going to be `total_queres` / `n_queries` | +| Recall | Proportion of correct neighbors to ground truth neighbors. Note this column is only present if groundtruth file is specified in dataset configuration | +| items_per_second | Total throughput. This is approximately `total_queries` / `end_to_end`. | +| k | Number of neighbors being queried in each iteration | +| end_to_end | Total time taken to run all batches for all iterations | +| n_queries | Total number of query vectors in each batch | +| total_queries | Total number of vectors queries across all iterations | + +Note that the actual table displayed on the screen may differ slightly as the hyper-parameters will also be displayed for each different combination being benchmarked. + ## Creating and customizing dataset configurations A single configuration file will often define a set of algorithms, with associated index and search parameters, for a specific dataset. A configuration file uses json format with 4 major parts: diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py index 75b7734349..30d642f3ac 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py @@ -105,6 +105,7 @@ def run_build_and_search( "--build", "--data_prefix=" + dataset_path, "--benchmark_out_format=json", + "--benchmark_counters_tabular=true", "--benchmark_out=" + f"{os.path.join(build_folder, f'{algo}.json')}", ] @@ -121,7 +122,6 @@ def run_build_and_search( "--search", "--data_prefix=" + dataset_path, "--benchmark_counters_tabular", - # "--benchmark_min_time=1x", "--override_kv=k:%s" % k, "--override_kv=n_queries:%s" % batch_size, "--benchmark_min_warmup_time=0.01",