Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faiss cagra bench #2216

Draft
wants to merge 5 commits into
base: branch-24.06
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu
Expand All @@ -361,6 +365,10 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu
Expand Down
56 changes: 23 additions & 33 deletions cpp/bench/ann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ list(APPEND CMAKE_MODULE_PATH "${RAFT_SOURCE_DIR}")
option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA "Include faiss' cagra algorithm wrapper in benchmark" ON)
option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON)

option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark"
Expand Down Expand Up @@ -52,16 +53,19 @@ if(BUILD_CPU_ONLY)
include(cmake/thirdparty/get_fmt)
include(cmake/thirdparty/get_spdlog)
set(RAFT_FAISS_ENABLE_GPU OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
set(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA OFF)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF)
set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF)
set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF)
set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF)
set(RAFT_ANN_BENCH_USE_GGNN OFF)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0)
# Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
# https://github.com/rapidsai/raft/issues/1627
set(RAFT_FAISS_ENABLE_GPU OFF)
# elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) # Disable faiss benchmarks
# on CUDA 12 since faiss is not yet CUDA 12-enabled. #
# https://github.com/rapidsai/raft/issues/1627 set(RAFT_FAISS_ENABLE_GPU ON)
endif()

set(RAFT_ANN_BENCH_USE_RAFT OFF)
Expand Down Expand Up @@ -198,40 +202,25 @@ endfunction()
# * Configure tests-------------------------------------------------------------

if(RAFT_ANN_BENCH_USE_HNSWLIB)
ConfigureAnnBench(
NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
)
ConfigureAnnBench(NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib)

endif()

if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ)
ConfigureAnnBench(
NAME
RAFT_IVF_PQ
PATH
src/raft/raft_benchmark.cu
src/raft/raft_ivf_pq.cu
LINKS
raft::compiled
NAME RAFT_IVF_PQ PATH src/raft/raft_benchmark.cu src/raft/raft_ivf_pq.cu LINKS raft::compiled
)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT)
ConfigureAnnBench(
NAME
RAFT_IVF_FLAT
PATH
src/raft/raft_benchmark.cu
src/raft/raft_ivf_flat.cu
LINKS
NAME RAFT_IVF_FLAT PATH src/raft/raft_benchmark.cu src/raft/raft_ivf_flat.cu LINKS
raft::compiled
)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE)
ConfigureAnnBench(
NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled
)
ConfigureAnnBench(NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled)
endif()

if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
Expand Down Expand Up @@ -260,36 +249,37 @@ message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}")
message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${RAFT_FAISS_TARGETS}
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

# Benchmark executable for the FAISS GPU CAGRA wrapper. Built only when the user
# option is ON and FAISS GPU support is available (RAFT_FAISS_ENABLE_GPU is
# switched OFF earlier in this file for CPU-only builds — see the guard logic above).
if(RAFT_ANN_BENCH_USE_FAISS_GPU_CAGRA AND RAFT_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_CAGRA PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
)
endif()

Expand Down
1 change: 1 addition & 0 deletions cpp/bench/ann/src/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ void bench_search(::benchmark::State& state,

// Each thread calculates recall on their partition of queries.
// evaluate recall
// std::cout << "max k: " << dataset->max_k() << std::endl;
if (dataset->max_k() >= k) {
const std::int32_t* gt = dataset->gt_set();
const std::uint32_t max_k = dataset->max_k();
Expand Down
22 changes: 22 additions & 0 deletions cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ void parse_build_param(const nlohmann::json& conf,
param.quantizer_type = conf.at("quantizer_type");
}

// Extracts the HNSW build parameters from the benchmark's JSON configuration.
// Required keys: "efConstruction" and "M"; nlohmann::json::at() throws if a key
// is absent, so a malformed config fails loudly at parse time.
template <typename T>
void parse_build_param(const nlohmann::json& conf,
                       typename raft::bench::ann::FaissCpuHNSW<T>::BuildParam& param)
{
  param.ef_construction = conf.at("efConstruction").template get<int>();
  param.M               = conf.at("M").template get<int>();
}

template <typename T>
void parse_search_param(const nlohmann::json& conf,
typename raft::bench::ann::FaissCpu<T>::SearchParam& param)
Expand All @@ -80,6 +88,14 @@ void parse_search_param(const nlohmann::json& conf,
if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); }
}

// Extracts the HNSW search parameters from the benchmark's JSON configuration.
// "ef" is mandatory; "numThreads" is optional and, when absent, leaves
// param.num_threads at whatever value it already holds.
template <typename T>
void parse_search_param(const nlohmann::json& conf,
                        typename raft::bench::ann::FaissCpuHNSW<T>::SearchParam& param)
{
  param.ef = conf.at("ef").template get<int>();
  if (conf.contains("numThreads")) {
    param.num_threads = conf.at("numThreads").template get<int>();
  }
}

template <typename T, template <typename> class Algo>
std::unique_ptr<raft::bench::ann::ANN<T>> make_algo(raft::bench::ann::Metric metric,
int dim,
Expand Down Expand Up @@ -125,6 +141,8 @@ std::unique_ptr<raft::bench::ann::ANN<T>> create_algo(const std::string& algo,
ann = make_algo<T, raft::bench::ann::FaissCpuIVFSQ>(metric, dim, conf);
} else if (algo == "faiss_cpu_flat") {
ann = std::make_unique<raft::bench::ann::FaissCpuFlat<T>>(metric, dim);
} else if (algo == "faiss_cpu_hnsw") {
ann = make_algo<T, raft::bench::ann::FaissCpuHNSW>(metric, dim, conf);
}
}

Expand All @@ -146,6 +164,10 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
} else if (algo == "faiss_cpu_flat") {
auto param = std::make_unique<typename raft::bench::ann::FaissCpu<T>::SearchParam>();
return param;
} else if (algo == "faiss_cpu_hnsw") {
auto param = std::make_unique<typename raft::bench::ann::FaissCpuHNSW<T>::SearchParam>();
parse_search_param<T>(conf, *param);
return param;
}
// else
throw std::runtime_error("invalid algo: '" + algo + "'");
Expand Down
114 changes: 85 additions & 29 deletions cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,16 @@
#pragma once

#include "../common/ann_types.hpp"
#include "../common/thread_pool.hpp"

#include <raft/core/logger.hpp>

#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexRefine.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/impl/HNSW.h>
#include <faiss/index_io.h>

#include <cassert>
Expand Down Expand Up @@ -73,7 +74,7 @@ class FaissCpu : public ANN<T> {
static_assert(std::is_same_v<T, float>, "faiss support only float type");
}

void build(const T* dataset, size_t nrow) final;
virtual void build(const T* dataset, size_t nrow);

void set_search_param(const AnnSearchParam& param) override;

Expand All @@ -88,11 +89,11 @@ class FaissCpu : public ANN<T> {

// TODO: if the number of results is less than k, the remaining elements of 'neighbors'
// will be filled with (size_t)-1
void search(const T* queries,
int batch_size,
int k,
AnnBase::index_type* neighbors,
float* distances) const final;
virtual void search(const T* queries,
int batch_size,
int k,
AnnBase::index_type* neighbors,
float* distances) const;

AlgoProperty get_preference() const override
{
Expand All @@ -116,9 +117,6 @@ class FaissCpu : public ANN<T> {
faiss::MetricType metric_type_;
int nlist_;
double training_sample_fraction_;

int num_threads_;
std::shared_ptr<FixedThreadPool> thread_pool_;
};

template <typename T>
Expand Down Expand Up @@ -149,12 +147,14 @@ void FaissCpu<T>::build(const T* dataset, size_t nrow)
index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing
assert(index_->is_trained);
index_->add(nrow, dataset);
std::cout << "finished adding" << std::endl;
index_refine_ = std::make_shared<faiss::IndexRefineFlat>(this->index_.get(), dataset);
}

template <typename T>
void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
{
std::cout << "should not be here" << std::endl;
auto search_param = dynamic_cast<const SearchParam&>(param);
int nprobe = search_param.nprobe;
assert(nprobe <= nlist_);
Expand All @@ -164,9 +164,12 @@ void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
this->index_refine_.get()->k_factor = search_param.refine_ratio;
}

if (!thread_pool_ || num_threads_ != search_param.num_threads) {
num_threads_ = search_param.num_threads;
thread_pool_ = std::make_shared<FixedThreadPool>(num_threads_);
if (param.metric_objective == Objective::LATENCY) {
// Let FAISS use its internal threading model with user defined `numThreads`
omp_set_num_threads(search_param.num_threads);
} else if (param.metric_objective == Objective::THROUGHPUT) {
// FAISS is not allowed to internally parallelize
omp_set_num_threads(1);
}
}

Expand All @@ -177,12 +180,7 @@ void FaissCpu<T>::search(
static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
"sizes of size_t and faiss::idx_t are different");

thread_pool_->submit(
[&](int i) {
// Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1.
index_->search(batch_size, queries, k, distances, reinterpret_cast<faiss::idx_t*>(neighbors));
},
1);
index_->search(batch_size, queries, k, distances, reinterpret_cast<faiss::idx_t*>(neighbors));
}

template <typename T>
Expand Down Expand Up @@ -301,16 +299,6 @@ class FaissCpuFlat : public FaissCpu<T> {
this->index_ = std::make_shared<faiss::IndexFlat>(dim, this->metric_type_);
}

// class FaissCpu is more like a IVF class, so need special treating here
void set_search_param(const typename ANN<T>::AnnSearchParam& param) override
{
auto search_param = dynamic_cast<const typename FaissCpu<T>::SearchParam&>(param);
if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) {
this->num_threads_ = search_param.num_threads;
this->thread_pool_ = std::make_shared<FixedThreadPool>(this->num_threads_);
}
};

void save(const std::string& file) const override
{
this->template save_<faiss::IndexFlat>(file);
Expand All @@ -323,4 +311,72 @@ class FaissCpuFlat : public FaissCpu<T> {
}
};

// ANN-bench wrapper around faiss::IndexHNSWFlat (CPU HNSW over raw vectors).
// Reuses FaissCpu<T> for (de)serialization and metric handling but replaces the
// index object and overrides build/search, since HNSW has no IVF-style training
// or refine stage.
template <typename T>
class FaissCpuHNSW : public FaissCpu<T> {
 public:
  // Build-time parameters for faiss::IndexHNSWFlat.
  struct BuildParam {
    int M;                // number of graph neighbors per node
    int ef_construction;  // candidate-list size while constructing the graph
  };

  using typename ANN<T>::AnnSearchParam;
  // Search-time parameters.
  struct SearchParam : public AnnSearchParam {
    int ef;           // candidate-list size during search (hnsw.efSearch)
    int num_threads;  // OpenMP thread count, applied in LATENCY mode only
  };

  FaissCpuHNSW(Metric metric, int dim, const BuildParam& param)
    : FaissCpu<T>(metric, dim, typename FaissCpu<T>::BuildParam())
  {
    // Replace the index created by the base class with an HNSW flat index and
    // apply the construction-time expansion factor.
    this->index_ = std::make_shared<faiss::IndexHNSWFlat>(param.M, dim, this->metric_type_);
    dynamic_cast<faiss::IndexHNSWFlat*>(this->index_.get())->hnsw.efConstruction =
      param.ef_construction;
  }

  void save(const std::string& file) const override
  {
    this->template save_<faiss::IndexHNSWFlat>(file);
  }
  void load(const std::string& file) override { this->template load_<faiss::IndexHNSWFlat>(file); }

  // NOTE(review): not declared `override` in the original — confirm whether
  // ANN<T>::copy() is virtual before adding the specifier.
  std::unique_ptr<ANN<T>> copy()
  {
    return std::make_unique<FaissCpuHNSW<T>>(*this);  // use copy constructor
  }

  void set_search_param(const AnnSearchParam& param) override
  {
    auto search_param = dynamic_cast<const SearchParam&>(param);
    if (search_param.metric_objective == Objective::LATENCY) {
      // Let FAISS use its internal threading model with user defined `numThreads`
      omp_set_num_threads(search_param.num_threads);
    } else if (search_param.metric_objective == Objective::THROUGHPUT) {
      // FAISS is not allowed to internally parallelize
      omp_set_num_threads(1);
    }
    search_params_.efSearch = search_param.ef;
  }

  void build(const T* dataset, size_t nrow) override
  {
    this->index_->train(nrow, dataset);  // faiss::IndexHNSWFlat::train() will do nothing
    assert(this->index_->is_trained);
    this->index_->add(nrow, dataset);
    // (removed leftover debug print to stdout)
  }

  // Parameter types match the base-class virtual (AnnBase::index_type*); FAISS
  // expects idx_t indices, which the base search path static_asserts to have
  // the same size as size_t, so the cast below is safe under that assumption.
  void search(const T* queries,
              int batch_size,
              int k,
              AnnBase::index_type* neighbors,
              float* distances) const override
  {
    this->index_->search(batch_size,
                         queries,
                         k,
                         distances,
                         reinterpret_cast<faiss::idx_t*>(neighbors),
                         &search_params_);
  }

 private:
  faiss::SearchParametersHNSW search_params_;
};
} // namespace raft::bench::ann
Loading
Loading