Skip to content

Commit

Permalink
support sparse dataset
Browse files Browse the repository at this point in the history
Signed-off-by: zhongxiaoyao.zxy <[email protected]>
  • Loading branch information
ShawnShawnYou committed Jan 13, 2025
1 parent 51816d4 commit 5a6c1ef
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/vsag/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ extern const char* const NUM_ELEMENTS;
extern const char* const IDS;
extern const char* const DISTS;
extern const char* const FLOAT32_VECTORS;
extern const char* const SPARSE_VECTORS;
extern const char* const INT8_VECTORS;
extern const char* const DATASET_PATHS;
extern const char* const HNSW_DATA;
Expand Down
29 changes: 29 additions & 0 deletions include/vsag/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ namespace vsag {
class Dataset;
using DatasetPtr = std::shared_ptr<Dataset>;

/**
 * @brief Plain descriptor for a batch of sparse vectors stored in a CSR-like layout.
 *
 * The struct only holds raw pointers; copying it copies the pointers, not the arrays.
 */
struct SparseVectors {
    // Number of vectors in the batch.
    uint32_t num{0};

    // Offset table with num + 1 entries; vector i occupies [offsets[i], offsets[i + 1]),
    // so offsets[i + 1] - offsets[i] is the number of entries of vector i.
    // e.g. [0, 2, 5, 10]: vector 0 has 2 - 0 == 2 entries, vector 1 has 5 - 2 == 3 entries.
    uint32_t* offsets{nullptr};

    // offsets[num] ids in total (presumably the dimension index of each entry -- TODO confirm).
    uint32_t* ids{nullptr};

    // offsets[num] values in total, parallel to ids.
    float* vals{nullptr};

    // Kept user-provided (rather than defaulted) so the type remains a non-aggregate.
    SparseVectors() {
    }
};

/**
* @class Dataset
*
Expand Down Expand Up @@ -173,6 +185,23 @@ class Dataset : public std::enable_shared_from_this<Dataset> {
virtual const float*
GetFloat32Vectors() const = 0;

/**
* @brief Sets the sparse vectors for the dataset.
*
* The struct is passed by value, so only its fields (the vector count and the
* offsets/ids/vals pointers) are copied; the arrays they point to are not duplicated.
*
* @param sparse_vectors Struct describing the sparse vectors (see struct SparseVectors).
* @return DatasetPtr A shared pointer to the dataset with updated sparse vectors.
*/
virtual DatasetPtr
SparseVectors(const struct SparseVectors sparse_vectors) = 0;

/**
* @brief Retrieves the sparse vectors struct of the dataset.
*
* @return const struct SparseVectors A by-value copy of the stored struct; its
*         offsets/ids/vals pointers refer to the same arrays that were set on the dataset.
*/
virtual const struct SparseVectors
GetSparseVectors() const = 0;

/**
* @brief Sets the paths array for the dataset.
*
Expand Down
1 change: 1 addition & 0 deletions src/constants.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const char* const NUM_ELEMENTS = "num_elements";
const char* const IDS = "ids";
const char* const DISTS = "dists";
const char* const FLOAT32_VECTORS = "f32_vectors";
const char* const SPARSE_VECTORS = "sparse_vectors";
const char* const INT8_VECTORS = "i8_vectors";
const char* const DATASET_PATHS = "paths";
const char* const HNSW_DATA = "hnsw_data";
Expand Down
32 changes: 30 additions & 2 deletions src/dataset_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,12 @@
namespace vsag {

class DatasetImpl : public Dataset {
using var =
std::variant<int64_t, const float*, const int8_t*, const int64_t*, const std::string*>;
using var = std::variant<int64_t,
const float*,
const int8_t*,
const int64_t*,
const std::string*,
struct SparseVectors>;

public:
DatasetImpl() = default;
Expand All @@ -47,12 +51,20 @@ class DatasetImpl : public Dataset {
allocator_->Deallocate((void*)this->GetInt8Vectors());
allocator_->Deallocate((void*)this->GetFloat32Vectors());
allocator_->Deallocate((void*)this->GetPaths());

allocator_->Deallocate((void*)this->GetSparseVectors().offsets);
allocator_->Deallocate((void*)this->GetSparseVectors().ids);
allocator_->Deallocate((void*)this->GetSparseVectors().vals);
} else {
delete[] this->GetIds();
delete[] this->GetDistances();
delete[] this->GetInt8Vectors();
delete[] this->GetFloat32Vectors();
delete[] this->GetPaths();

delete[] this->GetSparseVectors().offsets;
delete[] this->GetSparseVectors().ids;
delete[] this->GetSparseVectors().vals;
}
}

Expand Down Expand Up @@ -165,6 +177,22 @@ class DatasetImpl : public Dataset {
return nullptr;
}

// Stores a copy of the given sparse-vectors struct under the SPARSE_VECTORS key
// (replacing any previous entry) and returns this dataset for call chaining.
DatasetPtr
SparseVectors(const struct SparseVectors sparse_vectors) override {
    auto& slot = this->data_[SPARSE_VECTORS];
    slot = sparse_vectors;
    return this->shared_from_this();
}

const struct SparseVectors
GetSparseVectors() const override {
if (auto iter = this->data_.find(SPARSE_VECTORS); iter != this->data_.end()) {
return std::get<struct SparseVectors>(iter->second);
}

struct SparseVectors null_sparse_vectors;
return null_sparse_vectors;
}

DatasetPtr
Paths(const std::string* paths) override {
this->data_[DATASET_PATHS] = paths;
Expand Down
7 changes: 7 additions & 0 deletions src/dataset_impl_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <catch2/catch_test_macros.hpp>

#include "default_allocator.h"
#include "fixtures.h"
#include "vsag/dataset.h"

TEST_CASE("test dataset", "[ut][dataset]") {
Expand All @@ -40,4 +41,10 @@ TEST_CASE("test dataset", "[ut][dataset]") {
dataset->Float32Vectors(data)->Owner(false);
delete[] data;
}

// Round-trips a generated sparse-vectors struct through the dataset and checks
// that the stored copy exposes the same count and array pointers.
SECTION("sparse vector") {
    auto dataset = vsag::Dataset::Make();
    vsag::SparseVectors sparse_vectors = fixtures::GenerateSparseVectors(100);
    // Owner(true): the dataset takes over the new[]-allocated arrays, so they are
    // not freed here.
    dataset->SparseVectors(sparse_vectors)->Owner(true);

    const auto stored = dataset->GetSparseVectors();
    REQUIRE(stored.num == 100);
    REQUIRE(stored.offsets == sparse_vectors.offsets);
    REQUIRE(stored.ids == sparse_vectors.ids);
    REQUIRE(stored.vals == sparse_vectors.vals);
    REQUIRE(stored.offsets[0] == 0);
}
}
1 change: 1 addition & 0 deletions src/inner_string_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ const char* const QUANTIZATION_TYPE_VALUE_SQ8_UNIFORM = "sq8_uniform";
const char* const QUANTIZATION_TYPE_VALUE_SQ4 = "sq4";
const char* const QUANTIZATION_TYPE_VALUE_SQ4_UNIFORM = "sq4_uniform";
const char* const QUANTIZATION_TYPE_VALUE_FP32 = "fp32";
// Quantization type value for sparse vectors. It must be a distinct string from the
// "fp32" value so that the two quantization types can be told apart when parsed.
const char* const QUANTIZATION_TYPE_VALUE_SPARSE = "sparse";
const char* const QUANTIZATION_TYPE_VALUE_PQ = "pq";

// quantization params key
Expand Down
30 changes: 30 additions & 0 deletions tests/fixtures/fixtures.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,36 @@ get_common_used_dims(uint64_t count, int seed) {
return result;
}

/**
 * Generates `count` pseudo-random sparse vectors in offsets/ids/vals layout.
 *
 * Each vector gets a length drawn uniformly from [0, dim_limit] (so a vector may be
 * empty). Every entry gets an id drawn uniformly from [0, dim_limit] and a value from
 * the default uniform_real_distribution<float> range [0, 1). Ids inside one vector are
 * neither sorted nor de-duplicated.
 *
 * The three arrays are allocated with new[] and are NOT freed here: the caller must
 * delete[] them or hand them to an owner that does.
 *
 * @param count     number of vectors to generate
 * @param dim_limit inclusive upper bound for both per-vector length and ids
 * @param seed      seed of the mt19937 engine; equal seeds give equal output
 * @return struct with num == count and offsets of length count + 1
 */
vsag::SparseVectors
GenerateSparseVectors(uint32_t count, uint32_t dim_limit, int seed) {
    std::mt19937 rng(seed);
    std::uniform_real_distribution<float> distrib_real;
    // The distribution type stays int so the generated sequence is unchanged;
    // the cast only makes the uint32_t -> int conversion explicit.
    std::uniform_int_distribution<int> distrib_int(0, static_cast<int>(dim_limit));

    vsag::SparseVectors sparse_vectors;
    sparse_vectors.num = count;
    sparse_vectors.offsets = new uint32_t[count + 1];

    // First pass: draw every vector length and build the offset table.
    uint32_t offset = 0;
    sparse_vectors.offsets[0] = offset;
    for (uint32_t i = 0; i < count; i++) {
        offset += distrib_int(rng);
        sparse_vectors.offsets[i + 1] = offset;
    }

    // offsets[count] is the total number of entries across all vectors.
    const uint32_t total = sparse_vectors.offsets[count];
    sparse_vectors.ids = new uint32_t[total];
    sparse_vectors.vals = new float[total];

    // Second pass: fill ids and vals vector by vector, in offset order.
    for (uint32_t i = 0; i < count; i++) {
        for (uint32_t j = sparse_vectors.offsets[i]; j < sparse_vectors.offsets[i + 1]; j++) {
            sparse_vectors.ids[j] = distrib_int(rng);
            sparse_vectors.vals[j] = distrib_real(rng);
        }
    }

    return sparse_vectors;
}

std::vector<float>
generate_vectors(uint64_t count, uint32_t dim, bool need_normalize, int seed) {
return std::move(GenerateVectors<float>(count, dim, seed, need_normalize));
Expand Down
3 changes: 3 additions & 0 deletions tests/fixtures/fixtures.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ GenerateVectors(uint64_t count, uint32_t dim, int seed = 47, bool need_normalize
return vectors;
}

// Generates `count` pseudo-random sparse vectors; per-vector length and ids are drawn
// from [0, dim_limit], and the result is deterministic for a given seed.
// The returned offsets/ids/vals arrays are allocated with new[]; the caller must
// delete[] them or pass them to a Dataset that owns its data (Owner(true)).
vsag::SparseVectors
GenerateSparseVectors(uint32_t count, uint32_t dim_limit = 100, int seed = 47);

std::vector<float>
generate_vectors(uint64_t count, uint32_t dim, bool need_normalize = true, int seed = 47);

Expand Down

0 comments on commit 5a6c1ef

Please sign in to comment.