Skip to content

Commit

Permalink
Merge remote-tracking branch 'vsag_main/main' into add_batch_vid_dis
Browse files Browse the repository at this point in the history
  • Loading branch information
zourunxin.zrx committed Jan 20, 2025
2 parents a927930 + 892cbe0 commit 072c929
Show file tree
Hide file tree
Showing 80 changed files with 3,514 additions and 166 deletions.
91 changes: 90 additions & 1 deletion examples/cpp/todo_examples/101_index_hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,93 @@
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.

#include <vsag/vsag.h>

#include <iostream>

// Example: build an HNSW index over randomly generated vectors and run a
// single k-nearest-neighbour query against it, printing the hits.
//
// argc / argv are unused.
// Returns 0 on success and -1 when the index cannot be created or built, or
// when the search reports an error.
int
main(int argc, char** argv) {
    /******************* Prepare Base Dataset *****************/
    const int64_t num_vectors = 1000;
    const int64_t dim = 128;
    auto ids = new int64_t[num_vectors];
    auto vectors = new float[dim * num_vectors];

    // Fixed seed: every run generates the same base vectors and query.
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;
    for (int64_t i = 0; i < num_vectors; ++i) {
        ids[i] = i;
    }
    for (int64_t i = 0; i < dim * num_vectors; ++i) {
        vectors[i] = distrib_real(rng);
    }
    auto base = vsag::Dataset::Make();
    // Transfer the ownership of the data (ids, vectors) to the base.
    base->NumElements(num_vectors)->Dim(dim)->Ids(ids)->Float32Vectors(vectors);

    /******************* Create HNSW Index *****************/
    // hnsw_build_parameters is the configuration for building an HNSW index.
    // The "dtype" specifies the data type, which supports float32 and int8.
    // The "metric_type" indicates the distance metric type (e.g., cosine, inner product, and L2).
    // The "dim" represents the dimensionality of the vectors, indicating the number of features for each data point.
    // The "hnsw" section contains parameters specific to HNSW:
    // - "max_degree": The maximum number of connections for each node in the graph.
    // - "ef_construction": The size used for nearest neighbor search during graph construction, which affects both speed and the quality of the graph.
    auto hnsw_build_parameters = R"(
{
"dtype": "float32",
"metric_type": "l2",
"dim": 128,
"hnsw": {
"max_degree": 16,
"ef_construction": 100
}
}
)";
    // Check the creation result before unwrapping it, so a rejected
    // configuration is reported instead of failing inside value().
    auto index_result = vsag::Factory::CreateIndex("hnsw", hnsw_build_parameters);
    if (!index_result.has_value()) {
        std::cerr << "Failed to create index: " << index_result.error().message << std::endl;
        return -1;
    }
    auto index = index_result.value();

    /******************* Build HNSW Index *****************/
    auto build_result = index->Build(base);
    if (!build_result.has_value()) {
        std::cerr << "Failed to build index: " << build_result.error().message << std::endl;
        // Return (rather than exit) so the locals above are destroyed normally.
        return -1;
    }
    std::cout << "After Build(), Index HNSW contains: " << index->GetNumElements() << std::endl;

    /******************* KnnSearch For HNSW Index *****************/
    auto query_vector = new float[dim];
    for (int64_t i = 0; i < dim; ++i) {
        query_vector[i] = distrib_real(rng);
    }

    // hnsw_search_parameters is the configuration for searching in an HNSW index.
    // The "hnsw" section contains parameters specific to the search operation:
    // - "ef_search": The size of the dynamic list used for nearest neighbor search, which influences both recall and search speed.
    auto hnsw_search_parameters = R"(
{
"hnsw": {
"ef_search": 100
}
}
)";
    const int64_t topk = 10;
    auto query = vsag::Dataset::Make();
    // Owner(true): the query dataset is marked as owning query_vector.
    query->NumElements(1)->Dim(dim)->Float32Vectors(query_vector)->Owner(true);
    auto knn_result = index->KnnSearch(query, topk, hnsw_search_parameters);

    /******************* Print Search Result *****************/
    if (!knn_result.has_value()) {
        std::cerr << "Search Error: " << knn_result.error().message << std::endl;
        return -1;
    }

    auto result = knn_result.value();
    // NOTE(review): the loop bound is the result dataset's GetDim(), which
    // presumably holds the number of hits returned — confirm against the vsag
    // Dataset documentation.
    const auto num_results = result->GetDim();
    const auto result_ids = result->GetIds();
    const auto result_distances = result->GetDistances();
    std::cout << "results: " << std::endl;
    for (int64_t i = 0; i < num_results; ++i) {
        std::cout << result_ids[i] << ": " << result_distances[i] << std::endl;
    }

    return 0;
}
109 changes: 108 additions & 1 deletion examples/cpp/todo_examples/102_index_diskann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,111 @@
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.

#include <vsag/vsag.h>

#include <iostream>

// Example: build a DiskANN index over randomly generated vectors and run a
// single k-nearest-neighbour query against it, printing the hits.
//
// argc / argv are unused.
// Returns 0 on success and -1 when the index cannot be created or built, or
// when the search reports an error.
int
main(int argc, char** argv) {
    /******************* Prepare Base Dataset *****************/
    const int64_t num_vectors = 1000;
    const int64_t dim = 128;
    auto ids = new int64_t[num_vectors];
    auto vectors = new float[dim * num_vectors];

    // Fixed seed: every run generates the same base vectors and query.
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;
    for (int64_t i = 0; i < num_vectors; ++i) {
        ids[i] = i;
    }
    for (int64_t i = 0; i < dim * num_vectors; ++i) {
        vectors[i] = distrib_real(rng);
    }
    auto base = vsag::Dataset::Make();
    // Transfer the ownership of the data (ids, vectors) to the base.
    base->NumElements(num_vectors)->Dim(dim)->Ids(ids)->Float32Vectors(vectors);

    /******************* Create DiskANN Index *****************/
    // diskann_build_parameters is the configuration for building a DiskANN index.
    // The "dtype" specifies the data type, "metric_type" indicates the distance metric,
    // and "dim" represents the dimensionality of the feature vectors.
    // The "diskann" section contains parameters specific to DiskANN:
    // - "max_degree": Maximum degree of the graph
    // - "ef_construction": Construction phase efficiency factor
    // - "pq_sample_rate": PQ sampling rate
    // - "pq_dims": PQ dimensionality
    // - "use_pq_search": Indicates whether to cache the graph in memory and use PQ vectors for retrieval (optional)
    // - "use_async_io": Specifies whether to use asynchronous I/O (optional)
    // - "use_bsa": Determines whether to use the BSA method for lossless filtering during the reordering phase (optional)
    // Other parameters are mandatory.
    auto diskann_build_parameters = R"(
{
"dtype": "float32",
"metric_type": "l2",
"dim": 128,
"diskann": {
"max_degree": 16,
"ef_construction": 200,
"pq_sample_rate": 0.5,
"pq_dims": 9,
"use_pq_search": true,
"use_async_io": true,
"use_bsa": true
}
}
)";
    // Check the creation result before unwrapping it, so a rejected
    // configuration is reported instead of failing inside value().
    auto index_result = vsag::Factory::CreateIndex("diskann", diskann_build_parameters);
    if (!index_result.has_value()) {
        std::cerr << "Failed to create index: " << index_result.error().message << std::endl;
        return -1;
    }
    auto index = index_result.value();

    /******************* Build DiskANN Index *****************/
    auto build_result = index->Build(base);
    if (!build_result.has_value()) {
        std::cerr << "Failed to build index: " << build_result.error().message << std::endl;
        // Return (rather than exit) so the locals above are destroyed normally.
        return -1;
    }
    std::cout << "After Build(), Index DiskANN contains: " << index->GetNumElements()
              << std::endl;

    /******************* KnnSearch For DiskANN Index *****************/
    auto query_vector = new float[dim];
    for (int64_t i = 0; i < dim; ++i) {
        query_vector[i] = distrib_real(rng);
    }

    // diskann_search_parameters is the configuration for searching in a DiskANN index.
    // The "diskann" section contains parameters specific to the search operation:
    // - "ef_search": The search efficiency factor, which influences accuracy and speed.
    // - "beam_search": The number of beams to use during the search process, balancing exploration and exploitation.
    // - "io_limit": The maximum number of I/O operations allowed during the search.
    // - "use_reorder": Indicates whether to perform reordering of results for better accuracy (optional).
    auto diskann_search_parameters = R"(
{
"diskann": {
"ef_search": 100,
"beam_search": 4,
"io_limit": 50,
"use_reorder": true
}
}
)";
    const int64_t topk = 10;
    auto query = vsag::Dataset::Make();
    // Owner(true): the query dataset is marked as owning query_vector.
    query->NumElements(1)->Dim(dim)->Float32Vectors(query_vector)->Owner(true);
    auto knn_result = index->KnnSearch(query, topk, diskann_search_parameters);

    /******************* Print Search Result *****************/
    if (!knn_result.has_value()) {
        std::cerr << "Search Error: " << knn_result.error().message << std::endl;
        return -1;
    }

    auto result = knn_result.value();
    // NOTE(review): the loop bound is the result dataset's GetDim(), which
    // presumably holds the number of hits returned — confirm against the vsag
    // Dataset documentation.
    const auto num_results = result->GetDim();
    const auto result_ids = result->GetIds();
    const auto result_distances = result->GetDistances();
    std::cout << "results: " << std::endl;
    for (int64_t i = 0; i < num_results; ++i) {
        std::cout << result_ids[i] << ": " << result_distances[i] << std::endl;
    }

    return 0;
}
94 changes: 93 additions & 1 deletion examples/cpp/todo_examples/104_index_fresh_hnsw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,96 @@
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.

#include <vsag/vsag.h>

#include <iostream>

// Example: build a FreshHNSW index over randomly generated vectors and run a
// single k-nearest-neighbour query against it, printing the hits.
//
// argc / argv are unused.
// Returns 0 on success and -1 when the index cannot be created or built, or
// when the search reports an error.
int
main(int argc, char** argv) {
    /******************* Prepare Base Dataset *****************/
    const int64_t num_vectors = 1000;
    const int64_t dim = 128;
    auto ids = new int64_t[num_vectors];
    auto vectors = new float[dim * num_vectors];

    // Fixed seed: every run generates the same base vectors and query.
    std::mt19937 rng;
    rng.seed(47);
    std::uniform_real_distribution<> distrib_real;
    for (int64_t i = 0; i < num_vectors; ++i) {
        ids[i] = i;
    }
    for (int64_t i = 0; i < dim * num_vectors; ++i) {
        vectors[i] = distrib_real(rng);
    }
    auto base = vsag::Dataset::Make();
    // Transfer the ownership of the data (ids, vectors) to the base.
    base->NumElements(num_vectors)->Dim(dim)->Ids(ids)->Float32Vectors(vectors);

    /******************* Create FreshHnsw Index *****************/
    // fresh_hnsw_build_parameters is the configuration for building a FreshHNSW index.
    // The "dtype" specifies the data type, which supports float32 and int8.
    // The "metric_type" indicates the distance metric type (e.g., cosine, inner product, and L2).
    // The "dim" represents the dimensionality of the vectors, indicating the number of features for each data point.
    // The "fresh_hnsw" section contains parameters specific to FreshHNSW:
    // - "max_degree": The maximum number of connections for each node in the graph.
    // - "ef_construction": The size used for nearest neighbor search during graph construction, which affects both speed and the quality of the graph.
    auto fresh_hnsw_build_parameters = R"(
{
"dtype": "float32",
"metric_type": "l2",
"dim": 128,
"fresh_hnsw": {
"max_degree": 16,
"ef_construction": 100
}
}
)";
    // The difference between HNSW and FreshHNSW is that FreshHNSW supports actual deletions,
    // while HNSW only supports marked deletions. However, FreshHNSW incurs double the graph
    // storage cost of HNSW due to the need to store reverse edges.
    //
    // Check the creation result before unwrapping it, so a rejected
    // configuration is reported instead of failing inside value().
    auto index_result = vsag::Factory::CreateIndex("fresh_hnsw", fresh_hnsw_build_parameters);
    if (!index_result.has_value()) {
        std::cerr << "Failed to create index: " << index_result.error().message << std::endl;
        return -1;
    }
    auto index = index_result.value();

    /******************* Build FreshHnsw Index *****************/
    auto build_result = index->Build(base);
    if (!build_result.has_value()) {
        std::cerr << "Failed to build index: " << build_result.error().message << std::endl;
        // Return (rather than exit) so the locals above are destroyed normally.
        return -1;
    }
    std::cout << "After Build(), Index FreshHnsw contains: " << index->GetNumElements()
              << std::endl;

    /******************* KnnSearch For FreshHnsw Index *****************/
    auto query_vector = new float[dim];
    for (int64_t i = 0; i < dim; ++i) {
        query_vector[i] = distrib_real(rng);
    }

    // fresh_hnsw_search_parameters is the configuration for searching in a FreshHNSW index.
    // The "fresh_hnsw" section contains parameters specific to the search operation:
    // - "ef_search": The size of the dynamic list used for nearest neighbor search, which influences both recall and search speed.
    auto fresh_hnsw_search_parameters = R"(
{
"fresh_hnsw": {
"ef_search": 100
}
}
)";
    const int64_t topk = 10;
    auto query = vsag::Dataset::Make();
    // Owner(true): the query dataset is marked as owning query_vector.
    query->NumElements(1)->Dim(dim)->Float32Vectors(query_vector)->Owner(true);
    auto knn_result = index->KnnSearch(query, topk, fresh_hnsw_search_parameters);

    /******************* Print Search Result *****************/
    if (!knn_result.has_value()) {
        std::cerr << "Search Error: " << knn_result.error().message << std::endl;
        return -1;
    }

    auto result = knn_result.value();
    // NOTE(review): the loop bound is the result dataset's GetDim(), which
    // presumably holds the number of hits returned — confirm against the vsag
    // Dataset documentation.
    const auto num_results = result->GetDim();
    const auto result_ids = result->GetIds();
    const auto result_distances = result->GetDistances();
    std::cout << "results: " << std::endl;
    for (int64_t i = 0; i < num_results; ++i) {
        std::cout << result_ids[i] << ": " << result_distances[i] << std::endl;
    }

    return 0;
}
Loading

0 comments on commit 072c929

Please sign in to comment.