-
Notifications
You must be signed in to change notification settings - Fork 394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
initial gRPC spec changes for supporting index creation and querying #8829
Changes from 5 commits
5048299
c58ae63
23ca90a
da5b224
463ae84
43abe0d
ae643de
7ed9ab2
d3722fb
8ab34a8
33d70db
744b69c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,16 @@ service StorageNode { | |
rpc Query(QueryRequest) returns (stream DataframePart) {} | ||
rpc FetchRecording(FetchRecordingRequest) returns (stream rerun.common.v0.RerunChunk) {} | ||
|
||
rpc IndexCollection(IndexCollectionRequest) returns (IndexCollectionResponse) {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In addition to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracking which recordings are in the index so we know whether reindexing is necessary would be nice to think about, but not necessary yet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://github.com/rerun-io/dataplatform/issues/156 This will be tackled in a first follow up. |
||
// Collection index query response is a RecordBatch with 2 columns: | ||
zehiko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// - 'resource_id' column with the id of the resource | ||
// - timepoint column with the values representing the points in time | ||
// where index query matches. What time points are matched depends on the type of | ||
// index that is queried. For example for vector search it might be timepoints where | ||
// top-K matches are found within *each* resource in the collection. For inverted index | ||
// it might be timepoints where the query string is found in the indexed column | ||
rpc QueryCollectionIndex(QueryCollectionIndexRequest) returns (stream DataframePart) {} | ||
|
||
// metadata API calls | ||
rpc QueryCatalog(QueryCatalogRequest) returns (stream DataframePart) {} | ||
rpc UpdateCatalog(UpdateCatalogRequest) returns (UpdateCatalogResponse) {} | ||
|
@@ -32,6 +42,108 @@ message DataframePart { | |
bytes payload = 1000; | ||
} | ||
|
||
// ---------------- IndexCollection ------------------ | ||
|
||
// Request message of the `IndexCollection` RPC: describes which column of | ||
// which collection should be indexed, and with what kind of index. | ||
message IndexCollectionRequest { | ||
zehiko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Collection for which the index should be created. | ||
Collection collection = 1; | ||
// Kind of index to create, together with the properties that are | ||
// specific to that kind of index. | ||
IndexProperties properties = 2; | ||
// Component column that should be indexed. | ||
rerun.common.v0.ComponentColumnDescriptor column = 3; | ||
// Filter index, i.e. the timeline whose timepoints will be | ||
// queried. | ||
rerun.common.v0.IndexColumnSelector time_index = 4; | ||
} | ||
|
||
// Selects the kind of index to create and carries the properties specific | ||
// to that kind. Used as the `properties` field of `IndexCollectionRequest`. | ||
message IndexProperties { | ||
// At most one of the members below can be set at a time (oneof). | ||
oneof props { | ||
// Properties of an inverted index. | ||
InvertedIndex inverted = 1; | ||
// Properties of a vector index. | ||
VectorIvfPqIndex vector = 2; | ||
// Properties of a B-tree index. | ||
BTreeIndex btree = 3; | ||
} | ||
} | ||
|
||
// Properties of an inverted index (the `inverted` variant of | ||
// `IndexProperties`). | ||
message InvertedIndex { | ||
// NOTE(review): presumably controls whether token positions are stored | ||
// in the index - TODO confirm and document the default behaviour. | ||
bool store_position = 1; | ||
// NOTE(review): presumably the name of the tokenizer used to split the | ||
// indexed text - TODO confirm and list the accepted values. | ||
string base_tokenizer = 2; | ||
// TODO(zehiko) add properties as needed | ||
} | ||
|
||
// Properties of a vector index (the `vector` variant of `IndexProperties`). | ||
// NOTE(review): the name suggests an IVF-PQ (inverted file + product | ||
// quantization) index - TODO confirm. | ||
message VectorIvfPqIndex { | ||
// Number of partitions of the index. | ||
// NOTE(review): valid range and behaviour for 0 are not specified here. | ||
uint32 num_partitions = 1; | ||
// Number of sub-vectors of the index. | ||
// NOTE(review): valid range and behaviour for 0 are not specified here. | ||
uint32 num_sub_vectors = 2; | ||
// Distance metric of the index; see `VectorDistanceMetric`. | ||
VectorDistanceMetric distance_metrics = 3; | ||
} | ||
|
||
// Distance metric selectable for a vector index | ||
// (see `VectorIvfPqIndex.distance_metrics`). | ||
// NOTE(review): assuming this file is proto3, the zero value `L2` is what a | ||
// receiver sees when the field is left unset, so "not specified" and | ||
// "L2" cannot be told apart. The values are also not prefixed with the enum | ||
// name, so they share the enclosing scope with any sibling enum values. | ||
// Consider an `..._UNSPECIFIED = 0` value and prefixed names before this | ||
// API is published. | ||
enum VectorDistanceMetric { | ||
// Also the default value (0). | ||
L2 = 0; | ||
COSINE = 1; | ||
DOT = 2; | ||
HAMMING = 3; | ||
} | ||
|
||
// Properties of a B-tree index (the `btree` variant of `IndexProperties`). | ||
// Currently has no fields. | ||
message BTreeIndex { | ||
// TODO(zehiko) add properties as needed | ||
} | ||
|
||
// Response message of the `IndexCollection` RPC. Currently has no fields. | ||
message IndexCollectionResponse {} | ||
|
||
|
||
// ---------------- QueryCollectionIndex ------------------ | ||
|
||
// Request message of the `QueryCollectionIndex` RPC: names the collection | ||
// to query and carries the index-type-specific query. | ||
message QueryCollectionIndexRequest { | ||
zehiko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Collection we want to run the query against. | ||
// If not specified, the default collection is queried. | ||
Collection collection = 1; | ||
// Query properties specific to the type of index being queried. | ||
IndexQuery query = 2; | ||
} | ||
|
||
// Index query, expressed per index type. Used as the `query` field of | ||
// `QueryCollectionIndexRequest`. | ||
message IndexQuery { | ||
// Query properties specific to the index type. At most one of the | ||
// members below can be set at a time (oneof). | ||
oneof query { | ||
// Query against an inverted index. | ||
InvertedIndexQuery inverted = 1; | ||
// Query against a vector index. | ||
VectorIndexQuery vector = 2; | ||
// Query against a B-tree index. | ||
BTreeIndexQuery btree = 3; | ||
} | ||
} | ||
|
||
// Query against an inverted index (the `inverted` variant of `IndexQuery`). | ||
message InvertedIndexQuery { | ||
// Query to execute, represented as arrow data. | ||
// The query should be a unit RecordBatch with 2 columns: | ||
// - 'index' column with the name of the column we want to query | ||
zehiko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// - 'query' column with the value we want to query. It must be | ||
// of utf8 type. | ||
// NOTE(review): "unit" presumably means a single-row batch - TODO confirm. | ||
DataframePart query = 1; | ||
// TODO(zehiko) add properties as needed | ||
} | ||
|
||
// Query against a vector index (the `vector` variant of `IndexQuery`). | ||
message VectorIndexQuery { | ||
// Query to execute, represented as arrow data. | ||
// The query should be a unit RecordBatch with 2 columns: | ||
// - 'index' column with the name of the column we want to query | ||
// - 'query' column with the value we want to query. It must be a | ||
// float32 array. | ||
// NOTE(review): "unit" presumably means a single-row batch - TODO confirm. | ||
DataframePart query = 1; | ||
// NOTE(review): presumably the K of the top-K matches looked up within | ||
// each resource of the collection - TODO confirm, and specify what | ||
// 0 (the value seen when the field is unset) means. | ||
uint32 top_k = 2; | ||
} | ||
|
||
// Query against a B-tree index (the `btree` variant of `IndexQuery`). | ||
message BTreeIndexQuery { | ||
// Query to execute, represented as arrow data. | ||
// The query should be a unit RecordBatch with 2 columns: | ||
// - 'index' column with the name of the column we want to query | ||
// - 'query' column with the value we want to query. Its type should | ||
// be the same as the type of the indexed column. | ||
// NOTE(review): "unit" presumably means a single-row batch - TODO confirm. | ||
DataframePart query = 1; | ||
// TODO(zehiko) add properties as needed | ||
} | ||
|
||
// Identifies a collection by name. Referenced by `IndexCollectionRequest` | ||
// and `QueryCollectionIndexRequest`. | ||
message Collection { | ||
// Name of the collection. | ||
// NOTE(review): whether an empty name selects the default collection is | ||
// not specified here - TODO confirm. | ||
string name = 1; | ||
} | ||
|
||
// ---------------- GetRecordingSchema ------------------ | ||
|
||
message GetRecordingSchemaRequest { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right now everything lives under one
service StorageNode
. I wonder if it is cleaner to create several services that group similar calls logically? Maybe something like CatalogService
,IndexService
, ...? It might be nice in the future to decide on a fine granular level which services should spin up, for example if we want to distribute loads across multiple VMs?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is still a tight relation between catalog, collection, collection query path, and collection index query path. This definitely requires a bit of thinking about how we split it. I'll create an issue, but I won't tackle this as part of this story.