From b0bea44bc14fe9cf186401620c848690c1ca8d45 Mon Sep 17 00:00:00 2001 From: Chandra Sanapala Date: Thu, 7 Nov 2024 16:13:45 +0530 Subject: [PATCH 1/2] feat: Update operator to update a table --- proto/substrait/algebra.proto | 30 ++++++++++++++++++++++-- site/docs/relations/logical_relations.md | 30 ++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 3dba8ac9a..b0eb1aeca 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -531,6 +531,7 @@ message Rel { ReferenceRel reference = 21; WriteRel write = 19; DdlRel ddl = 20; + UpdateRel update = 22; // Physical relations HashJoinRel hash_join = 13; MergeJoinRel merge_join = 14; @@ -619,7 +620,7 @@ message WriteRel { // The type of operation to perform WriteOp op = 4; - // The relation that determines the records to add/remove/modify + // The relation that determines the records to add/remove // the schema must match with table_schema. Default values must be explicitly stated // in a ProjectRel at the top of the input. The match must also // occur in case of DELETE to ensure multi-engine plans are unequivocal. 
@@ -638,7 +639,7 @@ message WriteRel { // The removal of records from a table WRITE_OP_DELETE = 2; // The modification of existing records within a table - WRITE_OP_UPDATE = 3; + WRITE_OP_UPDATE = 3 [deprecated = true]; // The Creation of a new table, and the insert of new records in the table WRITE_OP_CTAS = 4; } @@ -664,6 +665,31 @@ message WriteRel { } } +// The operator that modifies the columns of a table +message UpdateRel { + oneof update_type { + NamedTable named_table = 1; + } + + NamedStruct table_schema = 2; // The full schema of the named_table + Expression condition = 3; // condition to be met for the update to be applied on a record + + // The list of transformations to apply to the columns of the named_table + repeated TransformExpression transformations = 4; + + message TransformExpression { + Expression transformation = 1; // the transformation to apply + int32 column_target = 2; // index of the column to apply the transformation to + } +} + +// A base table. The list of string is used to represent namespacing (e.g., mydb.mytable). +// This assumes shared catalog between systems exchanging a message. +message NamedTable { + repeated string names = 1; + substrait.extensions.AdvancedExtension advanced_extension = 10; +} + // Hash joins and merge joins are a specialization of the general join where the join // expression is an series of comparisons between fields that are ANDed together. The // behavior of this comparison is flexible diff --git a/site/docs/relations/logical_relations.md b/site/docs/relations/logical_relations.md index 41e7ae3ee..ac8a7c661 100644 --- a/site/docs/relations/logical_relations.md +++ b/site/docs/relations/logical_relations.md @@ -420,7 +420,7 @@ doing `ReferenceRel(0) JOIN D`. This allows to avoid the redundancy of `A JOIN B ## Write Operator -The write operator is an operator that consumes one input and writes it to storage. This can range from writing to a Parquet file, to INSERT/DELETE/UPDATE in a database. 
+The write operator is an operator that consumes one input and writes it to storage. This can range from writing to a Parquet file, to INSERT/DELETE in a database. | Signature | Value | | -------------------- |--------------------------------------------------------- | @@ -436,7 +436,7 @@ The write operator is an operator that consumes one input and writes it to stora |----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| | Write Type | Definition of which object we are operating on (e.g., a fully-qualified table name). | Required | | CTAS Schema | The names of all the columns and their type for a CREATE TABLE AS. | Required only for CTAS | -| Write Operator | Which type of operation we are performing (INSERT/DELETE/UPDATE/CTAS). | Required | +| Write Operator | Which type of operation we are performing (INSERT/DELETE/CTAS). | Required | | Rel Input | The Rel representing which records we will be operating on (e.g., VALUES for an INSERT, or which records to DELETE, or records and after-image of their values for UPDATE). | Required | | Create Mode | This determines what should happen if the table already exists (ERROR/REPLACE/IGNORE) | Required only for CTAS | | Output Mode | For views that modify a DB it is important to control which records to "return". Common default is NO_OUTPUT where we return nothing. Alternatively, we can return MODIFIED_RECORDS, that can be further manipulated by layering more rels ontop of this WriteRel (e.g., to "count how many records were updated"). 
This also allows to return the after-image of the change. To return before-image (or both) one can use the reference mechanisms and have multiple return values. | Required for VIEW CREATE/CREATE_OR_REPLACE/ALTER | @@ -474,6 +474,32 @@ Write definition types are built by the community and added to the specification | Format | Enumeration of available formats. Only current option is PARQUET. | Required | +## Update Operator + +The update operator is an operator that consumes one input and writes the column updates to a storage. + +| Signature | Value | +| -------------------- |---------------------------------------| +| Inputs | 1 | +| Outputs | 1 | +| Property Maintenance | Output is number of modified records | + +### Update Properties + +| Property | Description | Required | +|------------------------|--------------------------------------------------------------------------------------|--------------------------------------------------| +| Update Type | Definition of which object we are operating on (e.g., a fully-qualified table name). | Required | +| Table Schema | The names and types of all the columns of the input table | Required | +| Update Condition | The condition that must be met for a record to be updated. | Required | +| Update Transformations | The set of column updates to be applied to the table. | Required | + +=== "UpdateRel Message" + + ```proto +%%% proto.algebra.UpdateRel %%% + ``` + + ## DDL (Data Definition Language) Operator The operator that defines modifications of a database schema (CREATE/DROP/ALTER for TABLE and VIEWS). 
From c82300f3dbceb16459b34ae4b5f23d45798d5a2c Mon Sep 17 00:00:00 2001 From: Chandra Sanapala Date: Fri, 8 Nov 2024 14:27:11 +0530 Subject: [PATCH 2/2] review comments --- proto/substrait/algebra.proto | 4 ++-- site/docs/relations/logical_relations.md | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index b0eb1aeca..a3b9df7fb 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -620,7 +620,7 @@ message WriteRel { // The type of operation to perform WriteOp op = 4; - // The relation that determines the records to add/remove + // The relation that determines the records to add/remove/modify // the schema must match with table_schema. Default values must be explicitly stated // in a ProjectRel at the top of the input. The match must also // occur in case of DELETE to ensure multi-engine plans are unequivocal. @@ -639,7 +639,7 @@ message WriteRel { // The removal of records from a table WRITE_OP_DELETE = 2; // The modification of existing records within a table - WRITE_OP_UPDATE = 3 [deprecated = true]; + WRITE_OP_UPDATE = 3; // The Creation of a new table, and the insert of new records in the table WRITE_OP_CTAS = 4; } diff --git a/site/docs/relations/logical_relations.md b/site/docs/relations/logical_relations.md index ac8a7c661..9fd5d600f 100644 --- a/site/docs/relations/logical_relations.md +++ b/site/docs/relations/logical_relations.md @@ -420,7 +420,7 @@ doing `ReferenceRel(0) JOIN D`. This allows to avoid the redundancy of `A JOIN B ## Write Operator -The write operator is an operator that consumes one input and writes it to storage. This can range from writing to a Parquet file, to INSERT/DELETE in a database. +The write operator is an operator that consumes one input and writes it to storage. This can range from writing to a Parquet file, to INSERT/DELETE/UPDATE in a database. 
| Signature | Value | | -------------------- |--------------------------------------------------------- | @@ -436,7 +436,7 @@ The write operator is an operator that consumes one input and writes it to stora |----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| | Write Type | Definition of which object we are operating on (e.g., a fully-qualified table name). | Required | | CTAS Schema | The names of all the columns and their type for a CREATE TABLE AS. | Required only for CTAS | -| Write Operator | Which type of operation we are performing (INSERT/DELETE/CTAS). | Required | +| Write Operator | Which type of operation we are performing (INSERT/DELETE/UPDATE/CTAS). | Required | | Rel Input | The Rel representing which records we will be operating on (e.g., VALUES for an INSERT, or which records to DELETE, or records and after-image of their values for UPDATE). | Required | | Create Mode | This determines what should happen if the table already exists (ERROR/REPLACE/IGNORE) | Required only for CTAS | | Output Mode | For views that modify a DB it is important to control which records to "return". Common default is NO_OUTPUT where we return nothing. Alternatively, we can return MODIFIED_RECORDS, that can be further manipulated by layering more rels ontop of this WriteRel (e.g., to "count how many records were updated"). This also allows to return the after-image of the change. To return before-image (or both) one can use the reference mechanisms and have multiple return values. 
| Required for VIEW CREATE/CREATE_OR_REPLACE/ALTER | @@ -476,11 +476,11 @@ Write definition types are built by the community and added to the specification ## Update Operator -The update operator is an operator that consumes one input and writes the column updates to a storage. +The update operator applies a set of column transformations to a named table and writes the result to storage. | Signature | Value | | -------------------- |---------------------------------------| -| Inputs | 1 | +| Inputs | 0 | | Outputs | 1 | | Property Maintenance | Output is number of modified records |