Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VReplication: Properly support cancel and delete for multi-tenant MoveTables #16906

Merged
merged 34 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
d1a7412
Properly support cancel for multi-tenant movetables
mattlord Oct 7, 2024
435ff92
Minor changes from review
mattlord Oct 7, 2024
6064bad
Cleanup errors
mattlord Oct 7, 2024
d7e86ec
Move work to DeleteTenantData tabletmanager RPC
mattlord Oct 7, 2024
68d28b9
Start working on tests
mattlord Oct 7, 2024
8a20834
Add unit test
mattlord Oct 7, 2024
db2d963
Add to unit tests
mattlord Oct 8, 2024
c9f57eb
Merge remote-tracking branch 'origin/main' into multi_tenant_movetabl…
mattlord Oct 8, 2024
fcb5c6d
Add batch size client flag for cancel and delete
mattlord Oct 8, 2024
4513e22
Add new flag to cli test
mattlord Oct 8, 2024
59aeb3d
Changes from self review
mattlord Oct 8, 2024
77e0e13
Move more work back to workflow server and make RPC more generic
mattlord Oct 9, 2024
19135ec
Add vtctld size unit test cases and fix logic
mattlord Oct 9, 2024
4fcd54d
Remove unintentional changes
mattlord Oct 9, 2024
565b686
Return progress upon error
mattlord Oct 9, 2024
eda4337
Remove context deadline progress handling
mattlord Oct 10, 2024
5435503
Changes from self review
mattlord Oct 10, 2024
9be4ae8
Improve timeout error client message
mattlord Oct 10, 2024
e7374c8
Address review comments
mattlord Oct 15, 2024
a993ce2
Address review comment part II
mattlord Oct 15, 2024
de44c6b
Improve progress log msg
mattlord Oct 15, 2024
6a4c79e
Unify delete/cancel cleanup order
mattlord Oct 15, 2024
4a4c17e
Merge remote-tracking branch 'origin/main' into multi_tenant_movetabl…
mattlord Oct 15, 2024
76131b8
Adjust log msgs since function is not multi-tenant specific
mattlord Oct 15, 2024
76350bd
Make progress logging more intelligent/dynamic
mattlord Oct 15, 2024
5c89bc6
Move workflow locks up one level and hold it during larger operation
mattlord Oct 16, 2024
2ea5e5b
We don't need to lock the keyspace while deleting tenant rows
mattlord Oct 16, 2024
aa567d7
Standardize lock workflow errors
mattlord Oct 16, 2024
18db131
Merge remote-tracking branch 'origin/main' into multi_tenant_movetabl…
mattlord Oct 22, 2024
da401a5
Address review comments
mattlord Oct 22, 2024
ebec349
Fix unit tests after merging in main
mattlord Oct 22, 2024
f31adc9
Implement my TODO
mattlord Oct 22, 2024
399552e
Nitty var name change
mattlord Oct 22, 2024
87d4f68
Comment change from self review of review feedback commits
mattlord Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions go/cmd/vtctldclient/command/vreplication/common/cancel.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"sort"

"github.com/spf13/cobra"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"vitess.io/vitess/go/cmd/vtctldclient/cli"

Expand All @@ -31,6 +33,7 @@ var CancelOptions = struct {
KeepData bool
KeepRoutingRules bool
Shards []string
DeleteBatchSize int64
}{}

func GetCancelCommand(opts *SubCommandsOpts) *cobra.Command {
Expand Down Expand Up @@ -60,9 +63,13 @@ func commandCancel(cmd *cobra.Command, args []string) error {
KeepData: CancelOptions.KeepData,
KeepRoutingRules: CancelOptions.KeepRoutingRules,
Shards: CancelOptions.Shards,
DeleteBatchSize: CancelOptions.DeleteBatchSize,
}
resp, err := GetClient().WorkflowDelete(GetCommandCtx(), req)
if err != nil {
if grpcerr, ok := status.FromError(err); ok && (grpcerr.Code() == codes.DeadlineExceeded) {
return fmt.Errorf("Cancel action timed out. Please try again and the work will pick back up where it left off. Note that you can control the timeout using the --action_timeout flag and the delete batch size with --delete-batch-size.")
}
return err
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ import (
vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata"
)

// The default batch size to use when deleting tenant data
// if a multi-tenant migration is canceled or deleted.
const DefaultDeleteBatchSize = 1000

var (
// base is the base command for all actions related to MoveTables.
base = &cobra.Command{
Expand Down Expand Up @@ -108,6 +112,7 @@ func registerCommands(root *cobra.Command) {
cancel := common.GetCancelCommand(opts)
cancel.Flags().BoolVar(&common.CancelOptions.KeepData, "keep-data", false, "Keep the partially copied table data from the MoveTables workflow in the target keyspace.")
cancel.Flags().BoolVar(&common.CancelOptions.KeepRoutingRules, "keep-routing-rules", false, "Keep the routing rules created for the MoveTables workflow.")
cancel.Flags().Int64Var(&common.CancelOptions.DeleteBatchSize, "delete-batch-size", DefaultDeleteBatchSize, "When cleaning up the migrated data in tables moved as part of a mult-tenant workflow, delete the records in batches of this size.")
common.AddShardSubsetFlag(cancel, &common.CancelOptions.Shards)
base.AddCommand(cancel)
}
Expand Down
2 changes: 2 additions & 0 deletions go/cmd/vtctldclient/command/vreplication/workflow/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ var (
deleteOptions = struct {
KeepData bool
KeepRoutingRules bool
DeleteBatchSize int64
}{}

// delete makes a WorkflowDelete gRPC call to a vtctld.
Expand All @@ -55,6 +56,7 @@ func commandDelete(cmd *cobra.Command, args []string) error {
KeepData: deleteOptions.KeepData,
KeepRoutingRules: deleteOptions.KeepRoutingRules,
Shards: baseOptions.Shards,
DeleteBatchSize: deleteOptions.DeleteBatchSize,
}
resp, err := common.GetClient().WorkflowDelete(common.GetCommandCtx(), req)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/spf13/cobra"

"vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/common"
"vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/movetables"
"vitess.io/vitess/go/vt/topo/topoproto"
)

Expand Down Expand Up @@ -60,6 +61,7 @@ func registerCommands(root *cobra.Command) {
delete.MarkFlagRequired("workflow")
delete.Flags().BoolVar(&deleteOptions.KeepData, "keep-data", false, "Keep the partially copied table data from the workflow in the target keyspace.")
delete.Flags().BoolVar(&deleteOptions.KeepRoutingRules, "keep-routing-rules", false, "Keep the routing rules created for the workflow.")
delete.Flags().Int64Var(&deleteOptions.DeleteBatchSize, "delete-batch-size", movetables.DefaultDeleteBatchSize, "The batch size to use when deleting a subset of data from the migrated tables. This is only used with multi-tenant MoveTables workflows.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this description not be consistent with that given for the same flag in moveables.go for the Cancel sub-command?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch! However, this is a bit of an exception in that the context is different for MoveTables cancel and Workflow delete. It was intentional:

go/cmd/vtctldclient/command/vreplication/movetables/movetables.go:      cancel.Flags().Int64Var(&common.CancelOptions.DeleteBatchSize, "delete-batch-size", DefaultDeleteBatchSize, "When cleaning up the migrated data in tables moved as part of a mult-tenant workflow, delete the records in batches of this size.")
go/cmd/vtctldclient/command/vreplication/workflow/workflow.go:  delete.Flags().Int64Var(&deleteOptions.DeleteBatchSize, "delete-batch-size", movetables.DefaultDeleteBatchSize, "The batch size to use when deleting a subset of data from the migrated tables. This is only used with multi-tenant MoveTables workflows.")

But I'm good tweaking them if you have any suggestions or preference?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how they are different. As long as they are both consistent, either way of wording the flag help is fine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, this is somewhat misleading

The batch size to use when deleting a subset of data from the migrated tables

It makes it sound as if there's some general capability to delete a subset of data from migrated tables. So the other version is preferable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need them to be exactly the same? That would be a little odd as MoveTables cancel is obviously for MoveTables. What about:

go/cmd/vtctldclient/command/vreplication/movetables/movetables.go:      cancel.Flags().Int64Var(&common.CancelOptions.DeleteBatchSize, "delete-batch-size", DefaultDeleteBatchSize, "When cleaning up the migrated data in tables moved as part of a multi-tenant workflow, delete the records in batches of this size.")
go/cmd/vtctldclient/command/vreplication/workflow/workflow.go:  delete.Flags().Int64Var(&deleteOptions.DeleteBatchSize, "delete-batch-size", movetables.DefaultDeleteBatchSize, "When cleaning up the migrated data in tables moved as part of a multi-tenant MoveTables workflow, delete the records in batches of this size.")

The only difference being we note that it's for a multi-tenant MoveTables workflow in the Workflow delete help output.

And I just realized a misspelling in there mult-tenant that I corrected above. :)

common.AddShardSubsetFlag(delete, &baseOptions.Shards)
base.AddCommand(delete)

Expand Down
24 changes: 19 additions & 5 deletions go/test/endtoend/vreplication/multi_tenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,26 @@ func TestMultiTenantSimple(t *testing.T) {

require.Zero(t, len(getKeyspaceRoutingRules(t, vc).Rules))

mt.Create()
confirmKeyspacesRoutedTo(t, sourceKeyspace, "s1", "t1", nil)
validateKeyspaceRoutingRules(t, vc, initialRules)
createFunc := func() {
mt.Create()
confirmKeyspacesRoutedTo(t, sourceKeyspace, "s1", "t1", nil)
validateKeyspaceRoutingRules(t, vc, initialRules)

lastIndex = insertRows(lastIndex, sourceKeyspace)
waitForWorkflowState(t, vc, fmt.Sprintf("%s.%s", targetKeyspace, mt.workflowName), binlogdatapb.VReplicationWorkflowState_Running.String())
lastIndex = insertRows(lastIndex, sourceKeyspace)
waitForWorkflowState(t, vc, fmt.Sprintf("%s.%s", targetKeyspace, mt.workflowName), binlogdatapb.VReplicationWorkflowState_Running.String())
}

t.Run("cancel", func(t *testing.T) {
// First let's test canceling the workflow to ensure that it properly
// cleans up all of the data.
createFunc()
mt.Cancel()
rowCount := getRowCount(t, vtgateConn, fmt.Sprintf("%s.%s", targetKeyspace, "t1"))
require.Zero(t, rowCount)
})

// Create again and run it to completion.
createFunc()

vdiff(t, targetKeyspace, workflowName, defaultCellName, false, true, nil)
mt.SwitchReads()
Expand Down
3 changes: 2 additions & 1 deletion go/test/endtoend/vreplication/wrappers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ func (v VtctldMoveTables) SwitchWrites() {
}

func (v VtctldMoveTables) Cancel() {
v.exec("Cancel")
args := []string{"Cancel", "--delete-batch-size=500"}
v.exec(args...)
}

func (v VtctldMoveTables) Complete() {
Expand Down
Loading
Loading