-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VReplication: Address SwitchWrites bugs around replication lag and cancel on error #17616
base: main
Are you sure you want to change the base?
Changes from all commits
084ace8
fbf4334
4e8e109
9fb48f7
433d7ff
a7f400c
3964c7c
49c1105
a239dc6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -446,35 +446,11 @@ func (wf *workflowFetcher) scanWorkflow( | |
workflow.WorkflowSubType = res.WorkflowSubType.String() | ||
workflow.DeferSecondaryKeys = res.DeferSecondaryKeys | ||
|
||
// MaxVReplicationTransactionLag estimates the actual statement processing lag | ||
// between the source and the target. If we are still processing source events it | ||
// is the difference b/w current time and the timestamp of the last event. If | ||
// heartbeats are more recent than the last event, then the lag is the time since | ||
// the last heartbeat as there can be an actual event immediately after the | ||
// heartbeat, but which has not yet been processed on the target. | ||
// We don't allow switching during the copy phase, so in that case we just return | ||
// a large lag. All timestamps are in seconds since epoch. | ||
if rstream.TransactionTimestamp == nil { | ||
rstream.TransactionTimestamp = &vttimepb.Time{} | ||
} | ||
lastTransactionTime := rstream.TransactionTimestamp.Seconds | ||
if rstream.TimeHeartbeat == nil { | ||
rstream.TimeHeartbeat = &vttimepb.Time{} | ||
} | ||
lastHeartbeatTime := rstream.TimeHeartbeat.Seconds | ||
if stream.State == binlogdatapb.VReplicationWorkflowState_Copying.String() { | ||
meta.maxVReplicationTransactionLag = math.MaxInt64 | ||
} else { | ||
if lastTransactionTime == 0 /* no new events after copy */ || | ||
lastHeartbeatTime > lastTransactionTime /* no recent transactions, so all caught up */ { | ||
|
||
lastTransactionTime = lastHeartbeatTime | ||
} | ||
now := time.Now().Unix() /* seconds since epoch */ | ||
transactionReplicationLag := float64(now - lastTransactionTime) | ||
if transactionReplicationLag > meta.maxVReplicationTransactionLag { | ||
meta.maxVReplicationTransactionLag = transactionReplicationLag | ||
} | ||
// MaxVReplicationTransactionLag estimates the max statement processing lag | ||
// between the source and the target across all of the workflow streams. | ||
transactionReplicationLag := getVReplicationTrxLag(rstream.TransactionTimestamp, rstream.TimeUpdated, rstream.State) | ||
if transactionReplicationLag > meta.maxVReplicationTransactionLag { | ||
meta.maxVReplicationTransactionLag = transactionReplicationLag | ||
} | ||
} | ||
|
||
|
@@ -670,3 +646,32 @@ func getStreamState(stream *vtctldatapb.Workflow_Stream, rstream *tabletmanagerd | |
} | ||
return rstream.State.String() | ||
} | ||
|
||
// getVReplicationTrxLag estimates the statement processing lag, in seconds, | |
// between the source and the target for a single stream. | |
// | |
// trxTs is the timestamp of the last transaction processed and updatedTs is the | |
// last update time of the stream; a nil value for either is treated as a zero | |
// timestamp. state is the current workflow state of the stream. | |
// | |
// The result is computed as follows: | |
//   - Copying: we don't allow switching during the copy phase, so a very large | |
//     lag (math.MaxInt64, converted to float64) is returned. | |
//   - Running, and either no transaction has been recorded yet or the last | |
//     update is more recent than the last transaction: the lag is the time | |
//     since the last update, as the stream is considered caught up as of then. | |
//   - Otherwise (including non-running states such as an error state): the lag | |
//     is the time since the last transaction. With no recorded transaction | |
//     this is the number of seconds since the epoch. | |
// | |
// NOTE(review): earlier wording of this comment described the fallback in terms | |
// of heartbeats; the code compares against updatedTs instead. | |
// All timestamps are in seconds since epoch. | |
func getVReplicationTrxLag(trxTs, updatedTs *vttimepb.Time, state binlogdatapb.VReplicationWorkflowState) float64 { | |
if trxTs == nil { | |
trxTs = &vttimepb.Time{} | |
} | |
lastTransactionTime := trxTs.Seconds | |
if updatedTs == nil { | |
updatedTs = &vttimepb.Time{} | |
} | |
lastUpdateTime := updatedTs.Seconds | |
if state == binlogdatapb.VReplicationWorkflowState_Copying { | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe worth returning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole thing needs to be revisited, see the max_v_replication_transaction_lag note in the issue: #17620 |
||
return math.MaxInt64 | |
} | |
if state == binlogdatapb.VReplicationWorkflowState_Running && // We could be in the ERROR state | |
(lastTransactionTime == 0 /* No new events after copy */ || | |
lastUpdateTime > lastTransactionTime /* No recent transactions, so all caught up */) { | |
lastTransactionTime = lastUpdateTime | |
} | |
// The lag is the age, in whole seconds, of whichever timestamp was selected above. | |
now := time.Now().Unix() // Seconds since epoch | |
return float64(now - lastTransactionTime) | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's extract this into a separate method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It depends on the state there, so I don't really see the value in moving it out and passing all of those things in (source and target connections, queries, etc.), as it will still be dependent on the state set up in shardCustomer.