Skip to content

Commit

Permalink
CDPCP-12129 - introduce failure tolerance during resource deletion
Browse files Browse the repository at this point in the history
  • Loading branch information
gregito committed Aug 21, 2024
1 parent e2251c7 commit a310aa1
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 3 deletions.
2 changes: 1 addition & 1 deletion resources/datahub/polling.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func describeWithRecover(clusterName string, client *client.Datahub, ctx context
resp, err := client.Operations.DescribeCluster(operations.NewDescribeClusterParamsWithContext(ctx).WithInput(&datahubmodels.DescribeClusterRequest{ClusterName: &clusterName}))
for i := 0; i < internalServerErrorRetryQuantity; i++ {
if err != nil {
if isInternalServerError(err) {
if isInternalServerError(err) || isTimeoutError(err) {
tflog.Debug(ctx, fmt.Sprintf("Cluster describe came back with internal server error. "+
"About to (#%d.) re-attempt to describe cluster '%s'.", i+1, clusterName))
resp, err = client.Operations.DescribeCluster(operations.NewDescribeClusterParamsWithContext(ctx).WithInput(&datahubmodels.DescribeClusterRequest{ClusterName: &clusterName}))
Expand Down
7 changes: 7 additions & 0 deletions resources/datahub/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,10 @@ func isInternalServerError(err error) bool {
}
return false
}

// isTimeoutError reports whether err is a *operations.DescribeClusterDefault
// response carrying a non-nil payload whose Code equals "TIMEOUT".
// Any other error value (including nil) yields false.
func isTimeoutError(err error) bool {
	describeErr, matched := err.(*operations.DescribeClusterDefault)
	if !matched {
		return false
	}
	payload := describeErr.GetPayload()
	if payload == nil {
		// No payload means there is no error code to inspect.
		return false
	}
	return payload.Code == "TIMEOUT"
}
10 changes: 10 additions & 0 deletions resources/datalake/resource_aws_datalake.go
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,11 @@ func waitForDatalakeToBeDeleted(ctx context.Context, datalakeName string, fallba
if err != nil {
return err
}
failureThreshold, failureThresholdErr := utils.CalculateCallFailureThresholdOrDefault(ctx, options, callFailureThreshold)
if failureThresholdErr != nil {
return failureThresholdErr
}
callFailedCount := 0
stateConf := &retry.StateChangeConf{
Pending: []string{"DELETE_REQUESTED", "STACK_DELETION_IN_PROGRESS", "STACK_DELETED", "EXTERNAL_DATABASE_DELETION_IN_PROGRESS", "DELETED"},
Target: []string{},
Expand All @@ -376,6 +381,11 @@ func waitForDatalakeToBeDeleted(ctx context.Context, datalakeName string, fallba
return nil, "", nil
}
}
callFailedCount++
if callFailedCount <= failureThreshold {
tflog.Warn(ctx, fmt.Sprintf("Error describing datalake with call failure due to [%s] but threshold limit is not reached yet (%d out of %d).", err.Error(), callFailedCount, callFailureThreshold))
return nil, "", nil
}
return nil, "", err
}
if resp.GetPayload().Datalake == nil {
Expand Down
2 changes: 1 addition & 1 deletion resources/environments/environment_action_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func deleteEnvironmentWithDiagnosticHandle(environmentName string, ctx context.C
if pollingOptions != nil && pollingOptions.Async.ValueBool() {
return nil
}
err = waitForEnvironmentToBeDeleted(environmentName, timeoutOneHour, client.Environments, ctx, pollingOptions)
err = waitForEnvironmentToBeDeleted(environmentName, timeoutOneHour, callFailureThreshold, client.Environments, ctx, pollingOptions)
if err != nil {
utils.AddEnvironmentDiagnosticsError(err, &resp.Diagnostics, "delete Environment")
return err
Expand Down
13 changes: 12 additions & 1 deletion resources/environments/polling.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,13 @@ import (
"github.com/cloudera/terraform-provider-cdp/utils"
)

func waitForEnvironmentToBeDeleted(environmentName string, fallbackTimeout time.Duration, client *client.Environments, ctx context.Context, options *utils.PollingOptions) error {
func waitForEnvironmentToBeDeleted(environmentName string, fallbackTimeout time.Duration, callFailureThresholdDefault int, client *client.Environments, ctx context.Context, options *utils.PollingOptions) error {
timeout, err := utils.CalculateTimeoutOrDefault(ctx, options, fallbackTimeout)
callFailureThreshold, failureThresholdError := utils.CalculateCallFailureThresholdOrDefault(ctx, options, callFailureThresholdDefault)
if failureThresholdError != nil {
return failureThresholdError
}
callFailedCount := 0
if err != nil {
return err
}
Expand Down Expand Up @@ -66,6 +71,12 @@ func waitForEnvironmentToBeDeleted(environmentName string, fallbackTimeout time.
return nil, "", nil
}
}
callFailedCount++
if callFailedCount <= callFailureThreshold {
tflog.Warn(ctx, fmt.Sprintf("Error describing environment with call failure due to [%s] but threshold limit is not reached yet (%d out of %d).", err.Error(), callFailedCount, callFailureThreshold))
return nil, "", nil
}
tflog.Error(ctx, fmt.Sprintf("Error describing environment (due to: %s) and call failure threshold limit exceeded.", err))
return nil, "", err
}
if resp.GetPayload().Environment == nil {
Expand Down

0 comments on commit a310aa1

Please sign in to comment.