From c20971357fb06bf33bd5ea7dfe2ab3ef6338ecba Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Fri, 8 Nov 2024 16:54:38 -0800 Subject: [PATCH 1/2] =?UTF-8?q?azure:=20don=E2=80=99t=20eagerly=20update?= =?UTF-8?q?=20vmss=20cache=20before=20delete=20success?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jack Francis --- .../cloudprovider/azure/azure_scale_set.go | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index ad109425a178..489c555c4e15 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -530,21 +530,6 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered return rerr.Error() } - // Proactively decrement scale set size so that we don't - // go below minimum node count if cache data is stale - // only do it for non-unregistered nodes - if !hasUnregisteredNodes { - scaleSet.sizeMutex.Lock() - scaleSet.curSize -= int64(len(instanceIDs)) - scaleSet.lastSizeRefresh = time.Now() - scaleSet.sizeMutex.Unlock() - } - - // Proactively set the status of the instances to be deleted in cache - for _, instance := range instancesToDelete { - scaleSet.setInstanceStatusByProviderID(instance.Name, cloudprovider.InstanceStatus{State: cloudprovider.InstanceDeleting}) - } - go scaleSet.waitForDeleteInstances(future, requiredIds) return nil } @@ -558,11 +543,9 @@ func (scaleSet *ScaleSet) waitForDeleteInstances(future *azure.Future, requiredI isSuccess, err := isSuccessHTTPResponse(httpResponse, err) if isSuccess { klog.V(3).Infof(".WaitForDeleteInstancesResult(%v) for %s success", requiredIds.InstanceIds, scaleSet.Name) - // No need to invalidateInstanceCache because instanceStates were proactively set to "deleting" + scaleSet.invalidateInstanceCache() return } - // On failure, invalidate the instanceCache - cannot have instances in deletingState - scaleSet.invalidateInstanceCache() klog.Errorf("WaitForDeleteInstancesResult(%v) for %s failed with error: %v", requiredIds.InstanceIds, scaleSet.Name, err) } From 1e5ed185d7b9b789b3a3bf4080e2e6e384cc50c0 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Sun, 10 Nov 2024 20:47:22 -0800 Subject: [PATCH 2/2] restore original behavior Signed-off-by: Jack Francis --- .../cloudprovider/azure/azure_config.go | 7 +++++ .../cloudprovider/azure/azure_scale_set.go | 29 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 29e91999db73..3b98c7286468 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -94,6 +94,9 @@ type Config struct { // (DEPRECATED, DO NOT USE) GetVmssSizeRefreshPeriod (seconds) defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance GetVmssSizeRefreshPeriod int `json:"getVmssSizeRefreshPeriod,omitempty" yaml:"getVmssSizeRefreshPeriod,omitempty"` + + // StrictCacheUpdates updates cache values only after positive validation from Azure APIs + StrictCacheUpdates bool `json:"strictCacheUpdates,omitempty" yaml:"strictCacheUpdates,omitempty"` } // These are only here for backward compabitility. Their equivalent exists in providerazure.Config with a different name. @@ -122,6 +125,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg.CloudProviderBackoffJitter = providerazureconsts.BackoffJitterDefault cfg.VMType = providerazureconsts.VMTypeVMSS cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount) + cfg.StrictCacheUpdates = false // Config file overrides defaults if configReader != nil { @@ -247,6 +251,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { if _, err = assignBoolFromEnvIfExists(&cfg.EnableForceDelete, "AZURE_ENABLE_FORCE_DELETE"); err != nil { return nil, err } + if _, err = assignBoolFromEnvIfExists(&cfg.StrictCacheUpdates, "AZURE_STRICT_CACHE_UPDATES"); err != nil { + return nil, err + } if _, err = assignBoolFromEnvIfExists(&cfg.EnableDynamicInstanceList, "AZURE_ENABLE_DYNAMIC_INSTANCE_LIST"); err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 489c555c4e15..c4399ea45b37 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -530,6 +530,24 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered return rerr.Error() } + if !scaleSet.manager.config.StrictCacheUpdates { + // Proactively decrement scale set size so that we don't + // go below minimum node count if cache data is stale + // only do it for non-unregistered nodes + + if !hasUnregisteredNodes { + scaleSet.sizeMutex.Lock() + scaleSet.curSize -= int64(len(instanceIDs)) + scaleSet.lastSizeRefresh = time.Now() + scaleSet.sizeMutex.Unlock() + } + + // Proactively set the status of the instances to be deleted in cache + for _, instance := range instancesToDelete { + scaleSet.setInstanceStatusByProviderID(instance.Name, cloudprovider.InstanceStatus{State: cloudprovider.InstanceDeleting}) + } + } + go scaleSet.waitForDeleteInstances(future, requiredIds) return nil } @@ -543,9 +561,18 @@ func (scaleSet *ScaleSet) waitForDeleteInstances(future *azure.Future, requiredI isSuccess, err := isSuccessHTTPResponse(httpResponse, err) if isSuccess { klog.V(3).Infof(".WaitForDeleteInstancesResult(%v) for %s success", requiredIds.InstanceIds, scaleSet.Name) - scaleSet.invalidateInstanceCache() + if scaleSet.manager.config.StrictCacheUpdates { + if err := scaleSet.manager.forceRefresh(); err != nil { + klog.Errorf("forceRefresh failed with error: %v", err) + } + scaleSet.invalidateInstanceCache() + } return } + if !scaleSet.manager.config.StrictCacheUpdates { + // On failure, invalidate the instanceCache - cannot have instances in deletingState + scaleSet.invalidateInstanceCache() + } klog.Errorf("WaitForDeleteInstancesResult(%v) for %s failed with error: %v", requiredIds.InstanceIds, scaleSet.Name, err) }