AliyunContainerService · xychu · Dec 30, 2020 · Dec 31, 2020 · Jan 5, 2021 · May 17, 2021
diff --git a/api/v1alpha1/trainingjob_types.go b/api/v1alpha1/trainingjob_types.go
@@ -49,6 +49,16 @@ type TrainingJobSpec struct {
 	// Defaults to 1.
 	// +optional
 	SlotsPerWorker *int32 `json:"slotsPerWorker,omitempty"`
+
+	// ScalePolicy defines whether to enable auto scale of workers if possible.
+	// +optional
+	ScalePolicy *string `json:"scalePolicy,omitempty"`
+	// AutoScaleIntervalSeconds defines the period of seconds between each auto scale.
+	// +optional
+	AutoScaleIntervalSeconds *int32 `json:"autoScaleIntervalSeconds,omitempty"`
+	// AutoScaleTimeoutSeconds defines the timeout seconds of auto scale.
+	// +optional
+	AutoScaleTimeoutSeconds *int32 `json:"autoScaleTimeoutSeconds,omitempty"`
 }
 
 type ETReplicaSpecs struct {

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/kai.alibabacloud.com_trainingjobs.yaml b/config/crd/bases/kai.alibabacloud.com_trainingjobs.yaml
@@ -43,6 +43,16 @@ spec:
           spec:
             description: TrainingJobSpec defines the desired state of TrainingJob
             properties:
+              autoScaleIntervalSeconds:
+                description: AutoScaleIntervalSeconds defines the period of seconds
+                  between each auto scale.
+                format: int32
+                type: integer
+              autoScaleTimeoutSeconds:
+                description: AutoScaleTimeoutSeconds defines the timeout seconds of
+                  auto scale.
+                format: int32
+                type: integer
               cleanPodPolicy:
                 description: CleanPodPolicy defines the policy that whether to kill
                   pods after the job completes. Defaults to None.
@@ -13132,6 +13142,10 @@ spec:
                 - worker
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
+              scalePolicy:
+                description: ScalePolicy defines whether to enable auto scale of workers
+                  if possible.
+                type: string
               launcherAttachMode:
                 description: Specifies the mode when launcher attach to workers. available
                   option is ssh / kubexec Defaults is kubexec.

diff --git a/pkg/controllers/status.go b/pkg/controllers/status.go
@@ -26,6 +26,8 @@ const (
 
 	// scalingInCreatedReason is added in a scalein when it is created.
 	scalingStartReason = "ScalingStart"
+	// autoScalingException is the event reason recorded when creating or deleting an auto scale-out fails.
+	autoScalingException = "TrainingJobAutoScaleFailed"
 )
 
 // initializeTrainingJobStatuses initializes the ReplicaStatuses for TrainingJob.

diff --git a/pkg/controllers/worker.go b/pkg/controllers/worker.go
@@ -3,6 +3,7 @@ package controllers
 import (
 	"context"
 	"fmt"
+	"time"
 
 	kaiv1alpha1 "github.com/AliyunContainerService/et-operator/api/v1alpha1"
 	logger "github.com/sirupsen/logrus"
@@ -56,6 +57,12 @@ func (r *TrainingJobReconciler) syncWorkersState(job *kaiv1alpha1.TrainingJob) e
 
 	r.workerReplicasStatus(job.GetJobStatus(), workers)
 
+	r.handleWorkersAutoScale(job, workers)
+	//err = r.handleWorkersAutoScale(job, workers)
+	//if err != nil {
+	//	return err
+	//}
+
 	err = r.handleWorkersFailed(job, workers)
 	if err != nil {
 		return err
@@ -78,6 +85,80 @@ func (r *TrainingJobReconciler) waitWorkersRunning(job *kaiv1alpha1.TrainingJob)
 	return nil
 }
 
+// handleWorkersAutoScale creates a ScaleOut that grows the job toward
+// Worker.MaxReplicas when Spec.ScalePolicy is "Auto". It does nothing when
+// MaxReplicas is unset, when Status.CurrentWorkers and pods differ in length,
+// or while a ScalingStart condition is younger than the auto scale interval.
+// The ScaleOut is deleted again by an in-process timer after that interval.
+// It returns the error from creating the ScaleOut, if any.
+func (r *TrainingJobReconciler) handleWorkersAutoScale(job *kaiv1alpha1.TrainingJob, pods []corev1.Pod) error {
+	if job.Spec.ScalePolicy == nil || *job.Spec.ScalePolicy != "Auto" {
+		logger.Info("no need to autoscale")
+		return nil
+	}
+	maxWorkers := job.Spec.ETReplicaSpecs.Worker.MaxReplicas
+	if maxWorkers == nil {
+		// No upper bound to scale toward; dereferencing it below would panic.
+		return nil
+	}
+	currentWorkers := job.Status.CurrentWorkers
+	if len(currentWorkers) != len(pods) {
+		logger.Info("no need to autoscale when currentworkers not equals to worker pods.")
+		return nil
+	}
+
+	// Defaults (in seconds) used when the spec leaves the fields unset.
+	var autoScaleTimeout int32 = 30
+	var autoScaleInterval int32 = 60
+	if job.Spec.AutoScaleIntervalSeconds != nil {
+		autoScaleInterval = *job.Spec.AutoScaleIntervalSeconds
+	}
+	if job.Spec.AutoScaleTimeoutSeconds != nil {
+		autoScaleTimeout = *job.Spec.AutoScaleTimeoutSeconds
+	}
+	interval := time.Duration(autoScaleInterval) * time.Second
+	scaleCount := *maxWorkers - int32(len(currentWorkers))
+
+	// A previous auto scale blocks a new one until its condition is older
+	// than the interval. If several conditions match, the last one wins.
+	hasScaling := false
+	scalingTimeout := false
+	for _, condition := range job.Status.Conditions {
+		if condition.Type == common.Scaling && condition.Reason == scalingStartReason {
+			hasScaling = true
+			scalingTimeout = condition.LastUpdateTime.Add(interval).Before(time.Now())
+		}
+	}
+	if scaleCount <= 0 || (hasScaling && !scalingTimeout) {
+		return nil
+	}
+
+	scaleOut := kaiv1alpha1.ScaleOut{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      job.Name + "-autoscaleout",
+			Namespace: job.Namespace,
+		},
+		Spec: kaiv1alpha1.ScaleOutSpec{
+			Selector: kaiv1alpha1.Selector{Name: job.Name},
+			ScaleScriptSpec: kaiv1alpha1.ScaleScriptSpec{
+				Timeout: &autoScaleTimeout,
+			},
+			ToAdd: &kaiv1alpha1.ToAddSpec{Count: &scaleCount},
+		},
+	}
+
+	msg := fmt.Sprintf("trainingjob(%s/%s): create autoscaleout %+v", job.Namespace, job.Name, scaleOut.Spec)
+	// msg is already formatted; do not pass it as a format string.
+	logger.Info(msg)
+	if err := r.Client.Create(context.Background(), &scaleOut); err != nil {
+		failMsg := fmt.Sprintf("trainingjob(%s/%s): failed to create autoscale %+v, err: %v", job.Namespace, job.Name, scaleOut.Spec, err)
+		r.recorder.Event(job, corev1.EventTypeWarning, autoScalingException, failMsg)
+		logger.Info(failMsg)
+		return err
+	}
+	// Delete the ScaleOut once the interval has elapsed so that the fixed
+	// name can be reused by the next auto scale.
+	// NOTE(review): this timer lives only in this process. If the operator
+	// restarts before it fires, the ScaleOut is presumably never deleted and
+	// later Create calls with the same name would fail — confirm and consider
+	// cleaning up a stale object before creating.
+	time.AfterFunc(interval, func() {
+		if err := r.Client.Delete(context.Background(), &scaleOut); err != nil {
+			failMsg := fmt.Sprintf("trainingjob(%s/%s): failed to delete autoscale %+v, err: %v", job.Namespace, job.Name, scaleOut, err)
+			r.recorder.Event(job, corev1.EventTypeWarning, autoScalingException, failMsg)
+			logger.Info(failMsg)
+			// Do not fall through to the success log below.
+			return
+		}
+		logger.Infof("trainingjob(%s/%s): delete autoscaleout %+v", job.Namespace, job.Name, scaleOut.Spec)
+	})
+
+	r.recorder.Event(job, corev1.EventTypeNormal, scalingStartReason, msg)
+	updateJobConditions(job.GetJobStatus(), common.Scaling, scalingStartReason, msg)
+	return nil
+}
+
 func (r *TrainingJobReconciler) handleWorkersFailed(job *kaiv1alpha1.TrainingJob, pods []corev1.Pod) error {
 	lastRunningPods := map[string]bool{}
 	for _, worker := range job.Status.CurrentWorkers {