Skip to content
This repository has been archived by the owner on Oct 23, 2024. It is now read-only.

Commit

Permalink
[DCOS-54275] Add backoff delay for failed/error tasks (#3120)
Browse files Browse the repository at this point in the history
* Code level inspection of what's needed for Task Backoff.

* suppress when everything is delayed

* Add a sanityintegration test for backoff delay

* Enable backoff for hw by default and run tests

* Remove subclass instanceof checks in AbstractStep class

* Add more filtering on essential vs non essential tasks for recovery plan


Co-authored-by: Kaiwalya Joshi <[email protected]>
  • Loading branch information
takirala and kaiwalyajoshi authored Jul 16, 2019
1 parent 1fadd2e commit 2e8c448
Show file tree
Hide file tree
Showing 62 changed files with 969 additions and 346 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.mesosphere.sdk.cassandra.scheduler;

import com.mesosphere.sdk.offer.CommonIdUtils;
import com.mesosphere.sdk.scheduler.plan.DefaultPhase;
import com.mesosphere.sdk.scheduler.plan.DefaultPodInstance;
import com.mesosphere.sdk.scheduler.plan.Phase;
Expand Down Expand Up @@ -85,7 +86,7 @@ private Phase getNodeRecoveryPhase(Plan inputPlan, int index) {
// Get IP address for the pre-existing node.

Optional<Protos.TaskStatus> status = StateStoreUtils.getTaskStatusFromProperty(
stateStore, TaskSpec.getInstanceName(podInstance, taskSpec));
stateStore, CommonIdUtils.getTaskInstanceName(podInstance, taskSpec));
if (!status.isPresent()) {
logger.error("No previously stored TaskStatus to pull IP address from in Cassandra recovery");
return null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.mesosphere.sdk.cassandra.scheduler;

import com.mesosphere.sdk.scheduler.plan.DefaultPodInstance;
import com.mesosphere.sdk.scheduler.plan.DefaultStepFactory;
import com.mesosphere.sdk.scheduler.plan.Phase;
import com.mesosphere.sdk.scheduler.plan.Plan;
import com.mesosphere.sdk.scheduler.plan.PodInstanceRequirement;
Expand Down Expand Up @@ -136,7 +137,7 @@ private PodInstanceRequirement getPodInstanceRequirement(int nodeIndex, Recovery
}

private Plan getReplacePlan(ConfigStore<ServiceSpec> configStore) throws Exception {
return new PlanGenerator(configStore, stateStore, Optional.empty())
return new PlanGenerator(new DefaultStepFactory(configStore, stateStore, Optional.empty()))
.generate(rawSpec.getPlans().get("replace"), "replace", serviceSpec.getPods());
}
}
4 changes: 3 additions & 1 deletion frameworks/helloworld/src/main/dist/crash-loop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ pods:
tasks:
server:
goal: RUNNING
cmd: echo crash-loop && sleep 60 && exit 1
cmd: "echo crash-loop && sleep $SLEEP_DURATION && exit 1"
cpus: {{HELLO_CPUS}}
memory: {{HELLO_MEM}}
env:
SLEEP_DURATION: {{SLEEP_DURATION}}
readiness-check:
cmd: exit 1
interval: 5
Expand Down
71 changes: 71 additions & 0 deletions frameworks/helloworld/tests/test_backoff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
import pytest

import sdk_install
import sdk_metrics
import sdk_plan
import sdk_utils
from tests import config

log = logging.getLogger(__name__)
foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)


@pytest.fixture(scope="function", autouse=True)
def configure_package(configure_security):
try:
sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
yield
finally:
sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)


back_off_crash_loop_options = {
"service": {"yaml": "crash-loop", "sleep": 60, "task_failure_backoff": {"enabled": True}}
}


@pytest.mark.sanity
def test_default_plan_backoff():
sdk_install.install(
config.PACKAGE_NAME,
foldered_name,
expected_running_tasks=0,
additional_options=back_off_crash_loop_options,
wait_for_deployment=False,
wait_for_all_conditions=False,
)
# State transition should be STARTING -> STARTED -> DELAYED in a loop.
# As STARTING lasts for a very short duration, we test the switch between other two states.
check_delayed_and_suppressed("deploy")
sdk_plan.wait_for_plan_status(foldered_name, "deploy", "STARTED")
check_delayed_and_suppressed("deploy")
# We can't make further progress, this is the end of the test.


@pytest.mark.sanity
def test_recovery_backoff():
sdk_install.install(
config.PACKAGE_NAME,
foldered_name,
expected_running_tasks=0,
additional_options=back_off_crash_loop_options,
wait_for_deployment=False,
wait_for_all_conditions=False,
)
check_delayed_and_suppressed("deploy")
sdk_plan.force_complete_step(foldered_name, "deploy", "crash", "hello-0:[server]")
# Deploy plan is complete. Recovery plan should take over. Recovery plan is in COMPLETE
# by default, it should go from STARTED -> DELAYED.
sdk_plan.wait_for_plan_status(foldered_name, "recovery", "STARTED")
check_delayed_and_suppressed("recovery")
sdk_plan.wait_for_plan_status(foldered_name, "recovery", "STARTED")


def check_delayed_and_suppressed(plan_name: str):
sdk_plan.wait_for_plan_status(foldered_name, plan_name, "DELAYED")
sdk_metrics.wait_for_scheduler_gauge_value(
foldered_name,
"is_suppressed",
lambda result: isinstance(result, bool) and bool(result), # Should be set to true.
)
8 changes: 7 additions & 1 deletion frameworks/helloworld/tests/test_fast_failure.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@ def test_finish_install_on_failure():
foldered_name,
1,
additional_options={
"service": {"name": foldered_name, "yaml": "non_recoverable_state"}
"service": {
"name": foldered_name,
"yaml": "non_recoverable_state",
"task_failure_backoff": {
"enabled": False # crash loop very fast without any delay
},
}
},
)
sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
29 changes: 29 additions & 0 deletions frameworks/helloworld/universe/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,35 @@
"enum": [0, 1],
"default": 1
},
"task_failure_backoff": {
"description": "Configuration for back off of failed/error tasks",
"type": "object",
"properties" : {
"enabled": {
"description": "Enable backoff delay",
"type": "boolean",
"default": true
},
"backoff_factor": {
"description": "The rate of advancing the backoff delay on a failing task",
"type": "number",
"default": 1.15
},
"initial_backoff": {
"description": "Initial backoff delay for a task that failed for the first time (in seconds)",
"type": "integer",
"default": 60
},
"max_launch_delay": {
"description": "Maximum launch delay for any task (in seconds)",
"type": "integer",
"default": 300
}
},
"required": [
"enabled"
]
},
"check" : {
"description": "Checks are a way to understand the health of a service. These HTTP checks are intended to be consumed by services external to Mesos/Marathon which will take appropriate action on them.",
"type": "object",
Expand Down
6 changes: 6 additions & 0 deletions frameworks/helloworld/universe/marathon.json.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@
"MESOS_AUTHENTICATEE": "com_mesosphere_dcos_ClassicRPCAuthenticatee",
"MESOS_HTTP_AUTHENTICATEE": "com_mesosphere_dcos_http_Authenticatee",
{{/service.service_account_secret}}
{{#service.task_failure_backoff.enabled}}
"ENABLE_BACKOFF": "{{service.task_failure_backoff.enabled}}",
"FRAMEWORK_BACKOFF_FACTOR": "{{service.task_failure_backoff.backoff_factor}}",
"FRAMEWORK_INITIAL_BACKOFF": "{{service.task_failure_backoff.initial_backoff}}",
"FRAMEWORK_MAX_LAUNCH_DELAY": "{{service.task_failure_backoff.max_launch_delay}}",
{{/service.task_failure_backoff.enabled}}
{{#hello.secret1}}
"HELLO_SECRET1" : "{{hello.secret1}}",
{{/hello.secret1}}
Expand Down
5 changes: 1 addition & 4 deletions gradle/checkstyle/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -435,10 +435,7 @@ page at http://checkstyle.sourceforge.net/config.html -->
<property name="max" value="42"/>
</module>
<module name="LineLength">
<property name="max" value="100"/>
<!--TODO(takirala): We should not ignore line length for javadoc & comments-->
<property name="ignorePattern"
value="^(package .*;\s*)|(import .*;\s*)|( *\* *https?://.*)|(@.*)|(\* .*)|(\/\/ .*)$"/>
<property name="max" value="120"/>
</module>
<module name="MethodCount">
<property name="maxPublic" value="16"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.mesosphere.sdk.http.ResponseUtils;
import com.mesosphere.sdk.offer.Constants;
import com.mesosphere.sdk.scheduler.plan.Element;
import com.mesosphere.sdk.scheduler.plan.ParentElement;
import com.mesosphere.sdk.scheduler.plan.Phase;
import com.mesosphere.sdk.scheduler.plan.Plan;
import com.mesosphere.sdk.scheduler.plan.PlanCoordinator;
Expand Down Expand Up @@ -255,7 +257,7 @@ private SerializePlan generatePlanServiceStatus(Plan plan,

int completedSteps = plan.getChildren().stream()
.flatMap(phase -> phase.getChildren().stream())
.filter(step -> step.isComplete())
.filter(Element::isComplete)
.collect(Collectors.toSet())
.size();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.mesosphere.sdk.debug;

import com.mesosphere.sdk.http.ResponseUtils;
import com.mesosphere.sdk.offer.CommonIdUtils;
import com.mesosphere.sdk.scheduler.plan.Phase;
import com.mesosphere.sdk.scheduler.plan.Plan;
import com.mesosphere.sdk.scheduler.plan.PlanCoordinator;
Expand Down Expand Up @@ -70,7 +71,7 @@ public List<PlanResponse> getTaskStatuses(String filterPlan,
.findFirst()
.get();
TaskStatusResponse taskStatusObject = new TaskStatusResponse();
String taskInstanceName = TaskSpec.getInstanceName(
String taskInstanceName = CommonIdUtils.getTaskInstanceName(
step.getPodInstanceRequirement().get().getPodInstance(),
taskSpec.getName()
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ private void processQueuedOffers(Duration queueWait) {
synchronized (inProgressLock) {
offersInProgress.removeAll(
offers.stream()
.map(offer -> offer.getId())
.map(Protos.Offer::getId)
.collect(Collectors.toList()));
if (!offers.isEmpty()) {
LOGGER.info("Processed {} queued offer{}. {} {} in progress: {}",
Expand Down
Loading

0 comments on commit 2e8c448

Please sign in to comment.