-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix silent failures in dispatch loop from stalling the pipeline #32922
base: master
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,15 +23,17 @@ | |
import com.google.auto.value.AutoOneOf; | ||
import java.util.Collections; | ||
import java.util.Optional; | ||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.Future; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.function.Function; | ||
import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; | ||
import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; | ||
import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; | ||
import org.apache.beam.runners.dataflow.worker.streaming.Work; | ||
import org.apache.beam.runners.dataflow.worker.util.TerminatingExecutors; | ||
import org.apache.beam.runners.dataflow.worker.windmill.Windmill; | ||
import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub.RpcException; | ||
import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; | ||
|
@@ -82,12 +84,13 @@ public final class SingleSourceWorkerHarness implements StreamingWorkerHarness { | |
this.waitForResources = waitForResources; | ||
this.computationStateFetcher = computationStateFetcher; | ||
this.workProviderExecutor = | ||
Executors.newSingleThreadScheduledExecutor( | ||
TerminatingExecutors.newSingleThreadedExecutor( | ||
new ThreadFactoryBuilder() | ||
.setDaemon(true) | ||
.setPriority(Thread.MIN_PRIORITY) | ||
.setNameFormat("DispatchThread") | ||
.build()); | ||
.setNameFormat("DispatchThread"), | ||
LOG); | ||
|
||
this.isRunning = new AtomicBoolean(false); | ||
this.getWorkSender = getWorkSender; | ||
} | ||
|
@@ -103,11 +106,22 @@ public void start() { | |
"Multiple calls to {}.start() are not allowed.", | ||
getClass()); | ||
workCommitter.start(); | ||
workProviderExecutor.execute( | ||
() -> { | ||
getDispatchLoop().run(); | ||
LOG.info("Dispatch done"); | ||
}); | ||
while (isRunning.get()) { | ||
Future<?> dispatchLoopFuture = | ||
workProviderExecutor.submit( | ||
() -> { | ||
getDispatchLoop().run(); | ||
LOG.info("Dispatch done"); | ||
}); | ||
|
||
try { | ||
dispatchLoopFuture.get(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is changing start() to be blocking. If we want that we shoudl have clearer method name and also just get rid of the workProviderExecutor and use this thread directly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed |
||
} catch (InterruptedException e) { | ||
Thread.currentThread().interrupt(); | ||
} catch (ExecutionException e) { | ||
throw new AssertionError("GetWork failed with error.", e); | ||
} | ||
} | ||
} | ||
|
||
private Runnable getDispatchLoop() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,6 @@ | |
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.ScheduledExecutorService; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
|
@@ -51,6 +50,7 @@ | |
import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; | ||
import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; | ||
import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; | ||
import org.apache.beam.runners.dataflow.worker.util.TerminatingExecutors; | ||
import org.apache.beam.runners.dataflow.worker.windmill.work.processing.failures.FailureTracker; | ||
import org.apache.beam.sdk.annotations.Internal; | ||
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; | ||
|
@@ -151,8 +151,8 @@ public static StreamingWorkerStatusReporter create( | |
memoryMonitor, | ||
workExecutor, | ||
threadName -> | ||
Executors.newSingleThreadScheduledExecutor( | ||
new ThreadFactoryBuilder().setNameFormat(threadName).build()), | ||
TerminatingExecutors.newSingleThreadedScheduledExecutor( | ||
new ThreadFactoryBuilder().setNameFormat(threadName), LOG), | ||
windmillHarnessUpdateReportingPeriodMillis, | ||
perWorkerMetricsUpdateReportingPeriodMillis); | ||
} | ||
|
@@ -228,6 +228,22 @@ private static void shutdownExecutor(ScheduledExecutorService executor) { | |
} | ||
} | ||
|
||
// Calculates the PerWorkerMetrics reporting frequency, ensuring alignment with the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we revert the unrelated change, to keep the diff small and on topic? |
||
// WorkerMessages RPC schedule. The desired reporting period | ||
// (perWorkerMetricsUpdateReportingPeriodMillis) is adjusted to the nearest multiple | ||
// of the RPC interval (windmillHarnessUpdateReportingPeriodMillis). | ||
private static long getPerWorkerMetricsUpdateFrequency( | ||
long windmillHarnessUpdateReportingPeriodMillis, | ||
long perWorkerMetricsUpdateReportingPeriodMillis) { | ||
if (windmillHarnessUpdateReportingPeriodMillis == 0) { | ||
return 0; | ||
} | ||
return LongMath.divide( | ||
perWorkerMetricsUpdateReportingPeriodMillis, | ||
windmillHarnessUpdateReportingPeriodMillis, | ||
RoundingMode.CEILING); | ||
} | ||
|
||
@SuppressWarnings("FutureReturnValueIgnored") | ||
public void start() { | ||
reportHarnessStartup(); | ||
|
@@ -276,22 +292,6 @@ private void reportHarnessStartup() { | |
} | ||
} | ||
|
||
// Calculates the PerWorkerMetrics reporting frequency, ensuring alignment with the | ||
// WorkerMessages RPC schedule. The desired reporting period | ||
// (perWorkerMetricsUpdateReportingPeriodMillis) is adjusted to the nearest multiple | ||
// of the RPC interval (windmillHarnessUpdateReportingPeriodMillis). | ||
private static long getPerWorkerMetricsUpdateFrequency( | ||
long windmillHarnessUpdateReportingPeriodMillis, | ||
long perWorkerMetricsUpdateReportingPeriodMillis) { | ||
if (windmillHarnessUpdateReportingPeriodMillis == 0) { | ||
return 0; | ||
} | ||
return LongMath.divide( | ||
perWorkerMetricsUpdateReportingPeriodMillis, | ||
windmillHarnessUpdateReportingPeriodMillis, | ||
RoundingMode.CEILING); | ||
} | ||
|
||
/** Sends counter updates to Dataflow backend. */ | ||
private void sendWorkerUpdatesToDataflowService( | ||
CounterSet deltaCounters, CounterSet cumulativeCounters) throws IOException { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.beam.runners.dataflow.worker.util; | ||
|
||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.ScheduledExecutorService; | ||
import java.util.concurrent.ThreadFactory; | ||
import org.apache.beam.runners.dataflow.worker.WorkerUncaughtExceptionHandler; | ||
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; | ||
import org.slf4j.Logger; | ||
|
||
/** | ||
* Utility class for {@link java.util.concurrent.ExecutorService}s that will terminate the JVM on | ||
* uncaught exceptions. | ||
* | ||
* @implNote Ensures that all threads produced by the {@link ExecutorService}s have a {@link | ||
* WorkerUncaughtExceptionHandler} attached to prevent hidden/silent exceptions and errors. | ||
*/ | ||
public final class TerminatingExecutors { | ||
private TerminatingExecutors() {} | ||
|
||
public static ExecutorService newSingleThreadedExecutor( | ||
ThreadFactoryBuilder threadFactoryBuilder, Logger logger) { | ||
return Executors.newSingleThreadExecutor( | ||
terminatingThreadFactory(threadFactoryBuilder, logger)); | ||
} | ||
|
||
public static ScheduledExecutorService newSingleThreadedScheduledExecutor( | ||
ThreadFactoryBuilder threadFactoryBuilder, Logger logger) { | ||
return Executors.newSingleThreadScheduledExecutor( | ||
terminatingThreadFactory(threadFactoryBuilder, logger)); | ||
} | ||
|
||
public static ExecutorService newCachedThreadPool( | ||
ThreadFactoryBuilder threadFactoryBuilder, Logger logger) { | ||
return Executors.newCachedThreadPool(terminatingThreadFactory(threadFactoryBuilder, logger)); | ||
} | ||
|
||
public static ExecutorService newFixedThreadPool( | ||
int numThreads, ThreadFactoryBuilder threadFactoryBuilder, Logger logger) { | ||
return Executors.newFixedThreadPool( | ||
numThreads, terminatingThreadFactory(threadFactoryBuilder, logger)); | ||
} | ||
|
||
private static ThreadFactory terminatingThreadFactory( | ||
ThreadFactoryBuilder threadFactoryBuilder, Logger logger) { | ||
return threadFactoryBuilder | ||
.setUncaughtExceptionHandler(new WorkerUncaughtExceptionHandler(logger)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure this is working if the executor thread is catchign the exception and putting it in the future. I think we might need instead to wrap the executor so that scheduled tasks are wrapped and then catch exceptions and call termination method like com/google/common/util/concurrent/WrappingExecutorService.java There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added to test to ensure that we are halting the JVM |
||
.build(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we revert the unrelated change, to keep the diff small and on topic?