Options for parallelization (#813)

mlr-org · Jul 20, 2022 · a791a21 · a791a21
1 parent 2dcea2f
commit a791a21
Show file tree

Hide file tree

Showing 7 changed files with 35 additions and 46 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mlr3
 Title: Machine Learning in R - Next Generation
-Version: 0.13.3-9000
+Version: 0.13.4
 Authors@R:
     c(person(given = "Michel",
              family = "Lang",

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,7 @@
 * Added `head()` and `tail()` methods for `Task`.
 * Improved printing of multiple objects.
 
+
 # mlr3 0.13.3
 
 * Most objects now have a new (optional) field `label`, i.e. `Task`,

diff --git a/R/benchmark.R b/R/benchmark.R
@@ -158,27 +158,10 @@ benchmark = function(design, store_models = FALSE, store_backends = TRUE, encaps
     set(grid, j = "mode", value = hotstart_grid$mode)
   }
 
-  if (getOption("mlr3.debug", FALSE)) {
-    lg$info("Running benchmark() sequentially in debug mode with %i iterations", n)
-
-    res = mapply(workhorse,
-      task = grid$task, learner = grid$learner, resampling = grid$resampling, iteration = grid$iteration,
-      mode = grid$mode,
-      MoreArgs = list(store_models = store_models, lgr_threshold = lgr_threshold, pb = pb),
-      SIMPLIFY = FALSE, USE.NAMES = FALSE
-    )
-  } else {
-    lg$debug("Running benchmark() via future with %i iterations", n)
-
-    res = future.apply::future_mapply(workhorse,
-      task = grid$task, learner = grid$learner, resampling = grid$resampling, iteration = grid$iteration,
-      mode = grid$mode,
-      MoreArgs = list(store_models = store_models, lgr_threshold = lgr_threshold, pb = pb),
-      SIMPLIFY = FALSE, USE.NAMES = FALSE, future.globals = FALSE,
-      future.scheduling = structure(TRUE, ordering = "random"), future.packages = "mlr3", future.seed = TRUE,
-      future.stdout = future_stdout()
-    )
-  }
+  res = future_map(n, workhorse,
+    task = grid$task, learner = grid$learner, resampling = grid$resampling, iteration = grid$iteration, mode = grid$mode,
+    MoreArgs = list(store_models = store_models, lgr_threshold = lgr_threshold, pb = pb)
+  )
 
   grid = insert_named(grid, list(
     learner_state = map(res, "learner_state"),

diff --git a/R/helper_exec.R b/R/helper_exec.R
@@ -22,10 +22,20 @@ set_encapsulation = function(learners, encapsulate) {
   learners
 }
 
-future_stdout = function() {
-  if (inherits(plan(), "sequential")) {
-    NA
+future_map = function(n, FUN, ..., MoreArgs = list()) { # maps FUN over `...`; `n` is only used in the log messages
+  if (getOption("mlr3.debug", FALSE)) {
+    lg$info("Running experiments sequentially in debug mode with %i iterations", n)
+    mapply(FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE)
   } else {
-    TRUE
+    is_sequential = inherits(plan(), "sequential")
+    scheduling = if (!is_sequential && isTRUE(getOption("mlr3.exec_random", TRUE))) structure(TRUE, ordering = "random") else TRUE
+    chunk_size = getOption("mlr3.exec_chunk_size", 1)
+    stdout = if (is_sequential) NA else TRUE # passed on as future.stdout below
+    # use the same "%i" integer placeholder as the debug-mode message above ("%n" is not a valid format)
+    lg$debug("Running resample() via future with %i iterations", n)
+    future.apply::future_mapply(
+      FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE, USE.NAMES = FALSE,
+      future.globals = FALSE, future.packages = "mlr3", future.seed = TRUE,
+      future.scheduling = scheduling, future.chunk.size = chunk_size, future.stdout = stdout)
   }
 }
diff --git a/R/resample.R b/R/resample.R
@@ -105,24 +105,9 @@ resample = function(task, learner, resampling, store_models = FALSE, store_backe
     data.table(learner = replicate(n, learner), mode = "train")
   }
 
-  if (getOption("mlr3.debug", FALSE)) {
-    lg$info("Running resample() sequentially in debug mode with %i iterations", n)
-    res = mapply(workhorse,
-      iteration = seq_len(n), learner = grid$learner, mode = grid$mode,
-      MoreArgs = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold,
-        pb = pb), SIMPLIFY = FALSE
-    )
-  } else {
-    lg$debug("Running resample() via future with %i iterations", n)
-
-    res = future.apply::future_mapply(workhorse,
-      iteration = seq_len(n), learner = grid$learner, mode = grid$mode,
-      MoreArgs = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold,
-      pb = pb),
-      SIMPLIFY = FALSE, future.globals = FALSE, future.scheduling = structure(TRUE, ordering = "random"),
-      future.packages = "mlr3", future.seed = TRUE, future.stdout = future_stdout()
-    )
-  }
+  res = future_map(n, workhorse, iteration = seq_len(n), learner = grid$learner, mode = grid$mode,
+    MoreArgs = list(task = task, resampling = resampling, store_models = store_models, lgr_threshold = lgr_threshold, pb = pb)
+  )
 
   data = data.table(
     task = list(task),

diff --git a/R/zzz.R b/R/zzz.R
@@ -41,9 +41,14 @@
 #' * Encapsulated evaluation: \CRANpkg{evaluate}, \CRANpkg{callr} (external process)
 #'
 #' @section Package Options:
+#' * `"mlr3.exec_random"`: Randomize the order of execution in [resample()] and [benchmark()] during
+#'   parallelization with \CRANpkg{future}. Defaults to `TRUE`.
+#'   Note that this does not affect the order of results.
+#' * `"mlr3.exec_chunk_size"`: Number of iterations to perform in a single [future::future()] during
+#'   parallelization with \CRANpkg{future}. Defaults to 1.
 #' * `"mlr3.debug"`: If set to `TRUE`, parallelization via \CRANpkg{future} is disabled to simplify
 #'   debugging and provide more concise tracebacks.
-#'   Note that results computed with debug mode enabled use a different seeding mechanism and are not reproducible.
+#'   Note that results computed in debug mode use a different seeding mechanism and are **not reproducible**.
 #' * `"mlr3.allow_utf8_names"`: If set to `TRUE`, checks on the feature names are relaxed, allowing
 #'   non-ascii characters in column names. This is an experimental and temporary option to
 #'   pave the way for text analysis, and will likely be removed in a future version of the package.

diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd