rcmdcheck passes

mlr-org · Jan 22, 2024 · da9b40d · da9b40d
1 parent 5849806
commit da9b40d
Show file tree

Hide file tree

Showing 42 changed files with 396 additions and 264 deletions.
diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -47,7 +47,6 @@ Imports:
     data.table,
     paradox (>= 0.11.0),
     R6,
-    torchvision,
     withr
 Suggests:
     callr,
@@ -61,6 +60,7 @@ Suggests:
     rmarkdown,
     viridis,
     testthat (>= 3.0.0),
+    torchvision,
     zip
 Remotes:
     r-lib/zip,

diff --git a/R/DataDescriptor.R b/R/DataDescriptor.R
@@ -22,7 +22,7 @@
 #'   Character vector that must have the same length as the input of the graph.
 #'   Specifies how the data from the `dataset` is fed into the preprocessing graph.
 #' @param pointer (`character(2)` | `NULL`)\cr
-#'   Indicating an element on which a model is. Points to an output channel within `graph`:
+#'   Points to an output channel within `graph`:
 #'   Element 1 is the `PipeOp`'s id and element 2 is that `PipeOp`'s output channel.
 #' @param pointer_shape (`integer` | `NULL`)\cr
 #'   Shape of the output indicated by `pointer`.
@@ -111,7 +111,6 @@ DataDescriptor = R6Class("DataDescriptor",
       # compilation
       dataset_hash = calculate_hash(address(dataset))
 
-
       self$dataset = dataset
       self$graph = graph
       self$dataset_shapes = dataset_shapes

diff --git a/R/PipeOpTaskPreprocTorch.R b/R/PipeOpTaskPreprocTorch.R
@@ -28,8 +28,9 @@
 #'
 #' @template param_id
 #' @template param_param_vals
-#' @param fn (`function`)\cr
+#' @param fn (`function` or `character(2)`)\cr
 #'   The preprocessing function. Should not modify its input in-place.
+#'   If it is a `character(2)`, the first element should be the namespace and the second element the name.
 #' @param packages (`character()`)\cr
 #'   The packages the preprocessing function depends on.
 #' @param param_set ([`ParamSet`])\cr
@@ -178,7 +179,8 @@ PipeOpTaskPreprocTorch = R6Class("PipeOpTaskPreprocTorch",
     #' Creates a new instance of this [`R6`][R6::R6Class] class.
     initialize = function(fn, id = "preproc_torch", param_vals = list(), param_set = ps(), packages = character(0),
       stages_init = "both", rowwise = FALSE) { # nolint
-      private$.fn = assert_function(fn)
+      assert(check_function(fn), check_character(fn, len = 2L))
+      private$.fn = fn
       private$.rowwise = assert_flag(rowwise)
 
       param_set = assert_param_set(param_set$clone(deep = TRUE))
@@ -249,6 +251,9 @@ PipeOpTaskPreprocTorch = R6Class("PipeOpTaskPreprocTorch",
     #' The preprocessing function.
     fn = function(rhs) {
       assert_ro_binding(rhs)
+      if (test_character(private$.fn)) {
+        private$.fn = getFromNamespace(private$.fn[2L], private$.fn[1L])
+      }
       private$.fn
     },
     #' @field rowwise
@@ -296,7 +301,7 @@ PipeOpTaskPreprocTorch = R6Class("PipeOpTaskPreprocTorch",
       param_vals$affect_columns = NULL
       stages = param_vals$stages
       param_vals$stages = NULL
-      trafo = private$.fn
+      trafo = self$fn
 
       fn = if (identical(stages, "both") || stage %in% stages) {
         if (length(param_vals)) {
@@ -355,7 +360,7 @@ PipeOpTaskPreprocTorch = R6Class("PipeOpTaskPreprocTorch",
     .additional_phash_input = function() {
       list(
         self$param_set$ids(), self$packages,
-        formals(private$.fn), body(private$.fn), address(environment(private$.fn))
+        formals(self$fn), body(self$fn), address(environment(self$fn))
       )
     },
     .shapes_out = function(shapes_in, param_vals, task) list(NULL),
@@ -370,7 +375,7 @@ PipeOpTaskPreprocTorch = R6Class("PipeOpTaskPreprocTorch",
 #' @param param_vals (`list()`)\cr
 #'   The parameter values.
 #' @export
-pipeop_preproc_torch = function(id, fn, shapes_out, param_set = NULL, param_vals = list(), packages = character(0),
+pipeop_preproc_torch = function(id, fn, shapes_out = NULL, param_set = NULL, param_vals = list(), packages = character(0),
   rowwise = FALSE, parent_env = parent.frame()) {
   pipeop_preproc_torch_class(
     id = id,
@@ -385,8 +390,6 @@ pipeop_preproc_torch = function(id, fn, shapes_out, param_set = NULL, param_vals
 
 
 create_ps = function(fn) {
-  # TODO: could simplify this as we don't need the expression anymore
-  missing = alist(x = )$x
   fmls = formals(fn)
   param_names = names(fmls)
   # we assume the first argument is for the tensor
@@ -436,7 +439,7 @@ create_ps = function(fn) {
 #' @export
 #' @returns An [`R6Class`][R6::R6Class] instance inheriting from [`PipeOpTaskPreprocTorch`]
 #' @examples
-#' po_example = pipeop_preproc_torch("preproc_example", function(x, a) x + a)
+#' po_example = pipeop_preproc_torch("preproc_example", function(x, a) x + a)
 #' po_example
 #' po_example$param_set
 pipeop_preproc_torch_class = function(id, fn, shapes_out, param_set = NULL, packages = character(0),
@@ -445,6 +448,8 @@ pipeop_preproc_torch_class = function(id, fn, shapes_out, param_set = NULL, pack
     check_function(shapes_out, args = c("shapes_in", "param_vals", "task"), null.ok = TRUE),
     check_choice(shapes_out, c("infer", "unchanged"))
   )
+
+  # e.g. torchvision is only in Suggests, so we cannot assume the function is accessible at this point.
   if (identical(shapes_out, "infer")) {
     shapes_out = crate(function(shapes_in, param_vals, task) {
       sin = shapes_in[[1L]]
@@ -457,7 +462,7 @@ pipeop_preproc_torch_class = function(id, fn, shapes_out, param_set = NULL, pack
         sin = sin[-1L]
       }
       tensor_in = invoke(torch_empty, .args = sin, device = torch_device("meta"))
-      tensor_out = tryCatch(invoke(private$.fn, tensor_in, .args = param_vals),
+      tensor_out = tryCatch(invoke(self$fn, tensor_in, .args = param_vals),
         error = function(e) {
           stopf("Failed to infer output shape, presumably invalid input shape; error message is: %s", e)
         }
@@ -485,9 +490,19 @@ pipeop_preproc_torch_class = function(id, fn, shapes_out, param_set = NULL, pack
   param_set = param_set %??% create_ps(fn)
 
   stages_init = if (startsWith(id, "augment_")) "train" else "both"
+
+  # the .__construction_info construct is used to avoid having to rely on NSE
   init_fun = crate(function(id = id, param_vals = list()) { # nolint
     info = private$.__construction_info
-    param_set = info$param_set$clone(deep = TRUE)
+    fn = info$fn
+    if (is.character(fn)) {
+      fn = getFromNamespace(fn[2L], fn[1L])
+    }
+    if (is.null(info$param_set)) {
+      param_set = create_ps(fn)
+    } else {
+      param_set = info$param_set$clone(deep = TRUE)
+    }
     param_set$values = info$init_params # nolint
     super$initialize(
       id = id,
@@ -535,6 +550,11 @@ pipeop_preproc_torch_class = function(id, fn, shapes_out, param_set = NULL, pack
 }
 
 register_preproc = function(id, fn, param_set = NULL, shapes_out = NULL, packages = character(0), rowwise = FALSE) {
+  fn_call = substitute(fn)
+  if (identical(as.list(fn_call)[[1]], quote(`::`))) {
+    fn = as.character(as.list(fn_call)[-1L])
+  }
+
   Class = pipeop_preproc_torch_class(id, fn, param_set = param_set, shapes_out = shapes_out,
     packages = packages, rowwise = rowwise, parent_env = parent.frame())
   assign(Class$classname, Class, parent.frame())

diff --git a/R/PipeOpTorchIngress.R b/R/PipeOpTorchIngress.R
@@ -291,7 +291,7 @@ register_po("torch_ingress_categ", PipeOpTorchIngressCategorical)
 #' # Now we try a lazy tensor with unknown shape, i.e. the shapes between the rows can differ
 #'
 #' ds = dataset(
-#'   initialize = function() self$x = list(torch_randn(3, 10, 10), torch_randn(10, 8, 8)),
+#'   initialize = function() self$x = list(torch_randn(3, 10, 10), torch_randn(3, 8, 8)),
 #'   .getitem = function(i) list(x = self$x[[i]]),
 #'   .length = function() 2)()
 #'

diff --git a/R/TaskClassif_tiny_imagenet.R b/R/TaskClassif_tiny_imagenet.R
@@ -86,6 +86,7 @@ load_task_tiny_imagenet = function(id = "tiny_imagenet") {
     withr::with_locale(c(LC_COLLATE = "C"), {
       dt = cached(constructor_tiny_imagenet, "datasets", "tiny_imagenet")$data
     })
+
     dt$image = as_lazy_tensor(dataset_image(dt$image), dataset_shapes = list(x = c(NA, 3, 64, 64)))
     dt$..row_id = seq_len(nrow(dt))
     DataBackendDataTable$new(data = dt, primary_key = "..row_id")

diff --git a/R/lazy_tensor.R b/R/lazy_tensor.R
@@ -43,7 +43,7 @@ new_lazy_tensor = function(data_descriptor, ids) {
 
 #' @export
 `[[.lazy_tensor` = function(x, i) {
-  structure(unclass(x)[[i]], class = c("lazy_tensor", "list"))
+  unclass(x)[[i]]
 }
 
 #' @export
@@ -74,7 +74,7 @@ c.lazy_tensor = function(...) {
   if (!all(map_lgl(dots, is_lazy_tensor))) {
     return(NextMethod())
   }
-  if (length(unique(map_chr(dots, function(x) dd(x)$hash))) > 1) {
+  if (length(unique(map_chr(dots[lengths(dots) != 0], function(x) dd(x)$hash))) > 1) {
     stopf("Can only concatenate lazy tensors with the same data descriptors.")
   }
 

diff --git a/R/materialize.R b/R/materialize.R
@@ -26,7 +26,7 @@
 #'   Either a [`lazy_tensor`] or a `list()` / `data.frame()` containing [`lazy_tensor`] columns.
 #' @param rbind (`logical(1)`)\cr
 #'   Whether to rbind the lazy tensor columns (`TRUE`) or return them as a list of tensors (`FALSE`).
-#'   In the second case, the batch dimension is present for all individual tensors.
+#'   In the second case, there is no batch dimension.
 #' @return (`list()` of [`lazy_tensor`]s or a [`lazy_tensor`])
 #' @param device (`character(1)`)\cr
 #'   The torch device.
@@ -87,6 +87,44 @@ materialize.lazy_tensor = function(x, device = "cpu", rbind = FALSE, ...) { # no
   materialize_internal(x = x, device = device, cache = NULL, rbind = rbind)
 }
 
+get_input = function(ds, ids, varying_shapes, rbind) {
+  if (is.null(ds$.getbatch)) { # .getitem is never NULL but a function that errs if it was not defined
+    x = map(ids, function(id) map(ds$.getitem(id), function(x) x$unsqueeze(1)))
+    if (varying_shapes) {
+      x
+    } else {
+      map(transpose_list(x), function(x) torch_cat(x, dim = 1L))
+    }
+  } else {
+    ds$.getbatch(ids)
+  }
+}
+
+get_output = function(input, graph, varying_shapes, rbind, device) {
+  output = if (varying_shapes) {
+    # list --graph--> (list or tensor)
+    transpose_list(map(input, function(x) graph$train(x, single_input = FALSE)))
+  } else {
+    # tensor --graph--> tensor
+    graph$train(input, single_input = FALSE)
+  }
+
+  # now we get it in the right output format and convert it to the requested device
+  output = if (rbind) {
+    if (varying_shapes) { # need to convert from list of tensors to tensor
+      output = map(output, list_to_batch)
+    }
+    map(output, function(x) x$to(device = device))
+  } else {
+    if (!varying_shapes) { # need to convert from tensor to list of tensors
+      output = map(output, function(x) torch_split(x, split_size = 1L, dim = 1L))
+    }
+    map(output, function(out) map(out, function(o) o$squeeze(1)$to(device = device)))
+  }
+
+  return(output)
+}
+
 #' @title Materialize a Lazy Tensor
 #' @description
 #' Convert a [`lazy_tensor()`] to a [`torch_tensor()`].
@@ -142,25 +180,11 @@ materialize_internal = function(x, device = "cpu", cache = NULL, rbind) {
 
     if (input_hit) {
       input = cache[[input_hash]]
-      input_hit = TRUE
     }
   }
 
   if (!do_caching || !input_hit) {
-    input = if (is.null(ds$.getbatch)) { # .getindex is never NULL but a function that errs if it was not defined
-      x = map(ids, function(id) map(ds$.getitem(id), function(x) x$unsqueeze(1)))
-      if (varying_shapes || !rbind) {
-        x
-      } else {
-        map(transpose_list(x), function(x) torch_cat(x, dim = 1L))
-      }
-    } else {
-      if (rbind) {
-        ds$.getbatch(ids)
-      } else {
-        map(ids, function(id) ds$.getbatch(id))
-      }
-    }
+    input = get_input(ds, ids, varying_shapes, rbind)
   }
 
   if (do_caching && !input_hit) {
@@ -170,42 +194,20 @@ materialize_internal = function(x, device = "cpu", cache = NULL, rbind) {
   # input is the output of a dataset so it can contain more than what we need for the graph,
   # also we need to set the correct names.
   # This is done after retrieving the element from the cache / before saving the element to the cache because
-  # this can change
-
-  input = if (rbind && !varying_shapes) {
-    set_names(input[data_descriptor$input_map], data_descriptor$graph_input)
-  } else {
+  # this can change depending on the preprocessing graph
+  input = if (varying_shapes) {
     map(input, function(x) {
       set_names(x[data_descriptor$input_map], data_descriptor$graph_input)
     })
-  }
-
-  output = if (rbind && !varying_shapes) {
-    # tensor --graph--> tensor
-    graph$train(input, single_input = FALSE)
   } else {
-    # list --graph--> (list or tensor)
-    out = map(input, function(x) graph$train(x, single_input = FALSE))
-
-    if (rbind) {
-      # here, is a list with hierarchy: [id = [po_id = [ch_nm = ]]]
-      # We want to obtain a list [po_id = [ch_nm = [...]]] where the [...] is the rbind over all ids
-      rows = seq_along(out)
-      out = map(names(out[[1L]]), function(name) torch_cat(map(out[rows], name)))
-    }
-    out
+    set_names(input[data_descriptor$input_map], data_descriptor$graph_input)
   }
 
+  output = get_output(input, graph, varying_shapes, rbind, device)
+
   if (do_caching) {
     cache[[output_hash]] = output
   }
 
-  # put the tensor on the required device
-  if (rbind) {
-    res = output[[pointer_name]]$to(device = device)
-  } else {
-    res = map(output, function(o) o[[pointer_name]]$to(device = device))
-  }
-
-  return(res)
+  output[[pointer_name]]
 }
diff --git a/R/nn_graph.R b/R/nn_graph.R
@@ -16,20 +16,26 @@
 #' @family Graph Network
 #' @export
 #' @examples
-#' graph = po("module_1", module = nn_linear(10, 20)) %>>%
-#'   po("module_2", module = nn_relu()) %>>%
-#'   po("module_3", module = nn_linear(20, 1))
+#' graph = mlr3pipelines::Graph$new()
+#' graph$add_pipeop(po("module_1", module = nn_linear(10, 20)), clone = FALSE)
+#' graph$add_pipeop(po("module_2", module = nn_relu()), clone = FALSE)
+#' graph$add_pipeop(po("module_3", module = nn_linear(20, 1)), clone = FALSE)
+#' graph$add_edge("module_1", "module_2")
+#' graph$add_edge("module_2", "module_3")
+#'
 #' network = nn_graph(graph, shapes_in = list(module_1.input = c(NA, 10)))
-#' network
+#'
 #' x = torch_randn(16, 10)
+#'
 #' network(module_1.input = x)
 nn_graph = nn_module(
   "nn_graph",
   initialize = function(graph, shapes_in, output_map = graph$output$name, list_output = FALSE) {
-    self$graph = as_graph(graph)
+    self$graph = as_graph(graph, clone = FALSE)
     self$graph_input_name = graph$input$name  # cache this, it is expensive
 
     # we do NOT verify the input and type of the graph to be `"torch_tensor"`.
+
     # The reason for this is that the graph, when constructed with the PipeOpTorch Machinery, contains PipeOpNOPs,
     # which have input and output type *.