h2oai · maurever · Dec 11, 2023 · Nov 22, 2023 · Dec 4, 2023 · Dec 5, 2023
diff --git a/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java b/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java
@@ -53,7 +53,7 @@ public static final class UpliftDRFParametersV3 extends SharedTreeV3.SharedTreeP
                 is_mutually_exclusive_with = {"ignored_columns","response_column", "weights_column"})
         public String treatment_column;
 
-        @API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"})
+        @API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"}, gridable = true)
         public UpliftDRFParameters.UpliftMetricType uplift_metric;
 
         @API(help = "Metric used to calculate Area Under Uplift Curve.", level = API.Level.secondary, values = { "AUTO", "qini", "lift", "gain"})

diff --git a/h2o-algos/src/test/java/hex/grid/GridTest.java b/h2o-algos/src/test/java/hex/grid/GridTest.java
@@ -9,6 +9,7 @@
 import hex.grid.HyperSpaceWalker.BaseWalker.WalkerFactory;
 import hex.tree.CompressedTree;
 import hex.tree.gbm.GBMModel;
+import hex.tree.uplift.UpliftDRFModel;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Rule;
@@ -1026,4 +1027,39 @@ public void test_parallel_grid_cancelled_on_consecutive_model_failures() {
 
   }
 
+  /**
+   * Runs a parallel grid search for Uplift DRF over {@code _ntrees} (3 values) and
+   * {@code _max_depth} (2 values) and expects one model per combination, i.e. 6 models.
+   */
+  @Test
+  public void testUpliftDrfGridSearch() {
+    try {
+      Scope.enter();
+
+      // Convert the treatment and response columns to categorical, then publish the
+      // modified frame and track it again after the conversion.
+      final Frame trainingFrame = Scope.track(parseTestFile("smalldata/uplift/criteo_uplift_13k.csv"));
+      trainingFrame.toCategoricalCol("treatment");
+      trainingFrame.toCategoricalCol("conversion");
+      DKV.put(trainingFrame);
+      Scope.track(trainingFrame);
+
+      // Fixed (non-searched) model parameters.
+      final UpliftDRFModel.UpliftDRFParameters upliftParams = new UpliftDRFModel.UpliftDRFParameters();
+      upliftParams._seed = 0xDECAF;
+      upliftParams._train = trainingFrame._key;
+      upliftParams._response_column = "conversion";
+      upliftParams._treatment_column = "treatment";
+      upliftParams._ignored_columns = new String[]{"visit", "exposure"};
+
+      // Searched parameters: 3 x 2 = 6 combinations.
+      final HashMap<String, Object[]> searchSpace = new HashMap<>();
+      searchSpace.put("_ntrees", new Integer[]{5, 10, 20});
+      searchSpace.put("_max_depth", new Integer[]{5, 10});
+
+      final Job<Grid> gridJob = GridSearch.create(null, upliftParams, searchSpace)
+              .withMaxConsecutiveFailures(10)
+              .withParallelism(3)
+              .start();
+      Scope.track_generic(gridJob);
+
+      final Grid grid = gridJob.get();
+      Scope.track_generic(grid);
+      assertEquals(6, grid.getModelCount());
+    } finally {
+      Scope.exit();
+    }
+  }
+
 }
diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java
@@ -309,7 +309,9 @@ public boolean haveMojo() {
   public GridSortBy getDefaultGridSortBy() {
     if (! isSupervised())
       return null;
-    else if (_output.nclasses() > 1)
+    else if (_output.hasTreatment()){
+      return GridSortBy.AUUC;
+    } else if (_output.nclasses() > 1)
       return GridSortBy.LOGLOSS;
     else
       return GridSortBy.RESDEV;
@@ -319,6 +321,7 @@ public static class GridSortBy { // intentionally not an enum to allow 3rd party
     public static final GridSortBy LOGLOSS = new GridSortBy("logloss", false);
     public static final GridSortBy RESDEV = new GridSortBy("residual_deviance", false);
     public static final GridSortBy R2 = new GridSortBy("r2", true);
+    // AUUC is a higher-is-better metric, so the default grid ordering is decreasing (as for R2).
+    public static final GridSortBy AUUC = new GridSortBy("auuc", true);
 
     public final String _name;
     public final boolean _decreasing;

diff --git a/h2o-py/h2o/grid/grid_search.py b/h2o-py/h2o/grid/grid_search.py
@@ -377,11 +377,15 @@ def build_model(self, algo_params):
         training_frame = algo_params.pop("training_frame")
         validation_frame = algo_params.pop("validation_frame", None)
         is_auto_encoder = (algo_params is not None) and ("autoencoder" in algo_params and algo_params["autoencoder"])
+        is_uplift = (algo_params is not None) and ("treatment_column" in algo_params and algo_params["treatment_column"])
         if is_auto_encoder and y is not None:
             raise ValueError("y should not be specified for autoencoder.")
         if self.model.supervised_learning:
             if y is None:
                 raise ValueError("Missing response")
+            elif is_uplift:
+                y = y if y in training_frame.names else training_frame.names[y]
+                self.model._estimator_type = "binomial_uplift"
             else:
                 y = y if y in training_frame.names else training_frame.names[y]
                 self.model._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
@@ -396,11 +400,12 @@ def _model_build(self, x, y, tframe, vframe, kwargs):
         if y is not None: kwargs['response_column'] = y
         if not is_type(x, list, tuple): x = [x]
         if is_type(x[0], int):
-            x = [tframe.names[i] for i in x]
+            x = [tframe.names[i] for i in x] 
         offset = kwargs["offset_column"]
         folds = kwargs["fold_column"]
         weights = kwargs["weights_column"]
-        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
+        treatment = kwargs["treatment_column"] if "treatment_column" in kwargs else None
+        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights, treatment]))
         kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
         kwargs = {k: H2OEstimator._keyify(kwargs[k]) for k in kwargs}
         if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
@@ -1482,6 +1487,8 @@ def _metrics_class(model_json):
             model_class = H2ODimReductionGridSearch
         elif model_type == "AnomalyDetection":
             model_class = H2OBinomialGridSearch
+        elif model_type == "BinomialUplift":
+            model_class = H2OBinomialGridSearch
         else:
             raise NotImplementedError(model_type)
         return model_class

diff --git a/h2o-py/h2o/grid/metrics.py b/h2o-py/h2o/grid/metrics.py
@@ -1271,3 +1271,27 @@ def mean_per_class_error(self, train=False, valid=False, xval=False):
 
 class H2ORegressionGridSearch(object):
     pass
+
+#-----------------------------------------------------------------------------------------------------------------------
+# Binomial Uplift Grid Search
+#-----------------------------------------------------------------------------------------------------------------------
+
+
+class H2OBinomialUpliftGridSearch(object):
+    """
+    Per-model uplift metric accessors for a grid of binomial uplift models.
+
+    Every method returns a dict keyed by ``model_id``; the value is whatever the method of the
+    same name on the individual model returns for the given ``train`` / ``valid`` flags.
+    Relies on ``self.models`` being an iterable of models that expose ``model_id`` and the
+    ``auuc``, ``qini``, ``ate``, ``att`` and ``atc`` methods.
+    """
+
+    def auuc(self, train=False, valid=False):
+        """Return ``{model_id: model.auuc(train, valid)}`` for every model in the grid."""
+        return {model.model_id: model.auuc(train, valid) for model in self.models}
+
+    def qini(self, train=False, valid=False):
+        """Return ``{model_id: model.qini(train, valid)}`` for every model in the grid."""
+        return {model.model_id: model.qini(train, valid) for model in self.models}
+
+    def ate(self, train=False, valid=False):
+        """Return ``{model_id: model.ate(train, valid)}`` for every model in the grid."""
+        return {model.model_id: model.ate(train, valid) for model in self.models}
+
+    def att(self, train=False, valid=False):
+        """Return ``{model_id: model.att(train, valid)}`` for every model in the grid."""
+        return {model.model_id: model.att(train, valid) for model in self.models}
+
+    def atc(self, train=False, valid=False):
+        """Return ``{model_id: model.atc(train, valid)}`` for every model in the grid."""
+        return {model.model_id: model.atc(train, valid) for model in self.models}
+
+
+# Backward-compatible alias for the originally published, misspelled class name.
+H2OOBinomialUpliftGridSearch = H2OBinomialUpliftGridSearch
+
+
diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py
@@ -734,19 +734,20 @@ def make_random_grid_space(algo, ncols=None, nrows=None):
     :return: a dictionary of parameter_name:list_of_values
     """
     grid_space = {}
-    if algo in ["gbm", "rf"]:
+    if algo in ["gbm", "rf", "uplift"]:
         if random.randint(0,1): grid_space['ntrees'] = random.sample(list(range(1,6)),random.randint(2,3))
         if random.randint(0,1): grid_space['max_depth'] = random.sample(list(range(1,6)),random.randint(2,3))
         if random.randint(0,1): grid_space['min_rows'] = random.sample(list(range(1,11)),random.randint(2,3))
         if random.randint(0,1): grid_space['nbins'] = random.sample(list(range(2,21)),random.randint(2,3))
         if random.randint(0,1): grid_space['nbins_cats'] = random.sample(list(range(2,1025)),random.randint(2,3))
-
         if algo == "gbm":
             if random.randint(0,1): grid_space['learn_rate'] = [random.random() for _ in range(random.randint(2,3))]
             grid_space['distribution'] = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)
         if algo == "rf":
             if random.randint(0,1): grid_space['mtries'] = random.sample(list(range(1,ncols+1)),random.randint(2,3))
             if random.randint(0,1): grid_space['sample_rate'] = [random.random() for r in range(random.randint(2,3))]
+        if algo == "uplift":
+            grid_space['uplift_metric'] = random.sample(['KL','ChiSquared','Euclidean'], 1)
     elif algo == "km":
         grid_space['k'] = random.sample(list(range(1,10)),random.randint(2,3))
         if random.randint(0,1): grid_space['max_iterations'] = random.sample(list(range(1,1000)),random.randint(2,3))

diff --git a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py
@@ -0,0 +1,76 @@
+from __future__ import print_function
+import sys
+sys.path.insert(1,"../../../")
+import h2o
+from tests import pyunit_utils
+import copy
+from h2o.estimators.uplift_random_forest import H2OUpliftRandomForestEstimator
+from h2o.grid.grid_search import H2OGridSearch
+
+
+def grid_uplift_drf():
+    """
+    Grid-search test for Uplift DRF.
+
+    Trains a grid of H2OUpliftRandomForestEstimator models over a randomly generated
+    hyper-parameter space and checks that:
+      * the grid models report the ``binomial_uplift`` type,
+      * the grid holds one model per combination of hyper-parameter values,
+      * repeating every hyper-parameter value does not add models to the grid,
+      * the hyper-parameter values passed to the grid appear in the trained models
+        (delegated to ``pyunit_utils.expect_model_param``).
+    """
+
+    data = h2o.import_file(path=pyunit_utils.locate("smalldata/uplift/upliftml_train.csv"))
+    # Seeded uniform random column: rows with r > 0.2 go to train, the rest to validation.
+    r = data[0].runif(seed=42)
+    train = data[r > .2]
+    valid = data[r <= .2]
+
+    grid_space = pyunit_utils.make_random_grid_space(algo="uplift")
+    print("Grid space: {0}".format(grid_space))
+
+    predictors = ["feature_"+str(x) for x in range(1,13)]
+    response_col = "outcome"
+    treatment_col = "treatment"
+    true_model_type = "binomial_uplift"
+
+    print("Predictors: {0}".format(predictors))
+    print("Response: {0}".format(response_col))
+
+    # Response and treatment columns are converted to categorical in both frames.
+    train[response_col] = train[response_col].asfactor()
+    valid[response_col] = valid[response_col].asfactor()
+    train[treatment_col] = train[treatment_col].asfactor()
+    valid[treatment_col] = valid[treatment_col].asfactor()
+
+    print("Constructing the grid of uplift drf models...")
+    uplift_grid = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=grid_space)
+    uplift_grid.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train,
+                      validation_frame=valid)
+
+    print("Check correct type value....")
+    model_type = uplift_grid[0].type
+    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}."\
+        .format(model_type, true_model_type)
+
+    print("Performing various checks of the constructed grid...")
+
+    print("Check cardinality of grid, that is, the correct number of models have been created...")
+    # Expected size is the product of the number of values of every hyper-parameter.
+    size_of_grid_space = 1
+    for v in list(grid_space.values()):
+        size_of_grid_space = size_of_grid_space * len(v)
+    actual_size = len(uplift_grid)
+    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}".format(
+        size_of_grid_space, actual_size)
+
+    print("Duplicate-entries-in-grid-space check")
+    # Repeat every list of values; the grid is expected to ignore the duplicates.
+    new_grid_space = copy.deepcopy(grid_space)
+    for name in list(grid_space.keys()):
+        new_grid_space[name] = grid_space[name] + grid_space[name]
+    print("The new search space: {0}".format(new_grid_space))
+    print("Constructing the new grid of uplift drf models...")
+    uplift_grid2 = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=new_grid_space)
+    uplift_grid2.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train,
+                       validation_frame=valid)
+    actual_size2 = len(uplift_grid2)
+    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
+                                        "size: {1}".format(actual_size, actual_size2)
+
+    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
+    for name in list(grid_space.keys()):
+        pyunit_utils.expect_model_param(uplift_grid, name, grid_space[name])
+
+
+# When executed directly as a script, run the test through pyunit_utils.standalone_test;
+# otherwise (the module is being imported rather than run) call the test function immediately.
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(grid_uplift_drf)
+else:
+    grid_uplift_drf()
diff --git a/h2o-r/h2o-package/R/grid.R b/h2o-r/h2o-package/R/grid.R
@@ -108,6 +108,11 @@ h2o.grid <- function(algorithm,
       dots$x <- x
     }
   }
+  if(algorithm %in% c("upliftdrf")){
+      if(is.null(dots$treatment_column)) {
+        stop("Must specify treatment column")
+      }
+  }  
   algorithm <- .h2o.unifyAlgoName(algorithm)
   model_param_names <- names(dots)
   hyper_param_names <- names(hyper_params)

diff --git a/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R
@@ -0,0 +1,65 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+library(uplift)
+
+# Builds an UpliftDRF grid over ntrees x max_depth x uplift_metric on simulated data and checks
+# the number of models, the coverage of every hyper-parameter value, and sorting by AUUC.
+check.uplift.grid <- function() {
+    # Simulated data set; sim_pte is presumably provided by the 'uplift' package loaded above.
+    data <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
+    print(summary(data))
+
+    # Recode treat to 0/1 (anything other than 1 becomes 0), then make treatment and response factors.
+    data$treat <- ifelse(data$treat == 1, 1, 0)
+    data$treat <- as.factor(data$treat)
+    data$y <- as.factor(data$y)
+    data <- as.h2o(data)
+
+    x <- c("X1", "X2", "X3", "X4", "X5", "X6")
+    y <- "y"
+    treat <- "treat"
+
+    # Formats each element of the list as "(v1,v2,...)" and joins the elements with commas.
+    pretty.list <- function(ll) {
+        str <- lapply(ll, function(x) { paste0("(", paste(x, collapse = ","), ")", sep = "") })
+        paste(str, collapse = ",")
+    }
+    ntrees_opts <- c(1, 5)
+    max_depth_opts <- c(2, 5)
+    uplift_metric_opts <- c("KL", "Euclidean", "ChiSquared")
+    # Expected number of models: 2 * 2 * 3 = 12 combinations.
+    size_of_hyper_space <- length(ntrees_opts) * length(max_depth_opts) * length(uplift_metric_opts)
+
+    hyper_parameters <- list(ntrees=ntrees_opts, max_depth=max_depth_opts, uplift_metric=uplift_metric_opts)
+    Log.info(paste("UpliftDRF grid with the following hyper_parameters:", pretty.list(hyper_parameters)))
+    gg <- h2o.grid("upliftdrf", grid_id="upliftdrf_grid_test", x=x, y=y, training_frame=data, treatment_column=treat, hyper_params=hyper_parameters)
+
+    # Get models
+    gg_models <- lapply(gg@model_ids, function(mid) {
+        model <- h2o.getModel(mid)
+    })
+    # Check expected number of models
+    print(paste(length(gg@model_ids), "==", size_of_hyper_space))
+    expect_equal(length(gg_models), size_of_hyper_space)
+
+    # Check parameters coverage
+    # ntrees
+    expect_model_param(gg_models, "ntrees", ntrees_opts)
+
+    # max depth
+    expect_model_param(gg_models, "max_depth", max_depth_opts)
+
+    # uplift metric
+    expect_model_param(gg_models, "uplift_metric", uplift_metric_opts)
+
+    cat("\n\n Grid search results:")
+    print(gg)
+
+    # Test grid sorting: fetch the same grid sorted by AUUC in both directions.
+    ascending <- h2o.getGrid(grid_id=gg@grid_id, sort_by="auuc", decreasing=FALSE)
+    descending <- h2o.getGrid(grid_id=gg@grid_id, sort_by="auuc", decreasing=TRUE)
+
+    ascending_model_ids <- ascending@model_ids
+    descending_model_ids <- descending@model_ids
+
+    # Both orderings must contain every model, and one must be the exact reverse of the other.
+    expect_equal(length(ascending_model_ids), length(descending_model_ids))
+    expect_equal(length(ascending_model_ids), size_of_hyper_space)
+    expect_equal(rev(ascending_model_ids), descending_model_ids)
+}
+
+# Register and run the test via the doTest helper.
+doTest("UpliftDRF Grid Search: iteration over parameters", check.uplift.grid)
+