From ee6a03a66de610ff15be68072da8014666e6b556 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Wed, 22 Nov 2023 15:53:42 +0100 Subject: [PATCH 1/5] Add grid search junit test, edit gridable parameters --- .../main/java/hex/schemas/UpliftDRFV3.java | 2 +- .../src/test/java/hex/grid/GridTest.java | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java b/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java index a711e0c2e7c1..c72ab8b8675b 100644 --- a/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java +++ b/h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java @@ -53,7 +53,7 @@ public static final class UpliftDRFParametersV3 extends SharedTreeV3.SharedTreeP is_mutually_exclusive_with = {"ignored_columns","response_column", "weights_column"}) public String treatment_column; - @API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"}) + @API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"}, gridable = true) public UpliftDRFParameters.UpliftMetricType uplift_metric; @API(help = "Metric used to calculate Area Under Uplift Curve.", level = API.Level.secondary, values = { "AUTO", "qini", "lift", "gain"}) diff --git a/h2o-algos/src/test/java/hex/grid/GridTest.java b/h2o-algos/src/test/java/hex/grid/GridTest.java index 10dbf7e1aa19..21c58a7f07bd 100644 --- a/h2o-algos/src/test/java/hex/grid/GridTest.java +++ b/h2o-algos/src/test/java/hex/grid/GridTest.java @@ -9,6 +9,7 @@ import hex.grid.HyperSpaceWalker.BaseWalker.WalkerFactory; import hex.tree.CompressedTree; import hex.tree.gbm.GBMModel; +import hex.tree.uplift.UpliftDRFModel; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; @@ -1026,4 +1027,39 @@ public void 
test_parallel_grid_cancelled_on_consecutive_model_failures() { } + @Test + public void testUpliftDrfGridSearch() { + try { + Scope.enter(); + + Frame train = Scope.track(parseTestFile("smalldata/uplift/criteo_uplift_13k.csv")); + train.toCategoricalCol("treatment"); + train.toCategoricalCol("conversion"); + DKV.put(train); + Scope.track(train); + + HashMap hyperParms = new HashMap() {{ + put("_ntrees", new Integer[]{5, 10, 20}); + put("_max_depth", new Integer[]{5, 10}); + }}; + + UpliftDRFModel.UpliftDRFParameters params = new UpliftDRFModel.UpliftDRFParameters(); + params._train = train._key; + params._ignored_columns = new String[]{"visit", "exposure"}; + params._treatment_column = "treatment"; + params._response_column = "conversion"; + params._seed = 0xDECAF; + + Job gs = GridSearch.create(null, params, hyperParms).withMaxConsecutiveFailures(10) + .withParallelism(3) + .start(); + Scope.track_generic(gs); + final Grid grid = gs.get(); + Scope.track_generic(grid); + assertEquals(6, grid.getModelCount()); + } finally { + Scope.exit(); + } + } + } From 395efdfd11637b746287764fdaba1a47f53dfdd2 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Mon, 4 Dec 2023 14:52:39 +0100 Subject: [PATCH 2/5] Implement python test --- h2o-core/src/main/java/hex/Model.java | 5 +- h2o-py/h2o/grid/grid_search.py | 11 ++- h2o-py/h2o/grid/metrics.py | 24 ++++++ h2o-py/tests/pyunit_utils/utilsPY.py | 5 +- .../uplift/pyunit_uplift_rf_grid.py | 76 +++++++++++++++++++ 5 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 2123d5096b8d..11ddc13a921f 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -309,7 +309,9 @@ public boolean haveMojo() { public GridSortBy getDefaultGridSortBy() { if (! 
isSupervised()) return null; - else if (_output.nclasses() > 1) + else if (_output.hasTreatment()){ + return GridSortBy.AUUC; + } else if (_output.nclasses() > 1) return GridSortBy.LOGLOSS; else return GridSortBy.RESDEV; @@ -319,6 +321,7 @@ public static class GridSortBy { // intentionally not an enum to allow 3rd party public static final GridSortBy LOGLOSS = new GridSortBy("logloss", false); public static final GridSortBy RESDEV = new GridSortBy("residual_deviance", false); public static final GridSortBy R2 = new GridSortBy("r2", true); + public static final GridSortBy AUUC = new GridSortBy("auuc", false); public final String _name; public final boolean _decreasing; diff --git a/h2o-py/h2o/grid/grid_search.py b/h2o-py/h2o/grid/grid_search.py index 2b3f72d1f985..657a85a3859c 100644 --- a/h2o-py/h2o/grid/grid_search.py +++ b/h2o-py/h2o/grid/grid_search.py @@ -377,11 +377,15 @@ def build_model(self, algo_params): training_frame = algo_params.pop("training_frame") validation_frame = algo_params.pop("validation_frame", None) is_auto_encoder = (algo_params is not None) and ("autoencoder" in algo_params and algo_params["autoencoder"]) + is_uplift = (algo_params is not None) and ("treatment_column" in algo_params and algo_params["treatment_column"]) if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.") if self.model.supervised_learning: if y is None: raise ValueError("Missing response") + elif is_uplift: + y = y if y in training_frame.names else training_frame.names[y] + self.model._estimator_type = "binomial_uplift" else: y = y if y in training_frame.names else training_frame.names[y] self.model._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor" @@ -396,11 +400,12 @@ def _model_build(self, x, y, tframe, vframe, kwargs): if y is not None: kwargs['response_column'] = y if not is_type(x, list, tuple): x = [x] if is_type(x[0], int): - x = [tframe.names[i] for i in x] + x = [tframe.names[i] for 
i in x] offset = kwargs["offset_column"] folds = kwargs["fold_column"] weights = kwargs["weights_column"] - ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights])) + treatment = kwargs["treatment_column"] + ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights, treatment])) kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns] kwargs = {k: H2OEstimator._keyify(kwargs[k]) for k in kwargs} if self.grid_id is not None: kwargs["grid_id"] = self.grid_id @@ -1482,6 +1487,8 @@ def _metrics_class(model_json): model_class = H2ODimReductionGridSearch elif model_type == "AnomalyDetection": model_class = H2OBinomialGridSearch + elif model_type == "BinomialUplift": + model_class = H2OBinomialGridSearch else: raise NotImplementedError(model_type) return model_class diff --git a/h2o-py/h2o/grid/metrics.py b/h2o-py/h2o/grid/metrics.py index 2daefb9412bd..5aabf082ae4c 100644 --- a/h2o-py/h2o/grid/metrics.py +++ b/h2o-py/h2o/grid/metrics.py @@ -1271,3 +1271,27 @@ def mean_per_class_error(self, train=False, valid=False, xval=False): class H2ORegressionGridSearch(object): pass + +#----------------------------------------------------------------------------------------------------------------------- +# Binomial Uplift Grid Search +#----------------------------------------------------------------------------------------------------------------------- + + +class H2OOBinomialUpliftGridSearch(object): + + def auuc(self, train=False, valid=False): + return {model.model_id: model.auuc(train, valid) for model in self.models} + + def qini(self, train=False, valid=False): + return {model.model_id: model.qini(train, valid) for model in self.models} + + def ate(self, train=False, valid=False): + return {model.model_id: model.ate(train, valid) for model in self.models} + + def att(self, train=False, valid=False): + return {model.model_id: model.att(train, valid) for model in self.models} + + def 
atc(self, train=False, valid=False): + return {model.model_id: model.atc(train, valid) for model in self.models} + + diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py index 8254bd0e1a2a..ce2a5b0ea97b 100644 --- a/h2o-py/tests/pyunit_utils/utilsPY.py +++ b/h2o-py/tests/pyunit_utils/utilsPY.py @@ -734,19 +734,20 @@ def make_random_grid_space(algo, ncols=None, nrows=None): :return: a dictionary of parameter_name:list_of_values """ grid_space = {} - if algo in ["gbm", "rf"]: + if algo in ["gbm", "rf", "uplift"]: if random.randint(0,1): grid_space['ntrees'] = random.sample(list(range(1,6)),random.randint(2,3)) if random.randint(0,1): grid_space['max_depth'] = random.sample(list(range(1,6)),random.randint(2,3)) if random.randint(0,1): grid_space['min_rows'] = random.sample(list(range(1,11)),random.randint(2,3)) if random.randint(0,1): grid_space['nbins'] = random.sample(list(range(2,21)),random.randint(2,3)) if random.randint(0,1): grid_space['nbins_cats'] = random.sample(list(range(2,1025)),random.randint(2,3)) - if algo == "gbm": if random.randint(0,1): grid_space['learn_rate'] = [random.random() for _ in range(random.randint(2,3))] grid_space['distribution'] = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1) if algo == "rf": if random.randint(0,1): grid_space['mtries'] = random.sample(list(range(1,ncols+1)),random.randint(2,3)) if random.randint(0,1): grid_space['sample_rate'] = [random.random() for r in range(random.randint(2,3))] + if algo == "uplift": + grid_space['uplift_metric'] = random.sample(['KL','ChiSquared','Euclidean'], 1) elif algo == "km": grid_space['k'] = random.sample(list(range(1,10)),random.randint(2,3)) if random.randint(0,1): grid_space['max_iterations'] = random.sample(list(range(1,1000)),random.randint(2,3)) diff --git a/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py new file mode 100644 index 
000000000000..742e087882cc --- /dev/null +++ b/h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py @@ -0,0 +1,76 @@ +from __future__ import print_function +import sys +sys.path.insert(1,"../../../") +import h2o +from tests import pyunit_utils +import copy +from h2o.estimators.uplift_random_forest import H2OUpliftRandomForestEstimator +from h2o.grid.grid_search import H2OGridSearch + + +def grid_uplift_drf(): + + data = h2o.import_file(path=pyunit_utils.locate("smalldata/uplift/upliftml_train.csv")) + r = data[0].runif(seed=42) + train = data[r > .2] + valid = data[r <= .2] + + grid_space = pyunit_utils.make_random_grid_space(algo="uplift") + print("Grid space: {0}".format(grid_space)) + + predictors = ["feature_"+str(x) for x in range(1,13)] + response_col = "outcome" + treatment_col = "treatment" + true_model_type = "binomial_uplift" + + print("Predictors: {0}".format(predictors)) + print("Response: {0}".format(response_col)) + + train[response_col] = train[response_col].asfactor() + valid[response_col] = valid[response_col].asfactor() + train[treatment_col] = train[treatment_col].asfactor() + valid[treatment_col] = valid[treatment_col].asfactor() + + print("Constructing the grid of uplift drf models...") + uplift_grid = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=grid_space) + uplift_grid.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train, + validation_frame=valid) + + print("Check correct type value....") + model_type = uplift_grid[0].type + assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}."\ + .format(model_type, true_model_type) + + print("Performing various checks of the constructed grid...") + + print("Check cardinality of grid, that is, the correct number of models have been created...") + size_of_grid_space = 1 + for v in list(grid_space.values()): + size_of_grid_space = size_of_grid_space * len(v) + actual_size = len(uplift_grid) + assert 
size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}".format( + size_of_grid_space, actual_size) + + print("Duplicate-entries-in-grid-space check") + new_grid_space = copy.deepcopy(grid_space) + for name in list(grid_space.keys()): + if not name == "distribution": + new_grid_space[name] = grid_space[name] + grid_space[name] + print("The new search space: {0}".format(new_grid_space)) + print("Constructing the new grid of gbm models...") + uplift_grid2 = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=new_grid_space) + uplift_grid2.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train, + validation_frame=valid) + actual_size2 = len(uplift_grid2) + assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \ + "size: {1}".format(actual_size, actual_size2) + + print("Check that the hyper_params that were passed to grid, were used to construct the models...") + for name in list(grid_space.keys()): + pyunit_utils.expect_model_param(uplift_grid, name, grid_space[name]) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(grid_uplift_drf) +else: + grid_uplift_drf() From 74941e74ffcc26891069ab4bd58da4ec6b9bd180 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Tue, 5 Dec 2023 13:40:08 +0100 Subject: [PATCH 3/5] Implement R test --- h2o-r/h2o-package/R/grid.R | 8 +++ .../uplift/runit_uplift_rf_grid.R | 65 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R diff --git a/h2o-r/h2o-package/R/grid.R b/h2o-r/h2o-package/R/grid.R index 9c56ee526669..f90cc0b28236 100644 --- a/h2o-r/h2o-package/R/grid.R +++ b/h2o-r/h2o-package/R/grid.R @@ -64,6 +64,7 @@ h2o.grid <- function(algorithm, x, y, training_frame, + treatment_column = NULL, ..., hyper_params = list(), is_supervised = NULL, @@ -108,6 +109,13 @@ h2o.grid <- function(algorithm, dots$x <- x } } + if(algorithm 
%in% c("upliftdrf")){ + if(missing(treatment_column)) { + stop("Must specify treatment column") + } else { + dots$treatment_column <- treatment_column + } + } algorithm <- .h2o.unifyAlgoName(algorithm) model_param_names <- names(dots) hyper_param_names <- names(hyper_params) diff --git a/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R new file mode 100644 index 000000000000..c5ad362e5ee8 --- /dev/null +++ b/h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R @@ -0,0 +1,65 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") +library(uplift) + +check.uplift.grid <- function() { + data <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4) + print(summary(data)) + + data$treat <- ifelse(data$treat == 1, 1, 0) + data$treat <- as.factor(data$treat) + data$y <- as.factor(data$y) + data <- as.h2o(data) + + x <- c("X1", "X2", "X3", "X4", "X5", "X6") + y <- "y" + treat <- "treat" + + pretty.list <- function(ll) { + str <- lapply(ll, function(x) { paste0("(", paste(x, collapse = ","), ")", sep = "") }) + paste(str, collapse = ",") + } + ntrees_opts <- c(1, 5) + max_depth_opts <- c(2, 5) + uplift_metric_opts <- c("KL", "Euclidean", "ChiSquared") + size_of_hyper_space <- length(ntrees_opts) * length(max_depth_opts) * length(uplift_metric_opts) + + hyper_parameters <- list(ntrees=ntrees_opts, max_depth=max_depth_opts, uplift_metric=uplift_metric_opts) + Log.info(paste("UpliftDRF grid with the following hyper_parameters:", pretty.list(hyper_parameters))) + gg <- h2o.grid("upliftdrf", grid_id="upliftdrf_grid_test", x=x, y=y, training_frame=data, treatment_column=treat, hyper_params=hyper_parameters) + + # Get models + gg_models <- lapply(gg@model_ids, function(mid) { + model <- h2o.getModel(mid) + }) + # Check expected number of models + print(paste(length(gg@model_ids), "==", size_of_hyper_space)) + 
expect_equal(length(gg_models), size_of_hyper_space) + + # Check parameters coverage + # ntrees + expect_model_param(gg_models, "ntrees", ntrees_opts) + + # Max depth + expect_model_param(gg_models, "max_depth", max_depth_opts) + + # uplift metric + expect_model_param(gg_models, "uplift_metric", uplift_metric_opts) + + cat("\n\n Grid search results:") + print(gg) + + # Test grid sorting + ascending <- h2o.getGrid(grid_id=gg@grid_id, sort_by="auuc", decreasing=FALSE) + descending <- h2o.getGrid(grid_id=gg@grid_id, sort_by="auuc", decreasing=TRUE) + + ascending_model_ids <- ascending@model_ids + descending_model_ids <- descending@model_ids + + expect_equal(length(ascending_model_ids), length(descending_model_ids)) + expect_equal(length(ascending_model_ids), size_of_hyper_space) + expect_equal(rev(ascending_model_ids), descending_model_ids) +} + +doTest("UpliftDRF Grid Search: iteration over parameters", check.uplift.grid) + From 11b5117b8baf42cb1bd19e66e5ac0c9ad518ade9 Mon Sep 17 00:00:00 2001 From: Veronika Maurerova Date: Tue, 5 Dec 2023 16:23:56 +0100 Subject: [PATCH 4/5] fix python grid --- h2o-py/h2o/grid/grid_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-py/h2o/grid/grid_search.py b/h2o-py/h2o/grid/grid_search.py index 657a85a3859c..2dadc3c2c421 100644 --- a/h2o-py/h2o/grid/grid_search.py +++ b/h2o-py/h2o/grid/grid_search.py @@ -404,7 +404,7 @@ def _model_build(self, x, y, tframe, vframe, kwargs): offset = kwargs["offset_column"] folds = kwargs["fold_column"] weights = kwargs["weights_column"] - treatment = kwargs["treatment_column"] + treatment = kwargs["treatment_column"] if "treatment_column" in kwargs else None ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights, treatment])) kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns] kwargs = {k: H2OEstimator._keyify(kwargs[k]) for k in kwargs} From 82bf4e82fca41edddd169d535853501a0acbd8e6 Mon Sep 17
00:00:00 2001 From: Veronika Maurerova Date: Wed, 6 Dec 2023 10:23:11 +0100 Subject: [PATCH 5/5] Fix treatment_column parameter --- h2o-r/h2o-package/R/grid.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/h2o-r/h2o-package/R/grid.R b/h2o-r/h2o-package/R/grid.R index f90cc0b28236..de8ba03a4ff0 100644 --- a/h2o-r/h2o-package/R/grid.R +++ b/h2o-r/h2o-package/R/grid.R @@ -64,7 +64,6 @@ h2o.grid <- function(algorithm, x, y, training_frame, - treatment_column = NULL, ..., hyper_params = list(), is_supervised = NULL, @@ -110,10 +109,8 @@ h2o.grid <- function(algorithm, } } if(algorithm %in% c("upliftdrf")){ - if(missing(treatment_column)) { + if(is.null(dots$treatment_column)) { stop("Must specify treatment column") - } else { - dots$treatment_column <- treatment_column } } algorithm <- .h2o.unifyAlgoName(algorithm)