Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-15791 UpliftDRF Grid search #15948

Merged
merged 5 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion h2o-algos/src/main/java/hex/schemas/UpliftDRFV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static final class UpliftDRFParametersV3 extends SharedTreeV3.SharedTreeP
is_mutually_exclusive_with = {"ignored_columns","response_column", "weights_column"})
public String treatment_column;

@API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"})
@API(help = "Divergence metric used to find best split when building an uplift tree.", level = API.Level.secondary, values = { "AUTO", "KL", "Euclidean", "ChiSquared"}, gridable = true)
public UpliftDRFParameters.UpliftMetricType uplift_metric;

@API(help = "Metric used to calculate Area Under Uplift Curve.", level = API.Level.secondary, values = { "AUTO", "qini", "lift", "gain"})
Expand Down
36 changes: 36 additions & 0 deletions h2o-algos/src/test/java/hex/grid/GridTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import hex.grid.HyperSpaceWalker.BaseWalker.WalkerFactory;
import hex.tree.CompressedTree;
import hex.tree.gbm.GBMModel;
import hex.tree.uplift.UpliftDRFModel;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Rule;
Expand Down Expand Up @@ -1026,4 +1027,39 @@ public void test_parallel_grid_cancelled_on_consecutive_model_failures() {

}

/**
 * Runs a grid search over {@code _ntrees} (3 values) and {@code _max_depth} (2 values)
 * for UpliftDRF and expects one model per combination, i.e. 6 models.
 */
@Test
public void testUpliftDrfGridSearch() {
  try {
    Scope.enter();

    // Parse the uplift sample data and turn the treatment and response columns into categoricals.
    final Frame trainingFrame = Scope.track(parseTestFile("smalldata/uplift/criteo_uplift_13k.csv"));
    trainingFrame.toCategoricalCol("treatment");
    trainingFrame.toCategoricalCol("conversion");
    // Publish the modified frame and track it again after the column conversions.
    DKV.put(trainingFrame);
    Scope.track(trainingFrame);

    // Hyper-parameter space: 3 x 2 combinations.
    final HashMap<String, Object[]> hyperSpace = new HashMap<String, Object[]>();
    hyperSpace.put("_ntrees", new Integer[]{5, 10, 20});
    hyperSpace.put("_max_depth", new Integer[]{5, 10});
    final int expectedModelCount = 6;

    // Fixed (non-searched) model parameters.
    final UpliftDRFModel.UpliftDRFParameters baseParams = new UpliftDRFModel.UpliftDRFParameters();
    baseParams._train = trainingFrame._key;
    baseParams._ignored_columns = new String[]{"visit", "exposure"};
    baseParams._treatment_column = "treatment";
    baseParams._response_column = "conversion";
    baseParams._seed = 0xDECAF;

    final Job<Grid> gridJob = GridSearch.create(null, baseParams, hyperSpace)
        .withMaxConsecutiveFailures(10)
        .withParallelism(3)
        .start();
    Scope.track_generic(gridJob);

    final Grid resultGrid = gridJob.get();
    Scope.track_generic(resultGrid);

    assertEquals(expectedModelCount, resultGrid.getModelCount());
  } finally {
    Scope.exit();
  }
}

}
5 changes: 4 additions & 1 deletion h2o-core/src/main/java/hex/Model.java
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,9 @@ public boolean haveMojo() {
public GridSortBy getDefaultGridSortBy() {
if (! isSupervised())
return null;
else if (_output.nclasses() > 1)
else if (_output.hasTreatment()){
return GridSortBy.AUUC;
} else if (_output.nclasses() > 1)
return GridSortBy.LOGLOSS;
else
return GridSortBy.RESDEV;
Expand All @@ -319,6 +321,7 @@ public static class GridSortBy { // intentionally not an enum to allow 3rd party
public static final GridSortBy LOGLOSS = new GridSortBy("logloss", false);
public static final GridSortBy RESDEV = new GridSortBy("residual_deviance", false);
public static final GridSortBy R2 = new GridSortBy("r2", true);
// AUUC (Area Under Uplift Curve) is a higher-is-better metric, so it is sorted in
// decreasing order (like R2) to put the best model first by default.
public static final GridSortBy AUUC = new GridSortBy("auuc", true);

public final String _name;
public final boolean _decreasing;
Expand Down
11 changes: 9 additions & 2 deletions h2o-py/h2o/grid/grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,15 @@ def build_model(self, algo_params):
training_frame = algo_params.pop("training_frame")
validation_frame = algo_params.pop("validation_frame", None)
is_auto_encoder = (algo_params is not None) and ("autoencoder" in algo_params and algo_params["autoencoder"])
is_uplift = (algo_params is not None) and ("treatment_column" in algo_params and algo_params["treatment_column"])
if is_auto_encoder and y is not None:
raise ValueError("y should not be specified for autoencoder.")
if self.model.supervised_learning:
if y is None:
raise ValueError("Missing response")
elif is_uplift:
y = y if y in training_frame.names else training_frame.names[y]
self.model._estimator_type = "binomial_uplift"
else:
y = y if y in training_frame.names else training_frame.names[y]
self.model._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
Expand All @@ -396,11 +400,12 @@ def _model_build(self, x, y, tframe, vframe, kwargs):
if y is not None: kwargs['response_column'] = y
if not is_type(x, list, tuple): x = [x]
if is_type(x[0], int):
x = [tframe.names[i] for i in x]
x = [tframe.names[i] for i in x]
offset = kwargs["offset_column"]
folds = kwargs["fold_column"]
weights = kwargs["weights_column"]
ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
treatment = kwargs["treatment_column"] if "treatment_column" in kwargs else None
ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights, treatment]))
kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
kwargs = {k: H2OEstimator._keyify(kwargs[k]) for k in kwargs}
if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
Expand Down Expand Up @@ -1482,6 +1487,8 @@ def _metrics_class(model_json):
model_class = H2ODimReductionGridSearch
elif model_type == "AnomalyDetection":
model_class = H2OBinomialGridSearch
elif model_type == "BinomialUplift":
model_class = H2OBinomialGridSearch
else:
raise NotImplementedError(model_type)
return model_class
Expand Down
24 changes: 24 additions & 0 deletions h2o-py/h2o/grid/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,3 +1271,27 @@ def mean_per_class_error(self, train=False, valid=False, xval=False):

class H2ORegressionGridSearch(object):
pass

#-----------------------------------------------------------------------------------------------------------------------
# Binomial Uplift Grid Search
#-----------------------------------------------------------------------------------------------------------------------


class H2OOBinomialUpliftGridSearch(object):
    """
    Per-model metric accessors for a grid of binomial uplift models.

    Every method iterates over ``self.models`` and returns a dict that maps each
    model's ``model_id`` to the result of calling the same-named method on that model.
    """

    def auuc(self, train=False, valid=False):
        """
        Collect ``auuc`` from every model in the grid.

        :param bool train: forwarded as the first argument of ``model.auuc``.
        :param bool valid: forwarded as the second argument of ``model.auuc``.
        :returns: dict ``{model_id: model.auuc(train, valid)}``.
        """
        return {model.model_id: model.auuc(train, valid) for model in self.models}

    def qini(self, train=False, valid=False):
        """
        Collect ``qini`` from every model in the grid.

        :param bool train: forwarded as the first argument of ``model.qini``.
        :param bool valid: forwarded as the second argument of ``model.qini``.
        :returns: dict ``{model_id: model.qini(train, valid)}``.
        """
        return {model.model_id: model.qini(train, valid) for model in self.models}

    def ate(self, train=False, valid=False):
        """
        Collect ``ate`` from every model in the grid.

        :param bool train: forwarded as the first argument of ``model.ate``.
        :param bool valid: forwarded as the second argument of ``model.ate``.
        :returns: dict ``{model_id: model.ate(train, valid)}``.
        """
        return {model.model_id: model.ate(train, valid) for model in self.models}

    def att(self, train=False, valid=False):
        """
        Collect ``att`` from every model in the grid.

        :param bool train: forwarded as the first argument of ``model.att``.
        :param bool valid: forwarded as the second argument of ``model.att``.
        :returns: dict ``{model_id: model.att(train, valid)}``.
        """
        return {model.model_id: model.att(train, valid) for model in self.models}

    def atc(self, train=False, valid=False):
        """
        Collect ``atc`` from every model in the grid.

        :param bool train: forwarded as the first argument of ``model.atc``.
        :param bool valid: forwarded as the second argument of ``model.atc``.
        :returns: dict ``{model_id: model.atc(train, valid)}``.
        """
        return {model.model_id: model.atc(train, valid) for model in self.models}


# Correctly spelled alias. The original name (with the doubled "O") is kept so that
# any existing reference to it keeps working.
H2OBinomialUpliftGridSearch = H2OOBinomialUpliftGridSearch


5 changes: 3 additions & 2 deletions h2o-py/tests/pyunit_utils/utilsPY.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,19 +734,20 @@ def make_random_grid_space(algo, ncols=None, nrows=None):
:return: a dictionary of parameter_name:list_of_values
"""
grid_space = {}
if algo in ["gbm", "rf"]:
if algo in ["gbm", "rf", "uplift"]:
if random.randint(0,1): grid_space['ntrees'] = random.sample(list(range(1,6)),random.randint(2,3))
if random.randint(0,1): grid_space['max_depth'] = random.sample(list(range(1,6)),random.randint(2,3))
if random.randint(0,1): grid_space['min_rows'] = random.sample(list(range(1,11)),random.randint(2,3))
if random.randint(0,1): grid_space['nbins'] = random.sample(list(range(2,21)),random.randint(2,3))
if random.randint(0,1): grid_space['nbins_cats'] = random.sample(list(range(2,1025)),random.randint(2,3))

if algo == "gbm":
if random.randint(0,1): grid_space['learn_rate'] = [random.random() for _ in range(random.randint(2,3))]
grid_space['distribution'] = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'tweedie', 'gamma'], 1)
if algo == "rf":
if random.randint(0,1): grid_space['mtries'] = random.sample(list(range(1,ncols+1)),random.randint(2,3))
if random.randint(0,1): grid_space['sample_rate'] = [random.random() for r in range(random.randint(2,3))]
if algo == "uplift":
grid_space['uplift_metric'] = random.sample(['KL','ChiSquared','Euclidean'], 1)
elif algo == "km":
grid_space['k'] = random.sample(list(range(1,10)),random.randint(2,3))
if random.randint(0,1): grid_space['max_iterations'] = random.sample(list(range(1,1000)),random.randint(2,3))
Expand Down
76 changes: 76 additions & 0 deletions h2o-py/tests/testdir_algos/uplift/pyunit_uplift_rf_grid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import print_function
import sys
sys.path.insert(1,"../../../")
import h2o
from tests import pyunit_utils
import copy
from h2o.estimators.uplift_random_forest import H2OUpliftRandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch


def grid_uplift_drf():
    """
    Build Uplift DRF grids over a randomly generated hyper-parameter space and verify them.

    Checks performed:
      * the type of the first model in the grid is ``"binomial_uplift"``;
      * the number of models equals the size of the hyper-parameter space;
      * duplicating the hyper-parameter values does not change the number of models;
      * the requested hyper-parameter values were used (``pyunit_utils.expect_model_param``).
    """
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/uplift/upliftml_train.csv"))
    # Random train/validation split with a fixed seed.
    r = data[0].runif(seed=42)
    train = data[r > .2]
    valid = data[r <= .2]

    grid_space = pyunit_utils.make_random_grid_space(algo="uplift")
    print("Grid space: {0}".format(grid_space))

    predictors = ["feature_" + str(x) for x in range(1, 13)]
    response_col = "outcome"
    treatment_col = "treatment"
    true_model_type = "binomial_uplift"

    print("Predictors: {0}".format(predictors))
    print("Response: {0}".format(response_col))

    # Response and treatment columns are converted to factors in both frames.
    train[response_col] = train[response_col].asfactor()
    valid[response_col] = valid[response_col].asfactor()
    train[treatment_col] = train[treatment_col].asfactor()
    valid[treatment_col] = valid[treatment_col].asfactor()

    print("Constructing the grid of uplift drf models...")
    uplift_grid = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=grid_space)
    uplift_grid.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train,
                      validation_frame=valid)

    print("Check correct type value....")
    model_type = uplift_grid[0].type
    assert model_type == true_model_type, "Type of model ({0}) is incorrect, expected value is {1}."\
        .format(model_type, true_model_type)

    print("Performing various checks of the constructed grid...")

    print("Check cardinality of grid, that is, the correct number of models have been created...")
    size_of_grid_space = 1
    for v in grid_space.values():
        size_of_grid_space = size_of_grid_space * len(v)
    actual_size = len(uplift_grid)
    assert size_of_grid_space == actual_size, "Expected size of grid to be {0}, but got {1}".format(
        size_of_grid_space, actual_size)

    print("Duplicate-entries-in-grid-space check")
    new_grid_space = copy.deepcopy(grid_space)
    for name in grid_space.keys():
        # "distribution" is never duplicated; every other hyper-parameter gets its values doubled.
        if name != "distribution":
            new_grid_space[name] = grid_space[name] + grid_space[name]
    print("The new search space: {0}".format(new_grid_space))
    print("Constructing the new grid of uplift drf models...")
    uplift_grid2 = H2OGridSearch(H2OUpliftRandomForestEstimator, hyper_params=new_grid_space)
    uplift_grid2.train(x=predictors, y=response_col, treatment_column=treatment_col, training_frame=train,
                       validation_frame=valid)
    actual_size2 = len(uplift_grid2)
    assert actual_size == actual_size2, "Expected duplicates to be ignored. Without dups grid size: {0}. With dups " \
                                        "size: {1}".format(actual_size, actual_size2)

    print("Check that the hyper_params that were passed to grid, were used to construct the models...")
    for name in grid_space.keys():
        pyunit_utils.expect_model_param(uplift_grid, name, grid_space[name])


# Executed as a script: hand the test to pyunit_utils.standalone_test.
# Otherwise (module imported): call the test function directly.
if __name__ == "__main__":
pyunit_utils.standalone_test(grid_uplift_drf)
else:
grid_uplift_drf()
5 changes: 5 additions & 0 deletions h2o-r/h2o-package/R/grid.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ h2o.grid <- function(algorithm,
dots$x <- x
}
}
if(algorithm %in% c("upliftdrf")){
if(is.null(dots$treatment_column)) {
stop("Must specify treatment column")
}
}
algorithm <- .h2o.unifyAlgoName(algorithm)
model_param_names <- names(dots)
hyper_param_names <- names(hyper_params)
Expand Down
65 changes: 65 additions & 0 deletions h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
library(uplift)

# Builds an UpliftDRF grid over ntrees x max_depth x uplift_metric on simulated data and
# checks the number of models, the coverage of each hyper-parameter, and that sorting the
# grid by "auuc" ascending is the exact reverse of sorting it descending.
check.uplift.grid <- function() {
  # Simulated data set; treatment is recoded to 0/1 and both treatment and response become factors.
  sim <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
  print(summary(sim))

  sim$treat <- as.factor(ifelse(sim$treat == 1, 1, 0))
  sim$y <- as.factor(sim$y)
  frame <- as.h2o(sim)

  predictors <- c("X1", "X2", "X3", "X4", "X5", "X6")
  response <- "y"
  treatment <- "treat"

  # Renders a list of vectors as "(a,b),(c,d)" for logging.
  pretty.list <- function(ll) {
    rendered <- lapply(ll, function(values) paste0("(", paste(values, collapse = ","), ")"))
    paste(rendered, collapse = ",")
  }

  # Hyper-parameter space and its expected size (product of the option counts).
  ntrees_opts <- c(1, 5)
  max_depth_opts <- c(2, 5)
  uplift_metric_opts <- c("KL", "Euclidean", "ChiSquared")
  hyper_parameters <- list(ntrees = ntrees_opts,
                           max_depth = max_depth_opts,
                           uplift_metric = uplift_metric_opts)
  size_of_hyper_space <- length(ntrees_opts) * length(max_depth_opts) * length(uplift_metric_opts)

  Log.info(paste("UpliftDRF grid with the following hyper_parameters:", pretty.list(hyper_parameters)))
  grid <- h2o.grid("upliftdrf",
                   grid_id = "upliftdrf_grid_test",
                   x = predictors,
                   y = response,
                   training_frame = frame,
                   treatment_column = treatment,
                   hyper_params = hyper_parameters)

  # Fetch every model of the grid.
  grid_models <- lapply(grid@model_ids, h2o.getModel)

  # Expected number of models.
  print(paste(length(grid@model_ids), "==", size_of_hyper_space))
  expect_equal(length(grid_models), size_of_hyper_space)

  # Hyper-parameter coverage: ntrees, max_depth, uplift_metric.
  expect_model_param(grid_models, "ntrees", ntrees_opts)
  expect_model_param(grid_models, "max_depth", max_depth_opts)
  expect_model_param(grid_models, "uplift_metric", uplift_metric_opts)

  cat("\n\n Grid search results:")
  print(grid)

  # Grid sorting: ascending order by auuc must be the reverse of descending order.
  sorted_asc <- h2o.getGrid(grid_id = grid@grid_id, sort_by = "auuc", decreasing = FALSE)
  sorted_desc <- h2o.getGrid(grid_id = grid@grid_id, sort_by = "auuc", decreasing = TRUE)

  asc_ids <- sorted_asc@model_ids
  desc_ids <- sorted_desc@model_ids

  expect_equal(length(asc_ids), length(desc_ids))
  expect_equal(length(asc_ids), size_of_hyper_space)
  expect_equal(rev(asc_ids), desc_ids)
}

# Run check.uplift.grid through doTest under the given test description.
doTest("UpliftDRF Grid Search: iteration over parameters", check.uplift.grid)