From 47225b8e84fd5ab34e00f7c3873f99c78a19bbde Mon Sep 17 00:00:00 2001
From: Veronika Maurerova
Date: Fri, 12 Jan 2024 09:26:29 +0100
Subject: [PATCH 1/8] Implement Uplift cross validation

---
 .../main/java/hex/tree/uplift/UpliftDRF.java  |   8 +-
 .../java/hex/tree/uplift/UpliftDRFTest.java   |  23 ++
 h2o-core/src/main/java/hex/AUUC.java          | 333 +++++++++++++++++-
 h2o-core/src/main/java/hex/ModelBuilder.java  |   2 +-
 .../java/hex/ModelMetricsBinomialUplift.java  |  12 +-
 5 files changed, 368 insertions(+), 10 deletions(-)

diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java
index 66b5cd9259d5..9cd70e4722c9 100644
--- a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java
+++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java
@@ -97,11 +97,9 @@ public ModelCategory[] can_build() {
         if (hasOffsetCol())
             error("_offset_column", "Offsets are not yet supported for Uplift DRF.");
         if (hasWeightCol())
-            error("_weight_column", "Weights are not yet supported for Uplift DRF.");
-        if (hasFoldCol())
-            error("_fold_column", "Cross-validation is not yet supported for Uplift DRF.");
-        if (_parms._nfolds > 0)
-            error("_nfolds", "Cross-validation is not yet supported for Uplift DRF.");
+            if(!_parms._weights_column.equals("__internal_cv_weights__")) {
+                error("_weight_column", "Weights are not yet supported for Uplift DRF.");
+            }
         if (_nclass == 1)
             error("_distribution", "UpliftDRF currently support binomial classification problems only.");
         if (_nclass > 2 || _parms._distribution.equals(DistributionFamily.multinomial))
diff --git a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
index eccc3c4b72c8..2194fb62157f 100644
--- a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
+++ b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
@@ -350,6 +350,29 @@ public void testBasicTrainSupportEarlyStoppingQini() {
             Scope.exit();
         }
     }
+
+    @Test
+    public void testBasicTrainSupportCV() {
+        try {
+            Scope.enter();
+            Frame train = generateFrame();
+            int ntrees = 100;
+            UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
+            p._train = train._key;
+            p._treatment_column = "treatment";
+            p._response_column = "conversion";
+            p._ntrees = ntrees;
+            p._score_each_iteration = true;
+            p._nfolds = 3;
+
+            UpliftDRF udrf = new UpliftDRF(p);
+            UpliftDRFModel model = udrf.trainModel().get();
+            Scope.track_generic(model);
+            assertNotNull(model);
+        } finally {
+            Scope.exit();
+        }
+    }

     @Test
     public void testMaxDepthZero() {
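The weight-column guard above exists because H2O's generic cross-validation machinery injects a hidden 0/1 weight vector per fold, so Uplift DRF can keep rejecting user-supplied weights while letting that one internal column through. The sketch below only illustrates the fold-mask-to-weights idea; the class and method names are hypothetical, not H2O API.

    import java.util.Arrays;
    import java.util.Random;

    public class CvWeightsSketch {
        // Build a 0/1 weight vector for one CV fold: rows assigned to the
        // holdout fold get weight 0 for training, everything else weight 1.
        static double[] foldTrainWeights(int[] foldAssignment, int holdoutFold) {
            double[] w = new double[foldAssignment.length];
            for (int i = 0; i < w.length; i++)
                w[i] = foldAssignment[i] == holdoutFold ? 0.0 : 1.0;
            return w;
        }

        public static void main(String[] args) {
            Random rnd = new Random(42);
            int[] folds = new int[10];
            for (int i = 0; i < folds.length; i++) folds[i] = rnd.nextInt(3); // 3 folds
            // Weights used to train the model that is validated on fold 0:
            System.out.println(Arrays.toString(foldTrainWeights(folds, 0)));
        }
    }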
diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java
index 83a637f05b19..03f7e420e521 100644
--- a/h2o-core/src/main/java/hex/AUUC.java
+++ b/h2o-core/src/main/java/hex/AUUC.java
@@ -79,6 +79,10 @@ public AUUC(AUUCBuilder bldr, AUUCType auucType) {
         this(bldr, true, auucType);
     }
+
+    public AUUC(AUUCBuilder2 bldr, AUUCType auucType) {
+        this(bldr, true, auucType);
+    }
+
     public AUUC(double[] customThresholds, Vec probs, Vec y, Vec uplift, AUUCType auucType) {
         this(new AUUCImpl(customThresholds).doAll(probs, y, uplift)._bldr, auucType);
     }
@@ -146,6 +150,70 @@ public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
             _upliftNormalized = null;
         }
     }
+
+    public AUUC(AUUCBuilder2 bldr, boolean trueProbabilities, AUUCType auucType) {
+        _auucType = auucType;
+        _auucTypeIndx = getIndexByAUUCType(_auucType);
+        _nBins = bldr._n;
+        //assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins;
+        if (_nBins > 0) {
+            assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used";
+            _n = bldr._frequency[_nBins-1];
+            _ths = Arrays.copyOf(bldr._thresholds, _nBins);
+            _treatment = Arrays.copyOf(bldr._treatment, _nBins);
+            _control = Arrays.copyOf(bldr._control, _nBins);
+            _yTreatment = Arrays.copyOf(bldr._yTreatment, _nBins);
+            _yControl = Arrays.copyOf(bldr._yControl, _nBins);
+            _frequency = Arrays.copyOf(bldr._frequency, _nBins);
+            _frequencyCumsum = Arrays.copyOf(bldr._frequency, _nBins);
+            _uplift = new double[AUUCType.values().length][_nBins];
+            _upliftRandom = new double[AUUCType.values().length][_nBins];
+            _upliftNormalized = new double[AUUCType.values().length][_nBins];
+
+            // Rollup counts
+            long tmpt = 0, tmpc = 0, tmptp = 0, tmpcp = 0, tmpf = 0;
+            for (int i = 0; i < _nBins; i++) {
+                tmpt += _treatment[i];
+                _treatment[i] = tmpt;
+                tmpc += _control[i];
+                _control[i] = tmpc;
+                tmptp += _yTreatment[i];
+                _yTreatment[i] = tmptp;
+                tmpcp += _yControl[i];
+                _yControl[i] = tmpcp;
+                tmpf += _frequencyCumsum[i];
+                _frequencyCumsum[i] = tmpf;
+            }
+
+            // these methods need to be called in this order
+            setUplift();
+            setUpliftRandom();
+            setUpliftNormalized();
+
+            if (trueProbabilities) {
+                _auucs = computeAuucs();
+                _auucsRandom = computeAuucsRandom();
+                _aecu = computeAecu();
+                _auucsNormalized = computeAuucsNormalized();
+                _maxIdx = _auucType.maxCriterionIdx(this);
+            } else {
+                _maxIdx = 0;
+            }
+        } else {
+            _maxIdx = -1;
+            _n = 0;
+            _ths = null;
+            _treatment = null;
+            _control = null;
+            _yTreatment = null;
+            _yControl = null;
+            _frequency = null;
+            _frequencyCumsum = null;
+            _uplift = null;
+            _upliftRandom = null;
+            _upliftNormalized = null;
+        }
+    }

     public void setUplift(){
         for(int i=0; i= 0 ) { // Found already in histogram; merge results
-            _n++;
             _frequency[idx]++;
             if(treatment == 1){
                 _treatment[idx]++;
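The "Rollup counts" loop in the constructor above converts per-bin counts into cumulative counts in place. A self-contained sketch of that prefix-sum step:

    import java.util.Arrays;

    public class RollupSketch {
        // In-place prefix sum: counts[i] becomes the number of rows in bins 0..i,
        // mirroring the "Rollup counts" loop in the AUUC constructor above.
        static void rollup(long[] counts) {
            long acc = 0;
            for (int i = 0; i < counts.length; i++) {
                acc += counts[i];
                counts[i] = acc;
            }
        }

        public static void main(String[] args) {
            long[] perBin = {5, 3, 0, 2};
            rollup(perBin);
            System.out.println(Arrays.toString(perBin)); // [5, 8, 8, 10]
        }
    }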
@@ -570,7 +556,6 @@ public void perRow(double pred, double w, double y, float treatment) {
             if (ssx == idx-1 || ssx == idx)
                 _ssx = -1; // We don't know the minimum anymore
             double k = _frequency[idx];
-            _n++;
             _frequency[idx]++;
             if(treatment == 1){
                 _treatment[idx]++;
@@ -583,7 +568,7 @@ public void perRow(double pred, double w, double y, float treatment) {
                     _yControl[idx]++;
                 }
             }
-            _thresholds[idx] = combine_centers(_thresholds[idx], k, pred, w);
+            _thresholds[idx] = combineCenters(_thresholds[idx], k, pred, w);
             return;
         }
     }
@@ -604,9 +589,8 @@ public void perRow(double pred, double w, double y, float treatment) {
         System.arraycopy(_frequency,idx,_frequency,idx+1,_n-idx);
         // Insert into the histogram
         _thresholds[idx] = pred; // New histogram center
-        _n++;
         _frequency[idx]++;
-        if(treatment == 1){
+        if(treatment == 1) {
             _treatment[idx]++;
             if(y == 1){
                 _yTreatment[idx]++;
@@ -617,8 +601,10 @@ public void perRow(double pred, double w, double y, float treatment) {
                 _yControl[idx]++;
             }
         }
-        if( _n > _nBins ) // Merge as needed back down to nBins
-            mergeOneBin(); // Merge best pair of bins
+        _n++;
+        if( _n > _nBins ) { // Merge as needed back down to nBins
+            mergeOneBin(); // Merge best pair of bins
+        }
     }

     public void reduce(AUUC.AUUCBuilder2 bldr) {
@@ -647,7 +633,7 @@ public void reduce(AUUC.AUUCBuilder2 bldr) {
         mergeOneBin();
     }

-    static double combine_centers(double ths1, double n1, double ths0, double n0) {
+    static double combineCenters(double ths1, double n1, double ths0, double n0) {
         double center = (ths0 * n0 + ths1 * n1) / (n0 + n1);
         if (Double.isNaN(center) || Double.isInfinite(center)) {
             // use a simple average as a fallback
@@ -665,13 +651,13 @@ private void mergeOneBin( ) {
         // centers based on counts.
         double k0 = _frequency[ssx];
         double k1 = _frequency[ssx+1];
-        _thresholds[ssx] = combine_centers(_thresholds[ssx], k0, _thresholds[ssx+1], k1);
+        _thresholds[ssx] = combineCenters(_thresholds[ssx], k0, _thresholds[ssx+1], k1);
         _treatment[ssx] += _treatment[ssx+1];
         _control[ssx] += _control[ssx+1];
         _yTreatment[ssx] += _yTreatment[ssx+1];
         _yControl[ssx] += _yControl[ssx+1];
         _frequency[ssx] += _frequency[ssx+1];
-        int n = (int) _n;
+        int n = _n;
         // Slide over to crush the removed bin at index (ssx+1)
         System.arraycopy(_thresholds,ssx+2,_thresholds,ssx+1,n-ssx-2);
         System.arraycopy(_treatment,ssx+2,_treatment,ssx+1,n-ssx-2);
diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java b/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java
index ccbd0510001a..16288aec4790 100644
--- a/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java
+++ b/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java
@@ -134,14 +134,20 @@ private static class UpliftBinomialMetrics extends MRTask
     String[] domain;
     double[] thresholds;
     public MetricBuilderBinomialUplift _mb;
+    int nbins;

     public UpliftBinomialMetrics(String[] domain, double[] thresholds) {
         this.domain = domain;
         this.thresholds = thresholds;
     }
+
+    public UpliftBinomialMetrics(String[] domain, int nbins) {
+        this.domain = domain;
+        this.nbins = nbins;
+    }

     @Override public void map(Chunk[] chks) {
-        _mb = new MetricBuilderBinomialUplift(domain, thresholds);
+        _mb = new MetricBuilderBinomialUplift(domain, thresholds, nbins);
         Chunk uplift = chks[0];
         Chunk actuals = chks[1];
         Chunk treatment = chks[2];
@@ -165,11 +171,12 @@ public static class MetricBuilderBinomialUplift extends MetricBuilderSupervised<
     public double _sumTETreatment;
     public long _treatmentCount;

-    public MetricBuilderBinomialUplift( String[] domain, double[] thresholds) {
+    public MetricBuilderBinomialUplift( String[] domain, double[] thresholds, int nbins) {
         super(2,domain);
         if(thresholds != null) {
             _auuc = new AUUC.AUUCBuilder(thresholds);
-            _auuc2 = new AUUC.AUUCBuilder2(400);
+        } else {
+            _auuc2 = new AUUC.AUUCBuilder2(nbins);
         }
     }
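For reference, the combine_centers/combineCenters logic changed above merges two bin centers as a count-weighted mean and falls back to a plain average when the weighted mean degenerates (both counts zero, giving NaN or infinity). A standalone sketch with a worked example (the values are illustrative):

    public class CombineCentersSketch {
        // Count-weighted mean of two bin centers; falls back to a simple average
        // when the weighted mean is undefined (e.g., n0 + n1 == 0).
        static double combineCenters(double ths1, double n1, double ths0, double n0) {
            double center = (ths0 * n0 + ths1 * n1) / (n0 + n1);
            if (Double.isNaN(center) || Double.isInfinite(center)) {
                return (ths0 + ths1) / 2;
            }
            return center;
        }

        public static void main(String[] args) {
            System.out.println(combineCenters(0.8, 3, 0.6, 1)); // (0.6*1 + 0.8*3) / 4 = ~0.75
            System.out.println(combineCenters(0.8, 0, 0.6, 0)); // 0/0 is NaN -> fallback, ~0.7
        }
    }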
From c83d0e315eb71c1833229288d641bdcfdcf37fbc Mon Sep 17 00:00:00 2001
From: Veronika Maurerova
Date: Sun, 5 May 2024 16:09:17 +0200
Subject: [PATCH 3/8] Implement AUUC reduce method correctly

---
 .../main/java/hex/generic/GenericModel.java   |   2 +-
 .../main/java/hex/tree/SharedTreeModel.java   |   2 +-
 .../java/hex/tree/uplift/UpliftDRFModel.java  |   2 +-
 .../java/hex/tree/uplift/UpliftDRFTest.java   |  37 ---
 h2o-core/src/main/java/hex/AUUC.java          | 302 ++++--------------
 .../java/hex/ModelMetricsBinomialUplift.java  |  36 +--
 h2o-core/src/test/java/hex/AUUCTest.java      |   2 +-
 7 files changed, 81 insertions(+), 302 deletions(-)

diff --git a/h2o-algos/src/main/java/hex/generic/GenericModel.java b/h2o-algos/src/main/java/hex/generic/GenericModel.java
index 15875f68a396..7da80bea0d2f 100644
--- a/h2o-algos/src/main/java/hex/generic/GenericModel.java
+++ b/h2o-algos/src/main/java/hex/generic/GenericModel.java
@@ -137,7 +137,7 @@ public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
         case AnomalyDetection:
             return new ModelMetricsAnomaly.MetricBuilderAnomaly();
         case BinomialUplift:
-            return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, null, _parms._auuc_nbins);
+            return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, null, _parms._auuc_nbins,AUUC.calculateProbs(_parms._auuc_nbins));
         default:
             throw H2O.unimpl();
     }
diff --git a/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java b/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java
index 340cf4ddf70c..054a930cb961 100755
--- a/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java
+++ b/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java
@@ -166,7 +166,7 @@ public boolean forceStrictlyReproducibleHistograms() {
       case Binomial: return new ModelMetricsBinomial.MetricBuilderBinomial(domain);
       case Multinomial: return new ModelMetricsMultinomial.MetricBuilderMultinomial(_output.nclasses(),domain, _parms._auc_type);
       case Regression: return new ModelMetricsRegression.MetricBuilderRegression();
-      case BinomialUplift: return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, ((UpliftDRFModel.UpliftDRFOutput)_output)._defaultAuucThresholds, _parms._auuc_nbins);
+      case BinomialUplift: return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, ((UpliftDRFModel.UpliftDRFOutput)_output)._defaultAuucThresholds, _parms._auuc_nbins, AUUC.calculateProbs( _parms._auuc_nbins));
      default: throw H2O.unimpl();
     }
   }
diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java
index ba0b6f3711e6..f865758481fe 100644
--- a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java
+++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java
@@ -84,7 +84,7 @@ public void initActualParamValues() {
  }

  @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
-    return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, _output._defaultAuucThresholds, _parms._auuc_nbins);
+    return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, _output._defaultAuucThresholds, _parms._auuc_nbins, AUUC.calculateProbs(_parms._auuc_nbins));
  }

  @Override
diff --git a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
index 8c0e53cd4dd6..85497b11a92a 100644
--- a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
+++ b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
@@ -155,43 +155,6 @@ public void testBasicTrainErrorDoNotSupportMultinomialResponseColumn() {
         }
     }
-
-    @Test(expected = H2OModelBuilderIllegalArgumentException.class)
-    public void testBasicTrainErrorDoNotSupportNfolds() {
-        try {
-            Scope.enter();
-            Frame train = generateFrame();
-            UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
-            p._train = train._key;
-            p._treatment_column = "treatment";
-            p._response_column = "conversion";
-            p._nfolds = 10;
-
-            UpliftDRF udrf = new UpliftDRF(p);
-            udrf.trainModel().get();
-        } finally {
-            Scope.exit();
-        }
-    }
-
-    @Test(expected = H2OModelBuilderIllegalArgumentException.class)
-    public void testBasicTrainErrorDoNotSupportFoldColumn() {
-        try {
-            Scope.enter();
-            Frame train = generateFrame();
-            UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
-            p._train = train._key;
-            p._treatment_column = "treatment";
-            p._response_column = "conversion";
-            p._fold_column = "C0";
-
-            UpliftDRF udrf = new UpliftDRF(p);
-            udrf.trainModel().get();
-        } finally {
-            Scope.exit();
-        }
-    }

     @Test(expected = H2OModelBuilderIllegalArgumentException.class)
     public void testBasicTrainErrorDoNotSupportOffset() {
         try {
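The AUUC.calculateProbs(_parms._auuc_nbins) argument now threaded through all three makeMetricBuilder implementations plausibly produces the quantile levels that an inline loop used to build (it is removed in the next file's diff: probs[i] = (groups - i - 1)/groups, i.e. 0.9 ... 0.0 for 10 groups). The method body itself is not visible in this excerpt, so the sketch below is an assumption based on that removed loop:

    import java.util.Arrays;

    public class QuantileProbsSketch {
        // Quantile levels for nbins groups, mirroring the removed inline loop:
        // probs[i] = (groups - i - 1) / groups, e.g. 0.9 ... 0.0 for 10 groups.
        static double[] calculateProbs(int groups) {
            double[] probs = new double[groups];
            for (int i = 0; i < groups; ++i)
                probs[i] = (groups - i - 1.0) / groups;
            return probs;
        }

        public static void main(String[] args) {
            System.out.println(Arrays.toString(calculateProbs(5))); // [0.8, 0.6, 0.4, 0.2, 0.0]
        }
    }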
diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java
index 41a8b5601661..2847788cdd16 100644
--- a/h2o-core/src/main/java/hex/AUUC.java
+++ b/h2o-core/src/main/java/hex/AUUC.java
@@ -71,26 +71,22 @@ public double[] upliftRandomByType(AUUCType type){
         return idx < 0 ? null : _upliftRandom[idx];
     }

-    public AUUC(Vec probs, Vec y, Vec uplift, AUUCType auucType, int nbins) {
-        this(new AUUCImpl(calculateQuantileThresholds(nbins, probs)).doAll(probs, y, uplift)._bldr, auucType);
+    public AUUC(Vec preds, Vec y, Vec uplift, AUUCType auucType, int nbins, double[] probs) {
+        this(new AUUCImpl(calculateQuantileThresholds(probs, preds), nbins, probs).doAll(preds, y, uplift)._bldr, auucType);
     }

     public AUUC(AUUCBuilder bldr, AUUCType auucType) {
         this(bldr, true, auucType);
     }
-
-    public AUUC(AUUCBuilder2 bldr, AUUCType auucType) {
-        this(bldr, true, auucType);
-    }
-
-    public AUUC(double[] customThresholds, Vec probs, Vec y, Vec uplift, AUUCType auucType) {
-        this(new AUUCImpl(customThresholds).doAll(probs, y, uplift)._bldr, auucType);
+
+    public AUUC(double[] customThresholds, Vec preds, Vec y, Vec uplift, AUUCType auucType, int nbins, double[] probs) {
+        this(new AUUCImpl(customThresholds, nbins, probs).doAll(preds, y, uplift)._bldr, auucType);
     }

     public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
         _auucType = auucType;
         _auucTypeIndx = getIndexByAUUCType(_auucType);
-        _nBins = bldr._nBins;
+        _nBins = bldr._nbins;
         //assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins;
         if (_nBins > 0) {
             assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used";
@@ -150,70 +146,6 @@ public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
             _upliftNormalized = null;
         }
     }
-
-    public AUUC(AUUCBuilder2 bldr, boolean trueProbabilities, AUUCType auucType) {
-        _auucType = auucType;
-        _auucTypeIndx = getIndexByAUUCType(_auucType);
-        _nBins = bldr._n;
-        //assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins;
-        if (_nBins > 0) {
-            assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used";
-            _n = bldr._frequency[_nBins-1];
-            _ths = Arrays.copyOf(bldr._thresholds, _nBins);
-            _treatment = Arrays.copyOf(bldr._treatment, _nBins);
-            _control = Arrays.copyOf(bldr._control, _nBins);
-            _yTreatment = Arrays.copyOf(bldr._yTreatment, _nBins);
-            _yControl = Arrays.copyOf(bldr._yControl, _nBins);
-            _frequency = Arrays.copyOf(bldr._frequency, _nBins);
-            _frequencyCumsum = Arrays.copyOf(bldr._frequency, _nBins);
-            _uplift = new double[AUUCType.values().length][_nBins];
-            _upliftRandom = new double[AUUCType.values().length][_nBins];
-            _upliftNormalized = new double[AUUCType.values().length][_nBins];
-
-            // Rollup counts
-            long tmpt = 0, tmpc = 0, tmptp = 0, tmpcp = 0, tmpf = 0;
-            for (int i = 0; i < _nBins; i++) {
-                tmpt += _treatment[i];
-                _treatment[i] = tmpt;
-                tmpc += _control[i];
-                _control[i] = tmpc;
-                tmptp += _yTreatment[i];
-                _yTreatment[i] = tmptp;
-                tmpcp += _yControl[i];
-                _yControl[i] = tmpcp;
-                tmpf += _frequencyCumsum[i];
-                _frequencyCumsum[i] = tmpf;
-            }
-
-            // these methods need to be called in this order
-            setUplift();
-            setUpliftRandom();
-            setUpliftNormalized();
-
-            if (trueProbabilities) {
-                _auucs = computeAuucs();
-                _auucsRandom = computeAuucsRandom();
-                _aecu = computeAecu();
-                _auucsNormalized = computeAuucsNormalized();
-                _maxIdx = _auucType.maxCriterionIdx(this);
-            } else {
-                _maxIdx = 0;
-            }
-        } else {
-            _maxIdx = -1;
-            _n = 0;
-            _ths = null;
-            _treatment = null;
-            _control = null;
-            _yTreatment = null;
-            _yControl = null;
-            _frequency = null;
-            _frequencyCumsum = null;
-            _uplift = null;
-            _upliftRandom = null;
-            _upliftNormalized = null;
-        }
-    }

     public void setUplift(){
         for(int i=0; imake(), new String[]{"predictions"}, new Vec[]{preds});
         DKV.put(fr);
         qp._train = fr._key;
-        assert groups > 0;
-        qp._probs = new double[groups];
-        for (int i = 0; i < groups; ++i) {
-            qp._probs[i] = (groups - i - 1.) / groups; // This is 0.9, 0.8, 0.7, 0.6, ..., 0.1, 0 for 10 groups
-        }
+        qp._probs = probs;
         qm = new Quantile(qp).trainModel().get();
         quantiles = qm._output._quantiles[0];
         // find uniques
@@ -405,15 +342,20 @@ public double auucRandom(int idx){
     public double auucNormalized(){ return auucNormalized(_auucTypeIndx); }

     public static class AUUCImpl extends MRTask {
+
+        final int _nbins;
         final double[] _thresholds;
+        final double[] _probs;
         AUUCBuilder _bldr;

-        public AUUCImpl(double[] thresholds) {
+        public AUUCImpl(double[] thresholds, int nbins, double[] probs) {
             _thresholds = thresholds;
+            _nbins = nbins;
+            _probs = probs;
         }

         @Override public void map(Chunk ps, Chunk actuals, Chunk treatment) {
-            AUUCBuilder bldr = _bldr = new AUUCBuilder(_thresholds);
+            AUUCBuilder bldr = _bldr = new AUUCBuilder(_nbins, _thresholds, _probs);
             for( int row = 0; row < ps._len; row++ )
                 if( !ps.isNA(row) && !treatment.isNA(row) )
                     bldr.perRow(ps.atd(row),1, actuals.atd(row), (float) treatment.atd(row));
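AUUCImpl follows H2O's MRTask map/reduce contract: map builds one histogram builder per data chunk, and reduce merges two builders into one. A framework-free sketch of the same pattern (plain Java, no H2O types), assuming bins are defined by shared descending thresholds so that merging is element-wise addition:

    import java.util.Arrays;

    public class MapReduceHistogramSketch {
        static class Builder {
            final double[] thresholds; // descending bin boundaries
            final long[] frequency;    // rows per bin

            Builder(double[] thresholds) {
                this.thresholds = thresholds;
                this.frequency = new long[thresholds.length];
            }

            // perRow: place a prediction into the first bin whose threshold it reaches.
            void perRow(double pred) {
                for (int t = 0; t < thresholds.length; t++) {
                    if (pred >= thresholds[t] && (t == 0 || pred < thresholds[t - 1])) {
                        frequency[t]++;
                        return;
                    }
                }
            }

            // reduce: with identical thresholds, merging is element-wise addition.
            void reduce(Builder other) {
                for (int i = 0; i < frequency.length; i++) frequency[i] += other.frequency[i];
            }
        }

        public static void main(String[] args) {
            double[] ths = {0.8, 0.5, 0.0};
            double[][] chunks = {{0.9, 0.6}, {0.1, 0.95, 0.5}};
            Builder total = Arrays.stream(chunks)
                    .map(chunk -> {
                        Builder b = new Builder(ths);    // map: one builder per chunk
                        for (double p : chunk) b.perRow(p);
                        return b;
                    })
                    .reduce((a, b) -> { a.reduce(b); return a; }) // reduce: merge builders
                    .orElseThrow();
            System.out.println(Arrays.toString(total.frequency)); // [2, 2, 1]
        }
    }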
@@ -425,25 +367,29 @@ public AUUCImpl(double[] thresholds) {
  * Builder to process input data to build histogram in parallel. This builder is used to calculate AUUC quickly.
  */
 public static class AUUCBuilder extends Iced {
-    final int _nBins;
+    final int _nbins;
     final double[]_thresholds; // thresholds
     final long[] _treatment;   // number of data from treatment group
     final long[] _control;     // number of data from control group
     final long[] _yTreatment;  // number of data from treatment group with prediction = 1
     final long[] _yControl;    // number of data from control group with prediction = 1
     final long[] _frequency;   // frequency of data in each bin
-    long _n;
+    double[] _probs;
+    int _n;     // number of data
+    int _nUsed; // number of used bins
     int _ssx;

-    public AUUCBuilder(double[] thresholds) {
-        int nBins = thresholds.length;
-        _nBins = nBins;
-        _thresholds = thresholds;
-        _treatment = new long[nBins];
-        _control = new long[nBins];
-        _yTreatment = new long[nBins];
-        _yControl = new long[nBins];
-        _frequency = new long[nBins];
+    public AUUCBuilder(int nbins, double[] thresholds, double[] probs) {
+        int tlen = thresholds != null ? thresholds.length : 1;
+        _probs = probs;
+        _nbins = nbins;
+        _nUsed = tlen;
+        _thresholds = thresholds == null ? new double[]{0} : thresholds;
+        _treatment = new long[tlen];
+        _control = new long[tlen];
+        _yTreatment = new long[tlen];
+        _yControl = new long[tlen];
+        _frequency = new long[tlen];
         _ssx = -1;
     }
@@ -477,137 +423,8 @@ public void reduce(AUUCBuilder bldr) {
         ArrayUtils.add(_yControl, bldr._yControl);
         ArrayUtils.add(_frequency, bldr._frequency);
     }
-
-    private String toDebugString() {
-        return "n =" +_n +
-            "; nBins = " + _nBins +
-            "; ths = " + Arrays.toString(_thresholds) +
-            "; treatment = " + Arrays.toString(_treatment) +
-            "; contribution = " + Arrays.toString(_control) +
-            "; yTreatment = " + Arrays.toString(_yTreatment) +
-            "; yContribution = " + Arrays.toString(_yControl) +
-            "; frequency = " + Arrays.toString(_frequency);
-    }
-}
-
-/**
- * Builder to process input data to build histogram in parallel. This builder is used to calculate AUUC quickly.
- */
-public static class AUUCBuilder2 extends Iced {
-    final int _nBins;
-    final double[]_thresholds; // thresholds
-    final long[] _treatment;   // number of data from treatment group
-    final long[] _control;     // number of data from control group
-    final long[] _yTreatment;  // number of data from treatment group with prediction = 1
-    final long[] _yControl;    // number of data from control group with prediction = 1
-    final long[] _frequency;   // frequency of data in each bin
-    int _n;
-    int _ssx;
-
-    public AUUCBuilder2(int nBins) {
-        _nBins = nBins;
-        int doubleNBins = 2 * nBins;
-        _thresholds = new double[doubleNBins];
-        _treatment = new long[doubleNBins];
-        _control = new long[doubleNBins];
-        _yTreatment = new long[doubleNBins];
-        _yControl = new long[doubleNBins];
-        _frequency = new long[doubleNBins];
-        _ssx = -1;
-    }
-
-    public void perRow(double pred, double w, double y, float treatment) {
-        // Insert the prediction into the set of histograms in sorted order, as
-        // if it's a new histogram bin with 1 count.
-        assert !Double.isNaN(pred);
-        assert !Double.isNaN(w) && !Double.isInfinite(w);
-        int idx = Arrays.binarySearch(_thresholds,0,_n,pred);
-        if( idx >= 0 ) { // Found already in histogram; merge results
-            _frequency[idx]++;
-            if(treatment == 1){
-                _treatment[idx]++;
-                if(y == 1){
-                    _yTreatment[idx]++;
-                }
-            } else {
-                _control[idx]++;
-                if(y == 1){
-                    _yControl[idx]++;
-                }
-            }
-            _ssx = -1; // Blows the known best merge
-            return;
-        }
-        idx = -idx-1; // Get index to insert at
-
-        // If already full bins, try to instantly merge into an existing bin
-        if (_n == _nBins &&
-            idx > 0 && idx < _n && // Give up for the corner cases
-            _thresholds[idx - 1] != _thresholds[idx]) // Histogram has duplicates (mergeOneBin will get rid of them)
-        { // Need to merge to shrink things
-            final int ssx = find_smallest();
-            double dssx = compute_delta_error(_thresholds[ssx+1], _frequency[ssx+1], _thresholds[ssx], _frequency[ssx]);
-            // See if this point will fold into either the left or right bin
-            // immediately. This is the desired fast-path.
-            double d0 = compute_delta_error(pred,w,_thresholds[idx-1],_frequency[idx-1]);
-            double d1 = compute_delta_error(_thresholds[idx], _frequency[idx],pred,w);
-            if (d0 < dssx || d1 < dssx) {
-                if (d0 <= d1) idx--; // Pick correct bin
-                if (ssx == idx-1 || ssx == idx)
-                    _ssx = -1; // We don't know the minimum anymore
-                double k = _frequency[idx];
-                _frequency[idx]++;
-                if(treatment == 1){
-                    _treatment[idx]++;
-                    if(y == 1){
-                        _yTreatment[idx]++;
-                    }
-                } else {
-                    _control[idx]++;
-                    if(y == 1){
-                        _yControl[idx]++;
-                    }
-                }
-                _thresholds[idx] = combineCenters(_thresholds[idx], k, pred, w);
-                return;
-            }
-        }
-
-        // Must insert this point as it's own threshold (which is not insertion
-        // point), either because we have too few bins or because we cannot
-        // instantly merge the new point into an existing bin.
-        if (idx == 0 || idx == _n || // Just because we didn't bother to deal with the corner cases ^^^
-            idx == _ssx) _ssx = -1;  // Smallest error becomes one of the splits
-        else if( idx < _ssx ) _ssx++; // Smallest error will slide right 1
-
-        // Slide over to do the insert. Horrible slowness.
-        System.arraycopy(_thresholds,idx,_thresholds,idx+1,_n-idx);
-        System.arraycopy(_treatment,idx,_treatment,idx+1,_n-idx);
-        System.arraycopy(_control,idx,_control,idx+1,_n-idx);
-        System.arraycopy(_yTreatment,idx,_yTreatment,idx+1,_n-idx);
-        System.arraycopy(_yControl,idx,_yControl,idx+1,_n-idx);
-        System.arraycopy(_frequency,idx,_frequency,idx+1,_n-idx);
-        // Insert into the histogram
-        _thresholds[idx] = pred; // New histogram center
-        _frequency[idx]++;
-        if(treatment == 1) {
-            _treatment[idx]++;
-            if(y == 1){
-                _yTreatment[idx]++;
-            }
-        } else {
-            _control[idx]++;
-            if(y == 1){
-                _yControl[idx]++;
-            }
-        }
-        _n++;
-        if( _n > _nBins ) { // Merge as needed back down to nBins
-            mergeOneBin(); // Merge best pair of bins
-        }
-    }
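The removed AUUCBuilder2.perRow above leans on Arrays.binarySearch over the used prefix of the sorted threshold array: a non-negative result is an existing bin, and a negative result encodes the insertion point as -(idx)-1. A small demonstration of that decoding:

    import java.util.Arrays;

    public class BinarySearchInsertSketch {
        public static void main(String[] args) {
            double[] ths = {0.1, 0.4, 0.7, 0, 0}; // sorted bin centers, 3 of 5 slots used
            int n = 3;
            // Arrays.binarySearch over the used prefix returns the match index,
            // or (-(insertion point) - 1) when the key is absent.
            int hit  = Arrays.binarySearch(ths, 0, n, 0.4);
            int miss = Arrays.binarySearch(ths, 0, n, 0.5);
            System.out.println(hit);       // 1  -> existing bin, just bump counts
            System.out.println(-miss - 1); // 2  -> insert a new bin at index 2
        }
    }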
-
-    public void reduce(AUUC.AUUCBuilder2 bldr) {
+    public void reduce2(AUUCBuilder bldr) {
         // Merge sort the 2 sorted lists into the double-sized arrays. The tail
         // half of the double-sized array is unused, but the front half is
         // probably a source. Merge into the back.
         int x = _n-1;
         int y = bldr._n-1;
         while( x+y+1 >= 0 ) {
             boolean self_is_larger = y < 0 || (x >= 0 && _thresholds[x] >= bldr._thresholds[y]);
-            AUUC.AUUCBuilder2 b = self_is_larger ? this : bldr;
+            AUUCBuilder b = self_is_larger ? this : bldr;
             int idx = self_is_larger ? x : y ;
             _thresholds[x+y+1] = b._thresholds[idx];
             _treatment[x+y+1] = b._treatment[idx];
             _control[x+y+1] = b._control[idx];
             _yTreatment[x+y+1] = b._yTreatment[idx];
             _yControl[x+y+1] = b._yControl[idx];
             _frequency[x+y+1] = b._frequency[idx];
             if( self_is_larger ) x--; else y--;
         }
         _n += bldr._n;
+        _ssx = -1;
         // Merge elements with least squared-error increase until we get fewer
         // than _nBins and no duplicates. May require many merges.
-        while( _n > _nBins || dups() )
+        while( _n > _nbins || dups() )
             mergeOneBin();
     }

-    static double combineCenters(double ths1, double n1, double ths0, double n0) {
-        double center = (ths0 * n0 + ths1 * n1) / (n0 + n1);
+    static double combineCenters(double ths1, double ths0, double probs, long nrows) {
+        //double center = (ths0 * n0 + ths1 * n1) / (n0 + n1);
+        double center = computeLinearInterpolation(ths1, ths0, nrows, probs);
         if (Double.isNaN(center) || Double.isInfinite(center)) {
             // use a simple average as a fallback
             return (ths0 + ths1) / 2;
@@ -645,13 +464,11 @@ static double combineCenters(double ths1, double n1, double ths0, double n0) {
     private void mergeOneBin( ) {
         // Too many bins; must merge bins. Merge into bins with least total
         // squared error. Horrible slowness linear arraycopy.
-        int ssx = find_smallest();
+        int ssx = findSmallest();
         // Merge two bins. Classic bins merging by averaging the histogram
         // centers based on counts.
-        double k0 = _frequency[ssx];
-        double k1 = _frequency[ssx+1];
-        _thresholds[ssx] = combineCenters(_thresholds[ssx], k0, _thresholds[ssx+1], k1);
+        _thresholds[ssx] = combineCenters(_thresholds[ssx], _thresholds[ssx+1], _probs[ssx], _n);
         _treatment[ssx] += _treatment[ssx+1];
         _control[ssx] += _control[ssx+1];
         _yTreatment[ssx] += _yTreatment[ssx+1];
         _yControl[ssx] += _yControl[ssx+1];
         _frequency[ssx] += _frequency[ssx+1];
@@ -666,6 +483,7 @@ private void mergeOneBin( ) {
         System.arraycopy(_yControl,ssx+2,_yControl,ssx+1,n-ssx-2);
         System.arraycopy(_frequency,ssx+2,_frequency,ssx+1,n-ssx-2);
         _n--;
+        _thresholds[_n] = _treatment[_n] = _control[_n] = _yTreatment[_n] = _yControl[_n] = _frequency[_n] = 0;
         _ssx = -1;
     }
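mergeOneBin collapses bins ssx and ssx+1 by accumulating counts into ssx and then sliding the array tails left with System.arraycopy to crush the freed slot. The same compaction in isolation (single array; the real code moves all parallel arrays together):

    import java.util.Arrays;

    public class MergeBinSketch {
        // Merge bins ssx and ssx+1: accumulate counts into ssx, then slide the
        // tail left by one slot to crush the freed bin, as mergeOneBin does above.
        static int mergeAdjacent(long[] counts, int n, int ssx) {
            counts[ssx] += counts[ssx + 1];
            System.arraycopy(counts, ssx + 2, counts, ssx + 1, n - ssx - 2);
            counts[n - 1] = 0;  // clear the now-unused tail slot
            return n - 1;       // one fewer bin in use
        }

        public static void main(String[] args) {
            long[] freq = {4, 1, 2, 7, 0}; // 4 bins in use, last slot spare
            int n = mergeAdjacent(freq, 4, 1);
            System.out.println(n + " " + Arrays.toString(freq)); // 3 [4, 3, 7, 0, 0]
        }
    }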
@@ -676,15 +494,15 @@ private void mergeOneBin( ) {
     // Find the pair of bins that when combined give the smallest increase in
     // squared error. Dups never increase squared error.
     //
     // I tried code for merging bins with keeping the bins balanced in size,
     // but this leads to bad errors if the probabilities are sorted. Also
     // tried the original: merge bins with the least distance between bin
     // centers. Same problem for sorted data.
-    private int find_smallest() {
+    private int findSmallest() {
         if( _ssx == -1 ) {
-            _ssx = find_smallest_impl();
+            _ssx = findSmallestImpl();
             assert _ssx != -1 : toDebugString();
         }
         return _ssx;
     }

-    private int find_smallest_impl() {
+    private int findSmallestImpl() {
         if (_n == 1)
             return 0;
         // we couldn't find any bins to merge based on SE (the math can be producing Double.Infinity or Double.NaN)
@@ -705,14 +523,14 @@ private int find_smallest_impl() {
     private boolean dups() {
         long n = _n;
         for( int i=0; i
 double[] thresholds;
 public MetricBuilderBinomialUplift _mb;
 int nbins;
+double[] probs;

-public UpliftBinomialMetrics(String[] domain, double[] thresholds) {
+public UpliftBinomialMetrics(String[] domain, double[] thresholds, int nbins, double[] probs) {
     this.domain = domain;
     this.thresholds = thresholds;
-}
-
-public UpliftBinomialMetrics(String[] domain, int nbins) {
-    this.domain = domain;
+    this.probs = probs;
     this.nbins = nbins;
 }

 @Override public void map(Chunk[] chks) {
-    _mb = new MetricBuilderBinomialUplift(domain, thresholds, nbins);
+    _mb = new MetricBuilderBinomialUplift(domain, thresholds, nbins, probs);
     Chunk uplift = chks[0];
     Chunk actuals = chks[1];
     Chunk treatment = chks[2];
@@ -166,18 +165,13 @@ public UpliftBinomialMetrics(String[] domain, int nbins) {
 public static class MetricBuilderBinomialUplift extends MetricBuilderSupervised {
     protected AUUC.AUUCBuilder _auuc;
-    protected AUUC.AUUCBuilder2 _auuc2;
     public double _sumTE;
     public double _sumTETreatment;
     public long _treatmentCount;

-    public MetricBuilderBinomialUplift( String[] domain, double[] thresholds, int nbins) {
+    public MetricBuilderBinomialUplift( String[] domain, double[] thresholds, int nbins, double[] probs) {
         super(2,domain);
-        if(thresholds != null) {
-            _auuc = new AUUC.AUUCBuilder(thresholds);
-        } else {
-            _auuc2 = new AUUC.AUUCBuilder2(nbins);
-        }
+        _auuc = new AUUC.AUUCBuilder(nbins, thresholds, probs);
     }

     @Override public double[] perRow(double[] ds, float[] yact, Model m) {
@@ -203,19 +197,13 @@ public double[] perRow(double[] ds, float[] yact, double weight, double offset,
         if (_auuc != null) {
             _auuc.perRow(treatmentEffect, weight, y, treatmentGroup);
         }
-        if(_auuc2 != null){
-            _auuc2.perRow(treatmentEffect, weight, y, treatmentGroup);
-        }
         return ds;
     }

     @Override public void reduce(MetricBuilderBinomialUplift mb ) {
         super.reduce(mb);
         if(_auuc != null) {
-            // _auuc.reduce(mb._auuc);
-        }
-        if(_auuc2 != null) {
-            _auuc2.reduce(mb._auuc2);
+            _auuc.reduce(mb._auuc);
         }
         _sumTE += mb._sumTE;
         _sumTETreatment += mb._sumTETreatment;
@@ -256,7 +244,7 @@ private ModelMetrics makeModelMetrics(final Model m, final Frame f, final Frame
         AUUC auuc = null;
         if (preds != null) {
             if (resp != null) {
-                auuc = new AUUC(preds.vec(0), resp, treatment, auucType, nbins);
+                auuc = new AUUC(preds.vec(0), resp, treatment, auucType, nbins, AUUC.calculateProbs(nbins));
             }
         }
         return makeModelMetrics(m, f, auuc);
@@ -274,7 +262,7 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, AUUC auuc) {
         if(_wcount > 0) {
             if (auuc == null) {
                 sigma = weightedSigma();
-                auuc = new AUUC(_auuc2, m._parms._auuc_type);
+                auuc = new AUUC(_auuc, m._parms._auuc_type);
             }
             ate = _sumTE/_wcount;
             att = _sumTETreatment/_treatmentCount;
diff --git a/h2o-core/src/test/java/hex/AUUCTest.java b/h2o-core/src/test/java/hex/AUUCTest.java
index 381406bfc601..4f7922f3823c 100644
--- a/h2o-core/src/test/java/hex/AUUCTest.java
+++ b/h2o-core/src/test/java/hex/AUUCTest.java
@@ -94,7 +94,7 @@ private static AUUC doAUUC(int nbins, double[] probs, double[] y, double[] treat
     }
     Frame fr = ArrayUtils.frame(new String[]{"probs", "y", "treatment"}, rows);
     fr.vec("treatment").setDomain(new String[]{"0", "1"});
-    AUUC auuc = new AUUC(fr.vec("probs"),fr.vec("y"), fr.vec("treatment"), type, nbins);
+    AUUC auuc = new AUUC(fr.vec("probs"),fr.vec("y"), fr.vec("treatment"), type, nbins, AUUC.calculateProbs(nbins));
     fr.remove();
     return auuc;
 }

From 5c6869ba42c41292aa7924f32566c6e40cd57d0f Mon Sep 17 00:00:00 2001
From: Veronika Maurerova
Date: Wed, 8 May 2024 15:57:22 +0200
Subject: [PATCH 4/8] Implement AUUC reduce correctly

---
 .../java/hex/tree/uplift/UpliftDRFTest.java |   2 +-
 h2o-core/src/main/java/hex/AUUC.java        | 131 ++++++++++--------
 2 files changed, 77 insertions(+), 56 deletions(-)

diff --git a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
index 85497b11a92a..33c6c5b5d964 100644
--- a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
+++ b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
@@ -353,7 +353,7 @@ public void testSupportCVCriteo() {
         p._treatment_column = "treatment";
         p._response_column = "conversion";
         p._seed = 0xDECAF;
-        p._ntrees = 10;
+        p._ntrees = 11;
         p._score_each_iteration = true;
         p._nfolds = 3;
         p._auuc_nbins = 50;
diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java
index 2847788cdd16..76aa17f722c3 100644
--- a/h2o-core/src/main/java/hex/AUUC.java
+++ b/h2o-core/src/main/java/hex/AUUC.java
@@ -368,7 +368,7 @@ public AUUCImpl(double[] thresholds, int nbins, double[] probs) {
  */
 public static class AUUCBuilder extends Iced {
     final int _nbins;
-    final double[]_thresholds; // thresholds
+    final double[] _thresholds; // thresholds
     final long[] _treatment;  // number of data from treatment group
     final long[] _control;    // number of data from control group
     final long[] _yTreatment; // number of data from treatment group with prediction = 1
     final long[] _yControl;   // number of data from control group with prediction = 1
     final long[] _frequency;  // frequency of data in each bin
@@ -376,25 +376,31 @@ public static class AUUCBuilder extends Iced {
     double[] _probs;
     int _n;         // number of data
-    int _nUsed;     // number of used bins
+    int _nbinsUsed; // number of used bins
     int _ssx;

     public AUUCBuilder(int nbins, double[] thresholds, double[] probs) {
         _probs = probs;
         _nbins = nbins;
         _nbinsUsed = thresholds != null ? thresholds.length : 0;
+        int l = nbins * 2; // maximal possible builder array length
+        _thresholds = new double[l];
+        if (thresholds != null) {
+            System.arraycopy(thresholds, 0, _thresholds, 0, thresholds.length);
+        }
+        _probs = new double[l];
+        System.arraycopy(probs, 0, _probs, 0, probs.length);
+        System.arraycopy(probs, 0, _probs, probs.length-1, probs.length);
+        _treatment = new long[l];
+        _control = new long[l];
+        _yTreatment = new long[l];
+        _yControl = new long[l];
+        _frequency = new long[l];
         _ssx = -1;
     }

     public void perRow(double pred, double w, double y, float treatment) {
-        if (w == 0) {return;}
+        if (w == 0 || _thresholds == null) {return;}
         for(int t = 0; t < _thresholds.length; t++) {
             if (pred >= _thresholds[t] && (t == 0 || pred <_thresholds[t-1])) {
                 _n++;
@@ -416,20 +422,23 @@ public void perRow(double pred, double w, double y, float treatment) {
     }

     public void reduce(AUUCBuilder bldr) {
-        _n += bldr._n;
-        ArrayUtils.add(_treatment, bldr._treatment);
-        ArrayUtils.add(_control, bldr._control);
-        ArrayUtils.add(_yTreatment, bldr._yTreatment);
-        ArrayUtils.add(_yControl, bldr._yControl);
-        ArrayUtils.add(_frequency, bldr._frequency);
+        if(bldr._nbinsUsed == 0) {return;}
+        if(_nbinsUsed == 0 || _thresholds == bldr._thresholds){
+            reduceSameOrNullThresholds(bldr);
+        } else {
+            reduceDifferentThresholds(bldr);
+        }
     }

+    /**
+     * Merge sort the 2 sorted lists into the double-sized arrays. The tail
+     * half of the double-sized array is unused, but the front half is
+     * probably a source. Merge into the back.
+     * @param bldr AUUC builder to reduce
+     */
+    public void reduceDifferentThresholds(AUUCBuilder bldr){
+        int x = _nbinsUsed -1;
+        int y = bldr._nbinsUsed -1;
         while( x+y+1 >= 0 ) {
             boolean self_is_larger = y < 0 || (x >= 0 && _thresholds[x] >= bldr._thresholds[y]);
             AUUCBuilder b = self_is_larger ? this : bldr;
             int idx = self_is_larger ? x : y ;
             _thresholds[x+y+1] = b._thresholds[idx];
             _treatment[x+y+1] = b._treatment[idx];
             _control[x+y+1] = b._control[idx];
             _yTreatment[x+y+1] = b._yTreatment[idx];
             _yControl[x+y+1] = b._yControl[idx];
             _frequency[x+y+1] = b._frequency[idx];
+            _probs[x+y+1] = b._probs[idx];
             if( self_is_larger ) x--; else y--;
         }
         _n += bldr._n;
+        _nbinsUsed += bldr._nbinsUsed;
         _ssx = -1;
         // Merge elements with least squared-error increase until we get fewer
         // than _nBins and no duplicates. May require many merges.
-        while( _n > _nbins || dups() )
+        while( _nbinsUsed > _nbins || dups() )
             mergeOneBin();
     }
+
+    public void reduceSameOrNullThresholds(AUUCBuilder bldr){
+        _n += bldr._n;
+        if(_nbinsUsed == 0) {
+            ArrayUtils.add(_thresholds, bldr._thresholds);
+            _nbinsUsed = bldr._nbinsUsed;
+        }
+        ArrayUtils.add(_treatment, bldr._treatment);
+        ArrayUtils.add(_control, bldr._control);
+        ArrayUtils.add(_yTreatment, bldr._yTreatment);
+        ArrayUtils.add(_yControl, bldr._yControl);
+        ArrayUtils.add(_frequency, bldr._frequency);
+    }

     static double combineCenters(double ths1, double ths0, double probs, long nrows) {
         //double center = (ths0 * n0 + ths1 * n1) / (n0 + n1);
         double center = computeLinearInterpolation(ths1, ths0, nrows, probs);
         if (Double.isNaN(center) || Double.isInfinite(center)) {
             // use a simple average as a fallback
             return (ths0 + ths1) / 2;
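reduceDifferentThresholds merge-sorts the two builders' sorted bin lists from the back of the double-sized arrays, so no unread source entry is ever overwritten. A minimal sketch of that back-to-front merge (thresholds only; the real code above moves all the parallel count arrays together):

    import java.util.Arrays;

    public class MergeSortedBinsSketch {
        // Back-to-front merge of two ascending threshold lists into the first
        // array, which has spare capacity for both (the "double-sized" trick).
        static int mergeInto(double[] a, int na, double[] b, int nb) {
            int x = na - 1, y = nb - 1;
            while (x + y + 1 >= 0) {
                boolean takeA = y < 0 || (x >= 0 && a[x] >= b[y]);
                a[x + y + 1] = takeA ? a[x--] : b[y--];
            }
            return na + nb;
        }

        public static void main(String[] args) {
            double[] a = new double[6]; // 3 slots used + spare tail half
            a[0] = 0.1; a[1] = 0.5; a[2] = 0.9;
            double[] b = {0.2, 0.6, 0.7};
            int n = mergeInto(a, 3, b, 3);
            System.out.println(n + " " + Arrays.toString(a)); // 6 [0.1, 0.2, 0.5, 0.6, 0.7, 0.9]
        }
    }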
@@ -474,26 +498,22 @@ private void mergeOneBin( ) {
         _yTreatment[ssx] += _yTreatment[ssx+1];
         _yControl[ssx] += _yControl[ssx+1];
         _frequency[ssx] += _frequency[ssx+1];
-        int n = _nbinsUsed == 2 ? _nbinsUsed - ssx -1 : _nbinsUsed - ssx -2;
         // Slide over to crush the removed bin at index (ssx+1)
-        System.arraycopy(_thresholds,ssx+2,_thresholds,ssx+1,n);
-        System.arraycopy(_treatment,ssx+2,_treatment,ssx+1,n);
-        System.arraycopy(_control,ssx+2,_control,ssx+1,n);
-        System.arraycopy(_yTreatment,ssx+2,_yTreatment,ssx+1,n);
-        System.arraycopy(_yControl,ssx+2,_yControl,ssx+1,n);
-        System.arraycopy(_frequency,ssx+2,_frequency,ssx+1,n);
+        System.arraycopy(_thresholds,ssx+2,_thresholds,ssx+1,n);
+        System.arraycopy(_treatment,ssx+2,_treatment,ssx+1,n);
+        System.arraycopy(_control,ssx+2,_control,ssx+1,n);
+        System.arraycopy(_yTreatment,ssx+2,_yTreatment,ssx+1,n);
+        System.arraycopy(_yControl,ssx+2,_yControl,ssx+1,n);
+        System.arraycopy(_frequency,ssx+2,_frequency,ssx+1,n);
+        _nbinsUsed--;
         _ssx = -1;
     }

-    // Find the pair of bins that when combined give the smallest increase in
-    // squared error. Dups never increase squared error.
-    //
-    // I tried code for merging bins with keeping the bins balanced in size,
-    // but this leads to bad errors if the probabilities are sorted. Also
-    // tried the original: merge bins with the least distance between bin
-    // centers. Same problem for sorted data.
+    /**
+     * Find the pair of bins that when combined give the smallest difference in thresholds
+     * @return index of the bin where the threshold difference is the smallest
+     */
     private int findSmallest() {
         if( _ssx == -1 ) {
             _ssx = findSmallestImpl();
             assert _ssx != -1 : toDebugString();
         }
         return _ssx;
     }

     private int findSmallestImpl() {
-        if (_n == 1)
+        if (_nbinsUsed == 1)
             return 0;
-        // we couldn't find any bins to merge based on SE (the math can be producing Double.Infinity or Double.NaN)
-        // revert to using a simple distance of the bin centers
         int minI = 0;
-        long n = _n;
+        long n = _nbinsUsed;
         double minDist = _thresholds[1] - _thresholds[0];
         for (int i = 1; i < n - 1; i++) {
             double dist = _thresholds[i + 1] - _thresholds[i];
             if (dist < minDist) {
                 minI = i;
                 minDist = dist;
             }
         }
         return minI;
     }

     private boolean dups() {
-        long n = _n;
+        long n = _nbinsUsed;
         for( int i=0; i
 Date: Wed, 15 May 2024 11:49:40 +0200
Subject: [PATCH 7/8] Fix randomForest tests to pass on master

---
 h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R   | 2 +-
 h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
index cf42dfd27aea..b40357d1ddbf 100644
--- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
+++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_bigcat.R
@@ -2,7 +2,7 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source("../../../scripts/h2o-r-test-setup.R")
-library(randomForest)
+#library(randomForest)

 test.DRF.bigcat <- function() {
     # Training set has 100 categories from cat001 to cat100
diff --git a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
index 395ffa564b51..0582b600afab 100644
--- a/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
+++ b/h2o-r/tests/testdir_algos/randomforest/runit_RF_smallcat.R
@@ -2,7 +2,7 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
 source("../../../scripts/h2o-r-test-setup.R")
-library(randomForest)
+#library(randomForest)

 test.DRF.smallcat <- function() {
     # Training set has 26 categories from A to Z

From 1dfae1392efa2d54aae02b935ce8845a87b52464 Mon Sep 17 00:00:00 2001
From: Veronika Maurerova
Date: Wed, 22 May 2024 11:10:49 +0200
Subject: [PATCH 8/8] Debug tests failing on Jenkins

---
 h2o-core/src/main/java/hex/AUUC.java          | 30 +++++++++++++------
 .../src/main/java/water/util/ArrayUtils.java  |  2 +-
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java
index 76aa17f722c3..594e5c9b1d5f 100644
--- a/h2o-core/src/main/java/hex/AUUC.java
+++ b/h2o-core/src/main/java/hex/AUUC.java
@@ -86,10 +86,8 @@ public AUUC(double[] customThresholds, Vec preds, Vec y, Vec uplift, AUUCType au
     public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
         _auucType = auucType;
         _auucTypeIndx = getIndexByAUUCType(_auucType);
-        _nBins = bldr._nbins;
-        //assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins;
+        _nBins = bldr._nbinsUsed;
         if (_nBins > 0) {
-            assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used";
             _n = bldr._n;
             _ths = Arrays.copyOf(bldr._thresholds, _nBins);
             _treatment = Arrays.copyOf(bldr._treatment, _nBins);
@@ -117,6 +115,9 @@ public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
                 _frequencyCumsum[i] = tmpf;
             }

+            System.out.println(Arrays.toString(_treatment));
+            System.out.println(Arrays.toString(_control));
+
             // these methods need to be called in this order
             setUplift();
             setUpliftRandom();
             setUpliftNormalized();
@@ -148,16 +149,19 @@ public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) {
     }

     public void setUplift(){
-        for(int i=0; i
 0 ? treatment / (double)control : 1;
             return yTreatment - yControl * norm;
         }
     }, lift() {
         @Override double exec(long treatment, long control, long yTreatment, long yControl) {
-            return yTreatment / (double) treatment - yControl / (double)control;
+            if (treatment > 0 && control > 0) {
+                return yTreatment / (double) treatment - yControl / (double) control;
+            } else if (treatment < 0 && control > 0) {
+                return - (yControl / (double) control);
+            } else if (treatment > 0 && control < 0) {
+                return yTreatment / (double) treatment;
+            } else {
+                return Double.NaN;
+            }
         }
     }, gain() {
diff --git a/h2o-core/src/main/java/water/util/ArrayUtils.java b/h2o-core/src/main/java/water/util/ArrayUtils.java
index cc603198cf5e..692244fc275a 100644
--- a/h2o-core/src/main/java/water/util/ArrayUtils.java
+++ b/h2o-core/src/main/java/water/util/ArrayUtils.java
@@ -2248,7 +2248,7 @@ public static double[] uniformDistFromArray(double[][] array, long seed) {
  */
 public static void interpolateLinear(double[] array){
     assert array.length > 0 && !Double.isNaN(array[array.length-1]):
-            "Input array length should be > 0 and the first item should not be NaN";
+            "Input array length should be > 0 and the last item should not be NaN";
     if (array.length == 1){
         return;
     }
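Patch 8's assertion-message fix documents interpolateLinear's real contract: the array must be non-empty and its last element must be non-NaN. The H2O implementation is cut off in this excerpt, so the version below is a hypothetical standalone reading of that contract (interior NaN runs are linearly interpolated between the nearest known neighbors; leading NaNs are assumed to ramp up from 0):

    import java.util.Arrays;

    public class InterpolateSketch {
        // Fill NaN runs by linear interpolation between the nearest known
        // neighbors; assumes the last element is not NaN.
        static void interpolateLinear(double[] a) {
            assert a.length > 0 && !Double.isNaN(a[a.length - 1]);
            int prev = -1; // index of the last known (non-NaN) value
            for (int i = 0; i < a.length; i++) {
                if (Double.isNaN(a[i])) continue;
                double lo = prev < 0 ? 0 : a[prev]; // assumption: leading NaNs start from 0
                int gap = i - prev;
                for (int j = prev + 1; j < i; j++)
                    a[j] = lo + (a[i] - lo) * (j - prev) / gap;
                prev = i;
            }
        }

        public static void main(String[] args) {
            double[] a = {1.0, Double.NaN, Double.NaN, 4.0};
            interpolateLinear(a);
            System.out.println(Arrays.toString(a)); // [1.0, 2.0, 3.0, 4.0]
        }
    }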