h2oai · syzonyuliia · Oct 12, 2023 · Oct 22, 2023 · Oct 22, 2023 · Oct 22, 2023
diff --git a/h2o-algos/src/main/java/hex/schemas/DTV3.java b/h2o-algos/src/main/java/hex/schemas/DTV3.java
@@ -19,6 +19,7 @@ public static final class DTParametersV3 extends ModelParametersSchemaV3<DTModel
                 "categorical_encoding",
                 "response_column",
                 "seed",
+                "distribution",
                 // SDT specific
                 "max_depth",
                 "min_rows"

diff --git a/h2o-algos/src/main/java/hex/tree/dt/CategoricalSplittingRule.java b/h2o-algos/src/main/java/hex/tree/dt/CategoricalSplittingRule.java
@@ -31,7 +31,7 @@ public String toString() {
   // true for left, false for right
   public boolean routeSample(double[] sample) {
     int category = (int) sample[_featureIndex];
-    assert category < _mask.length; // todo: new values in the train set are not supported yet - will be treated as missing values
+    assert category < _mask.length; // new values in the train set are not supported yet - will be treated as missing values
     return _mask[category];
   }
 }
diff --git a/h2o-algos/src/main/java/hex/tree/dt/CompressedDT.java b/h2o-algos/src/main/java/hex/tree/dt/CompressedDT.java
@@ -35,18 +35,18 @@ public CompressedDT(AbstractCompressedNode[] nodes, int leavesCount) {
      */
     public DTPrediction predictRowStartingFromNode(final double[] rowValues, final int actualNodeIndex, String ruleExplanation) {
         boolean isALeaf = _nodes[actualNodeIndex] instanceof CompressedLeaf;
-        // first value 1 means that the node is list, return prediction for the list
+        // the node is a leaf (CompressedLeaf): return the prediction stored in it
         if (isALeaf) {
             double decisionValue = ((CompressedLeaf) _nodes[actualNodeIndex]).getDecisionValue();
-            double probability = ((CompressedLeaf) _nodes[actualNodeIndex]).getProbabilities();
-            return new DTPrediction((int) decisionValue, probability, ruleExplanation + " -> (" 
-                    + decisionValue + ", probabilities: " + probability + ", " + (1 - probability) + ")");
+            double[] probabilities = ((CompressedLeaf) _nodes[actualNodeIndex]).getProbabilities();
+            return new DTPrediction((int) decisionValue, probabilities, 
+                    ruleExplanation + " -> " + _nodes[actualNodeIndex].toString());
         }
         if (!ruleExplanation.isEmpty()) {
             ruleExplanation += " and ";
         }
         AbstractSplittingRule splittingRule = ((CompressedNode) _nodes[actualNodeIndex]).getSplittingRule();
-        // splitting rule is true - left, false - right
+        // splitting rule is: true - left, false - right
         if(splittingRule.routeSample(rowValues)) {
             return predictRowStartingFromNode(rowValues, 2 * actualNodeIndex + 1, 
                     ruleExplanation + splittingRule.toString());
@@ -65,7 +65,7 @@ public int extractRulesStartingWithNode(int nodeIndex, String actualRule, int ne
         if (_nodes[nodeIndex] instanceof CompressedLeaf) {
             // if node is a leaf, add the rule to the list of rules at index given by the nextFreeSpot parameter
             _listOfRules[nextFreeSpot] = actualRule + " -> (" + ((CompressedLeaf) _nodes[nodeIndex]).getDecisionValue()
-                    + ", " + ((CompressedLeaf) _nodes[nodeIndex]).getProbabilities() + ")";
+                    + ", " + Arrays.toString(((CompressedLeaf) _nodes[nodeIndex]).getProbabilities()) + ")";
             // move nextFreeSpot to the next index and return it to be used for other branches
             nextFreeSpot++;
             return nextFreeSpot;

diff --git a/h2o-algos/src/main/java/hex/tree/dt/CompressedLeaf.java b/h2o-algos/src/main/java/hex/tree/dt/CompressedLeaf.java
@@ -1,27 +1,32 @@
 package hex.tree.dt;
 
 
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
 public class CompressedLeaf extends AbstractCompressedNode {
     private final double _decisionValue;
-    private final double _probability;
+    private final double[] _probabilities;
 
 
-    public CompressedLeaf(double decisionValue, double probabilities) {
+    public CompressedLeaf(double decisionValue, double[] probabilities) {
         super();
         _decisionValue = decisionValue;
-        _probability = probabilities;
+        _probabilities = probabilities;
     }
 
     public double getDecisionValue() {
         return _decisionValue;
     }
 
-    public double getProbabilities() {
-        return _probability;
+    public double[] getProbabilities() {
+        return _probabilities;
     }
 
     @Override
     public String toString() {
-        return "(leaf: " + _decisionValue + ", " + _probability + ", " + (1- _probability) + ")";
+        return Arrays.stream(_probabilities)
+                .mapToObj(Double::toString)
+                .collect(Collectors.joining(", ", "(leaf: " + _decisionValue + "; ", ")"));
     }
 }
diff --git a/h2o-algos/src/main/java/hex/tree/dt/DT.java b/h2o-algos/src/main/java/hex/tree/dt/DT.java
@@ -8,7 +8,6 @@
 import hex.tree.dt.binning.Histogram;
 import hex.tree.dt.mrtasks.GetClassCountsMRTask;
 import hex.tree.dt.mrtasks.ScoreDTTask;
-import org.apache.commons.math3.util.Precision;
 import org.apache.log4j.Logger;
 import water.DKV;
 import water.exceptions.H2OModelBuilderIllegalArgumentException;
@@ -19,7 +18,7 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static hex.tree.dt.binning.SplitStatistics.entropyBinarySplit;
+import static hex.tree.dt.binning.SplitStatistics.entropyMulticlass;
 
 /**
  * Decision Tree
@@ -49,8 +48,6 @@ public class DT extends ModelBuilder<DTModel, DTModel.DTParameters, DTModel.DTOu
 
     private DTModel _model;
     transient Random _rand;
-
-    //    private final static int LIMIT_NUM_ROWS_FOR_SPLIT = 2; // todo - make a parameter with default value
     public final static double EPSILON = 1e-6;
     public final static double MIN_IMPROVEMENT = 1e-6;
     private static final Logger LOG = Logger.getLogger(DT.class);
@@ -108,10 +105,9 @@ private AbstractSplittingRule findBestSplit(Histogram histogram) {
 
     private AbstractSplittingRule findBestSplitForFeature(Histogram histogram, int featureIndex) {
         return (_train.vec(featureIndex).isNumeric()
-                ? histogram.calculateSplitStatisticsForNumericFeature(featureIndex)
-                : histogram.calculateSplitStatisticsForCategoricalFeature(featureIndex))
+                ? histogram.calculateSplitStatisticsForNumericFeature(featureIndex, _nclass)
+                : histogram.calculateSplitStatisticsForCategoricalFeature(featureIndex, _nclass))
                 .stream()
-                // todo - consider setting min count of samples in bin instead of filtering splits
                 .filter(binStatistics -> ((binStatistics._leftCount >= _min_rows)
                         && (binStatistics._rightCount >= _min_rows)))
                 .peek(binStatistics -> Log.debug("split: " + binStatistics._splittingRule + ", counts: "
@@ -128,7 +124,7 @@ private AbstractSplittingRule findBestSplitForFeature(Histogram histogram, int f
 
 
     private static double calculateCriterionOfSplit(SplitStatistics binStatistics) {
-        return binStatistics.binaryEntropy();
+        return binStatistics.splitEntropy();
     }
 
     /**
@@ -139,7 +135,7 @@ private static double calculateCriterionOfSplit(SplitStatistics binStatistics) {
      */
     private int selectDecisionValue(int[] countsByClass) {
         if (_nclass == 1) {
-            return countsByClass[0];
+            return 0;
         }
         int currentMaxClass = 0;
         int currentMax = countsByClass[currentMaxClass];
@@ -155,10 +151,10 @@ private int selectDecisionValue(int[] countsByClass) {
     /**
      * Calculates probabilities of each class for a leaf.
      *
-     * @param countsByClass counts of 0 and 1 in a leaf
-     * @return probabilities of 0 or 1
+     * @param countsByClass counts of each class in a leaf
+     * @return probabilities of each class
      */
-    private double[] calculateProbability(int[] countsByClass) {
+    private double[] calculateProbabilities(int[] countsByClass) {
         int samplesCount = Arrays.stream(countsByClass).sum();
         return Arrays.stream(countsByClass).asDoubleStream().map(n -> n / samplesCount).toArray();
     }
@@ -171,7 +167,7 @@ private double[] calculateProbability(int[] countsByClass) {
      * @param nodeIndex     node index
      */
     public void makeLeafFromNode(int[] countsByClass, int nodeIndex) {
-        _tree[nodeIndex] = new CompressedLeaf(selectDecisionValue(countsByClass), calculateProbability(countsByClass)[0]);
+        _tree[nodeIndex] = new CompressedLeaf(selectDecisionValue(countsByClass), calculateProbabilities(countsByClass));
         _leavesCount++;
         // nothing to return, node is modified inplace
     }
@@ -200,16 +196,19 @@ public void buildNextNode(Queue<DataFeaturesLimits> limitsQueue, int nodeIndex)
         // [count0, count1, ...]
         int[] countsByClass = countClasses(actualLimits);
         if (nodeIndex == 0) {
-            Log.info("Classes counts in dataset: 0 - " + countsByClass[0] + ", 1 - " + countsByClass[1]);
+            Log.info(IntStream.range(0, countsByClass.length)
+                    .mapToObj(i -> i + " - " + countsByClass[i])
+                    .collect(Collectors.joining(", ", "Classes counts in dataset: ", "")));
         }
         // compute node depth
         int nodeDepth = (int) Math.floor(MathUtils.log2(nodeIndex + 1));
-        // stop building from this node, the node will be a leaf
-        if ((nodeDepth >= _parms._max_depth)
-                || (countsByClass[0] <= _min_rows)
-                || (countsByClass[1] <= _min_rows)
-//                || zeroRatio > 0.999 || zeroRatio < 0.001
-        ) {
+        // stop building from this node, the node will be a leaf if:
+        // - max depth is reached
+        // - at most one class has a non-zero count in countsByClass (nothing left to separate)
+        // - the node holds fewer than _min_rows data points
+        if ((nodeDepth >= _parms._max_depth) 
+                || Arrays.stream(countsByClass).filter(c -> c > 0).count() < 2 
+                || Arrays.stream(countsByClass).sum() < _min_rows) {
             // add imaginary left and right children to imitate valid tree structure
             // left child
             limitsQueue.add(null);
@@ -219,10 +218,10 @@ public void buildNextNode(Queue<DataFeaturesLimits> limitsQueue, int nodeIndex)
             return;
         }
 
-        Histogram histogram = new Histogram(_train, actualLimits, BinningStrategy.EQUAL_WIDTH/*, minNumSamplesInBin - todo consider*/);
+        Histogram histogram = new Histogram(_train, actualLimits, BinningStrategy.EQUAL_WIDTH, _nclass);
 
         AbstractSplittingRule bestSplittingRule = findBestSplit(histogram);
-        double criterionForTheParentNode = entropyBinarySplit(1.0 * countsByClass[0] / (countsByClass[0] + countsByClass[1]));
+        double criterionForTheParentNode = entropyMulticlass(countsByClass, Arrays.stream(countsByClass).sum());
-        // if no split could be found, make a list from current node
+        // if no split could be found, make a leaf from current node
         // if the information gain is low, make a leaf from current node
         if (bestSplittingRule == null
@@ -291,9 +290,6 @@ private void dtChecks() {
             if (!_response.isCategorical()) {
                 error("_response", "Only categorical response is supported");
             }
-            if (!_response.isBinary()) {
-                error("_response", "Only binary response is supported");
-            }
         }
 
         @Override
@@ -365,7 +361,7 @@ public BuilderVisibility builderVisibility() {
     public ModelCategory[] can_build() {
         return new ModelCategory[]{
                 ModelCategory.Binomial,
-//                ModelCategory.Multinomial,
+                ModelCategory.Multinomial,
 //                                            ModelCategory.Ordinal,
 //                ModelCategory.Regression
         };

diff --git a/h2o-algos/src/main/java/hex/tree/dt/DTModel.java b/h2o-algos/src/main/java/hex/tree/dt/DTModel.java
@@ -4,7 +4,6 @@
 import org.apache.log4j.Logger;
 import water.*;
 
-import java.util.Arrays;
 
 public class DTModel extends Model<DTModel, DTModel.DTParameters, DTModel.DTOutput> {
 
@@ -36,10 +35,10 @@ protected double[] score0(double[] data, double[] preds) {
         // compute score for given point
         CompressedDT tree = DKV.getGet(_output._treeKey);
         DTPrediction prediction = tree.predictRowStartingFromNode(data, 0, "");
-        // for now, only pred. for class 0 is stored, will be improved later
         preds[0] = prediction.classPrediction;
-        preds[1] = prediction.probability;
-        preds[2] = 1 - prediction.probability;
+        // preds layout: [predicted class, P(class 0), ..., P(class n-1)]
+        System.arraycopy(prediction.probabilities, 0, preds, 1,
+                prediction.probabilities.length);
 
         return preds;
     }

diff --git a/h2o-algos/src/main/java/hex/tree/dt/DTPrediction.java b/h2o-algos/src/main/java/hex/tree/dt/DTPrediction.java
@@ -2,12 +2,12 @@
 
 public class DTPrediction {
     public int classPrediction;
-    public double probability;
+    public double[] probabilities;
     public String ruleExplanation;
 
-    public DTPrediction(int classPrediction, double probability, String ruleExplanation) {
+    public DTPrediction(int classPrediction, double[] probabilities, String ruleExplanation) {
         this.classPrediction = classPrediction;
-        this.probability = probability;
+        this.probabilities = probabilities;
         this.ruleExplanation = ruleExplanation;
     }
 }
diff --git a/h2o-algos/src/main/java/hex/tree/dt/binning/AbstractBin.java b/h2o-algos/src/main/java/hex/tree/dt/binning/AbstractBin.java
@@ -5,11 +5,11 @@
- * Single bin holding limits (min excluded), count of samples and count of class 0.
+ * Single bin holding limits (min excluded), count of samples and counts of samples per class.
  */
 public abstract class AbstractBin {
-    public int _count0;
+    public int[] _classesDistribution;
     public int _count;
 
-    public int getCount0() {
-        return _count0;
+    public int getClassCount(int i) {
+        return _classesDistribution[i];
     }
 
     public abstract AbstractBin clone();