apache · smurching · Oct 4, 2017 · Oct 4, 2017 · Oct 4, 2017 · Oct 4, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -276,14 +276,10 @@ private[tree] class LearningNode(
       new InternalNode(stats.impurityCalculator.predict, stats.impurity, stats.gain,
         leftChild.get.toNode, rightChild.get.toNode, split.get, stats.impurityCalculator)
     } else {
-      if (stats.valid) {
-        new LeafNode(stats.impurityCalculator.predict, stats.impurity,
-          stats.impurityCalculator)
-      } else {
-        // Here we want to keep same behavior with the old mllib.DecisionTreeModel
-        new LeafNode(stats.impurityCalculator.predict, -1.0, stats.impurityCalculator)
-      }
-
+      assert(stats != null, "Unknown error during Decision Tree learning. Could not convert " +
+        "LearningNode to Node")
+      new LeafNode(stats.impurityCalculator.predict, stats.impurity,
+        stats.impurityCalculator)
     }
   }
 
@@ -334,7 +330,7 @@ private[tree] object LearningNode {
       id: Int,
       isLeaf: Boolean,
       stats: ImpurityStats): LearningNode = {
-    new LearningNode(id, None, None, None, false, stats)
+    new LearningNode(id, None, None, None, isLeaf, stats)
   }
 
   /** Create an empty node with the given node index.  Values must be set later on. */
@@ -404,4 +400,5 @@ private[tree] object LearningNode {
     tmpNode
   }
 
+
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/AggUpdateUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/AggUpdateUtils.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import org.apache.spark.ml.tree.Split
+
+/**
+ * Helpers for updating DTStatsAggregators during collection of sufficient stats for tree training.
+ */
+private[impl] object AggUpdateUtils {
+
+  /**
+   * Adds the (label, weight) of every row referenced by indices[from, to) to the parent-node
+   * statistics of the given aggregator.
+   * @param indices Array of row indices for feature values; indices(i) = row index of the ith
+   *                feature value
+   */
+  private[impl] def updateParentImpurity(
+      statsAggregator: DTStatsAggregator,
+      indices: Array[Int],
+      from: Int,
+      to: Int,
+      instanceWeights: Array[Double],
+      labels: Array[Double]): Unit = {
+    var pos = from
+    while (pos < to) {
+      val row = indices(pos)
+      statsAggregator.updateParent(labels(row), instanceWeights(row))
+      pos += 1
+    }
+  }
+
+  /**
+   * Update aggregator for an (unordered feature, label) pair.
+   * @param featureSplits Array of splits for the current feature
+   */
+  private[impl] def updateUnorderedFeature(
+      agg: DTStatsAggregator,
+      featureValue: Int,
+      label: Double,
+      featureIndex: Int,
+      featureIndexIdx: Int,
+      featureSplits: Array[Split],
+      instanceWeight: Double): Unit = {
+    // Every unordered split owns one bin that accumulates stats for the points falling on its
+    // left side, so the current point is added to the bin of each split that sends it left.
+    val leftBinsOffset = agg.getFeatureOffset(featureIndexIdx)
+    val splitCount = agg.metadata.numSplits(featureIndex)
+    var i = 0
+    while (i < splitCount) {
+      val goesLeft = featureSplits(i).shouldGoLeft(featureValue, featureSplits)
+      if (goesLeft) agg.featureUpdate(leftBinsOffset, i, label, instanceWeight)
+      i += 1
+    }
+  }
+
+  /**
+   * Update aggregator for an (ordered feature, label) pair. `featureIndex` is not read here.
+   */
+  private[impl] def updateOrderedFeature(
+      agg: DTStatsAggregator,
+      featureValue: Int,
+      label: Double,
+      featureIndex: Int,
+      featureIndexIdx: Int,
+      instanceWeight: Double): Unit = {
+    // For ordered features the (binned) feature value doubles as the bin index.
+    agg.update(featureIndexIdx, featureValue, label, instanceWeight)
+  }
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/FeatureColumn.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/FeatureColumn.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import org.apache.spark.util.collection.BitSet
+
+/**
+ * Stores values for a single training data column (a single continuous or categorical feature).
+ *
+ * Values are currently stored in a dense representation only.
+ * TODO: Support sparse storage (to optimize deeper levels of the tree), and maybe compressed
+ *       storage (to optimize upper levels of the tree).
+ *
+ * TODO: Sort feature values to support more complicated splitting logic (e.g. considering every
+ *       possible continuous split instead of discretizing continuous features).
+ *
+ * TODO: Consider sorting feature values; the only change required would be to
+ * sort values at construction-time. Sorting might improve locality during stats
+ * aggregation (we'd frequently update the same O(statsSize) array for a (feature, bin),
+ * instead of frequently updating for the same feature).
+ *
+ */
+private[impl] class FeatureColumn(
+    val featureIndex: Int,
+    val values: Array[Int])
+  extends Serializable {
+
+  /** Multi-line rendering of the feature index and values, for debugging only. */
+  override def toString: String = {
+    "  FeatureColumn(\n" +
+      s"    featureIndex: $featureIndex,\n" +
+      s"    values: ${values.mkString(", ")},\n" +
+      "  )"
+  }
+
+  /** Returns a new column with the same feature index and a cloned copy of `values`. */
+  def deepCopy(): FeatureColumn = new FeatureColumn(featureIndex, values.clone())
+
+  /** Two columns are equal iff their feature indices match and `values` match element-wise. */
+  override def equals(other: Any): Boolean = other match {
+    case that: FeatureColumn =>
+      featureIndex == that.featureIndex && java.util.Arrays.equals(values, that.values)
+    case _ => false
+  }
+
+  /**
+   * Hashes the array contents rather than the array reference, so that columns which are
+   * `equals` also share a hash code.
+   */
+  override def hashCode: Int = 31 * featureIndex + java.util.Arrays.hashCode(values)
+
+  /**
+   * Reorders this column's values at indices [from, to) according to the split encoded in
+   * instanceBitVector, so that values of rows that split left precede values of rows that
+   * split right. The work is delegated to LocalDecisionTreeUtils.updateArrayForSplit.
+   *
+   * @param numLeftRows Number of rows on the left side of the split
+   * @param tempVals Destination buffer for reordered feature values
+   * @param instanceBitVector instanceBitVector(i) = true if the row for the (from + i)th feature
+   *                          value splits right, false otherwise
+   */
+  private[ml] def updateForSplit(
+      from: Int,
+      to: Int,
+      numLeftRows: Int,
+      tempVals: Array[Int],
+      instanceBitVector: BitSet): Unit = {
+    LocalDecisionTreeUtils.updateArrayForSplit(values, from, to, numLeftRows, tempVals,
+      instanceBitVector)
+  }
+}
+
+private[impl] object FeatureColumn {
+  /**
+   * Wraps `values` (by reference, no copy or sorting) in a FeatureColumn for `featureIndex`.
+   * Values are meant to be stored grouped by decision tree node, i.e. all column values for a
+   * node occur in a contiguous subarray; that grouping is not established by this method.
+   */
+  private[impl] def apply(featureIndex: Int, values: Array[Int]): FeatureColumn =
+    new FeatureColumn(featureIndex, values)
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/ImpurityUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/ImpurityUtils.scala
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import org.apache.spark.mllib.tree.impurity._
+import org.apache.spark.mllib.tree.model.ImpurityStats
+
+/** Helper methods for impurity-related calculations during node split decisions. */
+private[impl] object ImpurityUtils {
+
+  /**
+   * Builds an impurity calculator over the labels (and instance weights) of the rows referenced
+   * by indices[from, to), i.e. over all data at the node being split.
+   * @param indices indices(i) = row index corresponding to ith feature value
+   * @param from first position (inclusive) in indices to aggregate
+   * @param to end position (exclusive) in indices to aggregate
+   * @param instanceWeights instance weights, looked up by row index
+   * @param labels labels, looked up by row index
+   * @return the parent impurity calculator of a fresh aggregator updated with those rows
+   */
+  private[impl] def getParentImpurityCalculator(
+      metadata: DecisionTreeMetadata,
+      indices: Array[Int],
+      from: Int,
+      to: Int,
+      instanceWeights: Array[Double],
+      labels: Array[Double]): ImpurityCalculator = {
+    // A throwaway aggregator is used only for its parent-stats slot: the node's sufficient
+    // stats (e.g. label counts) are accumulated there once, so they can be shared across all
+    // features of the current node.
+    val nodeAgg = new DTStatsAggregator(metadata, featureSubset = None)
+    AggUpdateUtils.updateParentImpurity(nodeAgg, indices, from, to, instanceWeights, labels)
+    nodeAgg.getParentImpurityCalculator()
+  }
+
+  /**
+   * Computes the impurity statistics (gain, parent impurity, child calculators) of one
+   * candidate (feature, split), given the aggregates of its left and right sides.
+   *
+   * The result is flagged invalid when either child has fewer than
+   * metadata.minInstancesPerNode instances, when the gain is below metadata.minInfoGain, or
+   * when the gain is non-positive.
+   *
+   * @param parentCalc Optional: an ImpurityCalculator containing the impurity stats
+   *                   of the node currently being split; if absent it is derived by adding
+   *                   the left and right calculators.
+   * @param leftImpurityCalculator left node aggregates for this (feature, split)
+   * @param rightImpurityCalculator right node aggregate for this (feature, split)
+   * @param metadata learning and dataset metadata for DecisionTree
+   * @return Impurity statistics for this (feature, split)
+   */
+  private[impl] def calculateImpurityStats(
+      parentCalc: Option[ImpurityCalculator],
+      leftImpurityCalculator: ImpurityCalculator,
+      rightImpurityCalculator: ImpurityCalculator,
+      metadata: DecisionTreeMetadata): ImpurityStats = {
+    // Fall back to left + right when the caller did not supply the parent's calculator.
+    val parentStats = parentCalc match {
+      case Some(calc) => calc
+      case None => leftImpurityCalculator.copy.add(rightImpurityCalculator)
+    }
+    val parentImpurity: Double = parentStats.calculate()
+
+    val nLeft = leftImpurityCalculator.count
+    val nRight = rightImpurityCalculator.count
+    val tooFewRows =
+      (nLeft < metadata.minInstancesPerNode) || (nRight < metadata.minInstancesPerNode)
+
+    if (tooFewRows) {
+      // A child violating the minimum instances per node makes the split invalid.
+      ImpurityStats.getInvalidImpurityStats(parentStats)
+    } else {
+      val nTotal = (nLeft + nRight).toDouble
+      val leftImpurity = leftImpurityCalculator.calculate() // Note: This equals 0 if count = 0
+      val rightImpurity = rightImpurityCalculator.calculate()
+      val leftWeight = nLeft / nTotal
+      val rightWeight = nRight / nTotal
+      val gain = parentImpurity - leftWeight * leftImpurity - rightWeight * rightImpurity
+
+      if (gain < metadata.minInfoGain) {
+        // Gain below the configured minimum: invalid split.
+        ImpurityStats.getInvalidImpurityStats(parentStats)
+      } else if (gain <= 0) {
+        // Non-positive gain that still satisfies minInfoGain: keep the real values but mark
+        // the stats as not valid, signalling that the node should not be split.
+        new ImpurityStats(gain, parentImpurity, parentStats, leftImpurityCalculator,
+          rightImpurityCalculator, valid = false)
+      } else {
+        new ImpurityStats(gain, parentImpurity, parentStats,
+          leftImpurityCalculator, rightImpurityCalculator)
+      }
+    }
+  }
+
+  /**
+   * Given an impurity aggregator containing label statistics for a given (node, feature, bin),
+   * returns the corresponding "centroid", used to order bins while computing best splits.
+   * Empty bins (count == 0) get Double.MaxValue.
+   *
+   * @param metadata learning and dataset metadata for DecisionTree
+   * @param binStats label statistics of the bin
+   * @return the value by which this bin is ordered
+   */
+  private[impl] def getCentroid(
+      metadata: DecisionTreeMetadata,
+      binStats: ImpurityCalculator): Double = {
+    if (binStats.count == 0) {
+      // Nothing fell into this bin.
+      Double.MaxValue
+    } else if (metadata.isMulticlass) {
+      // Multiclass classification: categorical bins are ordered by the impurity of their
+      // corresponding labels.
+      binStats.calculate()
+    } else if (metadata.isClassification) {
+      // Binary classification: categorical bins are ordered by the count of class 1.
+      binStats.stats(1)
+    } else {
+      // Regression: categorical bins are ordered by the prediction.
+      binStats.predict
+    }
+  }
+}