Skip to content

Commit ba24c52

Browse files
committed
Move feature-size-validating tests into a separate suite
1 parent e645613 commit ba24c52

File tree

2 files changed

+72
-49
lines changed

2 files changed

+72
-49
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
Copyright (c) 2014 by Contributors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package ml.dmlc.xgboost4j.scala.spark
18+
19+
import ml.dmlc.xgboost4j.java.XGBoostError
20+
import org.apache.spark.Partitioner
21+
import org.apache.spark.ml.feature.VectorAssembler
22+
import org.apache.spark.sql.SparkSession
23+
import org.scalatest.FunSuite
24+
25+
import scala.util.Random
26+
27+
/**
 * Tests that XGBoost4J-Spark rejects datasets whose feature count disagrees
 * with what a model (or the rest of a distributed training job) expects.
 */
class FeatureSizeValidatingSuite extends FunSuite with PerTest {

  test("transform throwing exception if feature size of dataset is different with model's") {
    val savedModelPath = getClass.getResource("/model/0.82/model").getPath
    val loadedModel = XGBoostClassificationModel.read.load(savedModelPath)
    val rng = new Random(0)
    // The persisted 0.82 model was trained on 251 features, so transforming a
    // dataset with any other feature count must raise an exception.
    val rawDf = ss
      .createDataFrame(Seq.fill(100)(rng.nextInt(2)).map(i => (i, i)))
      .toDF("feature", "label")
    // Assemble every non-label column into a single vector column; this yields
    // only one feature, far short of the 251 the model expects.
    val vectorizer = new VectorAssembler()
      .setInputCols(rawDf.columns.filter(!_.contains("label")))
      .setOutputCol("features")
    val caught = intercept[Exception] {
      loadedModel.transform(vectorizer.transform(rawDf)).show()
    }
    assert(caught.getMessage.contains(
      "Number of columns does not match number of features in booster"))
  }

  test("train throwing exception if feature size of dataset is different on distributed train") {
    val trainingParams = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
      "objective" -> "binary:logistic",
      "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
    import DataUtils._
    val session = SparkSession.builder().getOrCreate()
    import session.implicits._
    // Split the synthetic data (whose rows have differing feature sizes) across
    // two partitions keyed by label, so each worker sees a mismatched shard.
    val trainingDf = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
      .map(lp => (lp.label, lp)).partitionBy(
        new Partitioner {
          override def numPartitions: Int = 2

          // Labels are 0.0f/1.0f, so the float label maps directly to a partition id.
          override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
        }
      ).map(_._2).zipWithIndex().map {
        case (lp, id) =>
          (id, lp.label, lp.features)
      }.toDF("id", "label", "features")
    val classifier = new XGBoostClassifier(trainingParams)
    // Distributed training must fail fast when workers disagree on feature size.
    intercept[XGBoostError] {
      classifier.fit(trainingDf)
    }
  }
}

jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,12 @@
1616

1717
package ml.dmlc.xgboost4j.scala.spark
1818

19-
import java.nio.file.Files
20-
21-
import ml.dmlc.xgboost4j.java.XGBoostError
22-
2319
import scala.util.Random
2420
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
2521
import ml.dmlc.xgboost4j.scala.DMatrix
26-
import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
27-
import org.apache.hadoop.fs.{FileSystem, Path}
28-
import org.apache.spark.{Partitioner, TaskContext}
22+
import org.apache.spark.{TaskContext}
2923
import org.scalatest.FunSuite
3024
import org.apache.spark.ml.feature.VectorAssembler
31-
import org.apache.spark.sql.SparkSession
3225
import org.apache.spark.sql.functions.lit
3326

3427
class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
@@ -374,45 +367,4 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
374367
df2.collect()
375368
}
376369

377-
test("transform throwing exception when feature size of dataset is different with model's") {
378-
val modelPath = getClass.getResource("/model/0.82/model").getPath
379-
val model = XGBoostClassificationModel.read.load(modelPath)
380-
val r = new Random(0)
381-
// 0.82/model was trained with 251 features. and transform will throw exception
382-
// if feature size of data is not equal to 251
383-
val df = ss.createDataFrame(Seq.fill(100)(r.nextInt(2)).map(i => (i, i))).
384-
toDF("feature", "label")
385-
val assembler = new VectorAssembler()
386-
.setInputCols(df.columns.filter(!_.contains("label")))
387-
.setOutputCol("features")
388-
val thrown = intercept[Exception] {
389-
model.transform(assembler.transform(df)).show()
390-
}
391-
assert(thrown.getMessage.contains(
392-
"Number of columns does not match number of features in booster"))
393-
}
394-
395-
test("train throwing exception when feature size of dataset is different on distributed train") {
396-
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
397-
"objective" -> "binary:logistic",
398-
"num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0)
399-
import DataUtils._
400-
val sparkSession = SparkSession.builder().getOrCreate()
401-
import sparkSession.implicits._
402-
val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2)
403-
.map(lp => (lp.label, lp)).partitionBy(
404-
new Partitioner {
405-
override def numPartitions: Int = 2
406-
407-
override def getPartition(key: Any): Int = key.asInstanceOf[Float].toInt
408-
}
409-
).map(_._2).zipWithIndex().map {
410-
case (lp, id) =>
411-
(id, lp.label, lp.features)
412-
}.toDF("id", "label", "features")
413-
val xgb = new XGBoostClassifier(paramMap)
414-
intercept[XGBoostError] {
415-
xgb.fit(repartitioned)
416-
}
417-
}
418370
}

0 commit comments

Comments
 (0)