Updated the xgboost4j-tester.
1. Updated the test logic.
2. Added smoke tests for the Spark examples (the shared run() pattern they rely on is sketched below).
3. Added integration tests for Spark with Scala 2.13.
dotbg committed May 26, 2023
1 parent e8d9867 commit 376b576
Showing 10 changed files with 214 additions and 120 deletions.
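
Items 1 and 2 of the commit message hinge on one refactoring that repeats across the example sources below: each example's main now only parses arguments and shows the result, while the actual work moves into a package-private run(...) that returns a DataFrame, so the new smoke tests can call it directly. A minimal sketch of that pattern, with a hypothetical Example object and a placeholder run() body rather than the real pipeline or training code:

package ml.dmlc.xgboost4j.scala.example.spark

import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical Example object illustrating the refactoring; the body of run()
// is a placeholder, not the actual code from the files changed below.
object Example {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().getOrCreate()
    // main stays thin: delegate the work to run() and only present the result here
    run(spark, inputPath = args(0)).show(false)
  }

  // package-private so a test suite in the same package can invoke it directly
  private[spark] def run(spark: SparkSession, inputPath: String): DataFrame = {
    spark.read.option("header", "false").csv(inputPath)
  }
}

Keeping run private[spark] exposes it to the test suite in the same package without widening the public API of the examples.
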
2 changes: 2 additions & 0 deletions jvm-packages/.gitignore
@@ -1,2 +1,4 @@
tracker.py
build.sh
xgboost4j-tester/pom.xml
xgboost4j-tester/iris.csv
SparkMLlibPipeline.scala
@@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.ml.tuning._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types._

import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}
import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}

// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)

@@ -50,6 +49,13 @@ object SparkMLlibPipeline {
.appName("XGBoost4J-Spark Pipeline Example")
.getOrCreate()

run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers)
.show(false)
}
private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String,
pipelineModelPath: String, treeMethod: String,
numWorkers: Int): DataFrame = {

// Load dataset
val schema = new StructType(Array(
StructField("sepal length", DoubleType, true),
@@ -90,11 +96,11 @@ object SparkMLlibPipeline {
val labelConverter = new IndexToString()
.setInputCol("prediction")
.setOutputCol("realLabel")
.setLabels(labelIndexer.labels)
.setLabels(labelIndexer.labelsArray(0))

val pipeline = new Pipeline()
.setStages(Array(assembler, labelIndexer, booster, labelConverter))
val model = pipeline.fit(training)
val model: PipelineModel = pipeline.fit(training)

// Batch prediction
val prediction = model.transform(test)
@@ -136,6 +142,6 @@ object SparkMLlibPipeline {

// Load a saved model and serving
val model2 = PipelineModel.load(pipelineModelPath)
model2.transform(test).show(false)
model2.transform(test)
}
}
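
One detail in the SparkMLlibPipeline diff above: .setLabels(labelIndexer.labels) becomes .setLabels(labelIndexer.labelsArray(0)), following the Spark 3 StringIndexerModel API, which exposes labelsArray (one Array[String] per input column) in place of the older single-column labels accessor. A minimal, self-contained sketch of that usage, with toy data and hypothetical column names rather than the Iris schema:

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}
import org.apache.spark.sql.SparkSession

object LabelsArrayDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("labelsArray demo").getOrCreate()
    import spark.implicits._

    // Toy single-column dataset; column names here are illustrative only.
    val df = Seq("setosa", "virginica", "versicolor", "setosa").toDF("class")

    // Fit the indexer; in Spark 3.x the fitted model exposes labelsArray,
    // one Array[String] per input column (index 0 here, single input column).
    val indexerModel = new StringIndexer()
      .setInputCol("class")
      .setOutputCol("classIndex")
      .fit(df)

    val converter = new IndexToString()
      .setInputCol("classIndex")
      .setOutputCol("realLabel")
      .setLabels(indexerModel.labelsArray(0)) // replaces the single-column labels accessor

    converter.transform(indexerModel.transform(df)).show(false)
    spark.stop()
  }
}

Index 0 is used because only one input column is indexed, mirroring the pipeline example.
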
SparkTraining.scala
@@ -17,9 +17,8 @@
package ml.dmlc.xgboost4j.scala.example.spark

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
@@ -38,6 +37,12 @@ object SparkTraining {

val spark = SparkSession.builder().getOrCreate()
val inputPath = args(0)
val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers)
results.show()
}

private[spark] def run(spark: SparkSession, inputPath: String,
treeMethod: String, numWorkers: Int): DataFrame = {
val schema = new StructType(Array(
StructField("sepal length", DoubleType, true),
StructField("sepal width", DoubleType, true),
@@ -81,7 +86,6 @@ object SparkTraining {
setFeaturesCol("features").
setLabelCol("classIndex")
val xgbClassificationModel = xgbClassifier.fit(train)
val results = xgbClassificationModel.transform(test)
results.show()
xgbClassificationModel.transform(test)
}
}
SparkExamplesTest.scala (new file)
@@ -0,0 +1,123 @@
/*
Copyright (c) 2014-2023 by Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package ml.dmlc.xgboost4j.scala.example.spark

import org.apache.spark.sql.SparkSession
import org.scalatest.BeforeAndAfterAll
import org.scalatest.funsuite.AnyFunSuite
import org.slf4j.LoggerFactory

import java.io.File
import java.nio.file.{Files, StandardOpenOption}
import scala.jdk.CollectionConverters._
import scala.util.{Random, Try}

class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll {
private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest])
private val random = new Random(42)
protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4)

private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath
private var spark: SparkSession = _

override def beforeAll(): Unit = {

def generateLine(i: Int): String = {
val getIrisName = (int: Int) => {
int % 3 match {
case 0 => "Iris-versicolor"
case 1 => "Iris-virginica"
case 2 => "Iris-setosa"
}
}
val generateValue = () => Math.abs(random.nextInt(99) * 0.1)
val sepalLength = generateValue()
val sepalWidth = generateValue()
val petalLength = generateValue()
val petalWidth = generateValue()
val irisName = getIrisName(Math.abs(random.nextInt()) + i)
s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName"
}

if (spark == null) {
spark = SparkSession
.builder()
.appName("XGBoost4J-Spark Pipeline Example")
.master(s"local[${numWorkers}]")
.config("spark.ui.enabled", value = false)
.config("spark.driver.memory", "512m")
.config("spark.barrier.sync.timeout", 10)
.config("spark.task.cpus", 1)
.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
}
val data = (0 until 150)
.map(i => generateLine(i))
.toList
.asJava
Files.write(pathToTestDataset,
data,
StandardOpenOption.CREATE,
StandardOpenOption.WRITE,
StandardOpenOption.TRUNCATE_EXISTING)
logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}")

}

override def afterAll(): Unit = {
if (spark != null) {
spark.stop()
cleanExternalCache(spark.sparkContext.appName)
spark = null
}

Try(Files.deleteIfExists(pathToTestDataset))
.recover {
case e =>
logger.warn(
s"Could not delete temporary file $pathToTestDataset. Please, remove it manually",
e
)
true
}
}

private def cleanExternalCache(prefix: String): Unit = {
val dir = new File(".")
for (file <- dir.listFiles() if file.getName.startsWith(prefix)) {
file.delete()
}
}

test("Smoke test for SparkMLlibPipeline example") {
SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model",
"target/pipeline-model", "auto", 2)
}

test("Smoke test for SparkTraining example") {
val spark = SparkSession
.builder()
.appName("XGBoost4J-Spark Pipeline Example")
.master(s"local[${numWorkers}]")
.config("spark.ui.enabled", value = false)
.config("spark.driver.memory", "512m")
.config("spark.barrier.sync.timeout", 10)
.config("spark.task.cpus", 1)
.getOrCreate()

SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2)
}
}
69 changes: 9 additions & 60 deletions jvm-packages/xgboost4j-tester/generate_pom.py
@@ -8,7 +8,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-tester_${scala.binary.version}</artifactId>
<artifactId>xgboost4j-tester_{scala_binary_version}</artifactId>
<version>1.0-SNAPSHOT</version>
<name>xgboost4j-tester</name>
@@ -17,16 +17,19 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>{maven_compiler_source}</maven.compiler.source>
<maven.compiler.target>{maven_compiler_target}</maven.compiler.target>
<junit.version>4.13.2</junit.version>
<spark.version>{spark_version}</spark.version>
<scala.version>{scala_version}</scala.version>
<scalatest.version>3.2.15</scalatest.version>
<scala.binary.version>{scala_binary_version}</scala.binary.version>
<kryo.version>5.5.0</kryo.version>
</properties>
<dependencies>
<dependency>
<dependency>
<groupId>com.esotericsoftware</groupId>
<artifactId>kryo</artifactId>
<version>4.0.2</version>
<version>${{kryo.version}}</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
@@ -48,29 +51,12 @@
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${{scala.binary.version}}</artifactId>
<version>2.6.20</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${{scala.binary.version}}</artifactId>
<version>3.0.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalactic</groupId>
<artifactId>scalactic_${{scala.binary.version}}</artifactId>
<version>3.2.15</version>
<version>${{scalatest.version}}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${{scala.binary.version}}</artifactId>
@@ -92,7 +78,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<version>${{junit.version}}</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -122,36 +108,9 @@
<build>
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
@@ -171,22 +130,12 @@
</execution>
</executions>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
<configuration>
<dependenciesToScan>
<dependency>ml.dmlc:xgboost4j_2.12</dependency>
<dependency>ml.dmlc:xgboost4j_${{scala.binary.version}}</dependency>
</dependenciesToScan>
</configuration>
</plugin>

This file was deleted.

9 changes: 8 additions & 1 deletion tests/buildkite/build-jvm-packages.sh
@@ -4,11 +4,18 @@ set -euo pipefail

source tests/buildkite/conftest.sh

echo "--- Build XGBoost JVM packages"
echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION}


echo "--- Build XGBoost JVM packages scala 2.13"

tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} "" "" "true"

echo "--- Stash XGBoost4J JARs"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
2 changes: 1 addition & 1 deletion tests/buildkite/conftest.sh
@@ -25,7 +25,7 @@ set -x
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1
SPARK_VERSION=3.4.0
JDK_VERSION=8

if [[ -z ${BUILDKITE:-} ]]