Scala 2.13 support. (#9099)

1. Updated the test logic 2. Added smoke tests for Spark examples. 3. Added integration tests for Spark with Scala 2.13
dmlc · May 27, 2023 · a01df10 · a01df10
1 parent 8c174ef
commit a01df10
Show file tree

Hide file tree

Showing 24 changed files with 326 additions and 161 deletions.
diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
@@ -75,3 +75,13 @@ jobs:
       if: matrix.os == 'ubuntu-latest'  # Distributed training doesn't work on Windows
       env:
         RABIT_MOCK: ON
+
+
+    - name: Build and Test XGBoost4J with scala 2.13
+      run: |
+        rm -rfv build/
+        cd jvm-packages
+        mvn -B clean install test -Pdefault,scala-2.13
+      if: matrix.os == 'ubuntu-latest'  # Distributed training doesn't work on Windows
+      env:
+        RABIT_MOCK: ON
diff --git a/jvm-packages/.gitignore b/jvm-packages/.gitignore
@@ -1,2 +1,4 @@
 tracker.py
 build.sh
+xgboost4j-tester/pom.xml
+xgboost4j-tester/iris.csv
diff --git a/jvm-packages/README.md b/jvm-packages/README.md
@@ -36,6 +36,19 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5.
     <version>latest_version_num</version>
 </dependency>
 ```
+or 
+```
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j_2.13</artifactId>
+    <version>latest_version_num</version>
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.13</artifactId>
+    <version>latest_version_num</version>
+</dependency>
+```
 
 <b>sbt</b>
 ```sbt
@@ -47,7 +60,6 @@ libraryDependencies ++= Seq(
 
 For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
 
-To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
 
 ### Access SNAPSHOT version
 
@@ -85,6 +97,19 @@ Then add XGBoost4J as a dependency:
     <version>latest_version_num-SNAPSHOT</version>
 </dependency>
 ```
+or with scala 2.13 
+```
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j_2.13</artifactId>
+    <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.13</artifactId>
+    <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+```
 
 <b>sbt</b>
 ```sbt
@@ -96,7 +121,9 @@ libraryDependencies ++= Seq(
 
 For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html).
 
+### GPU algorithm
 To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
+Note that scala 2.13 is not supported by the [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525) yet, so the GPU algorithm can only be used with scala 2.12.
 
 ## Examples
 

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>
 
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
@@ -34,6 +34,7 @@
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
         <flink.version>1.17.1</flink.version>
+        <junit.version>4.13.2</junit.version>
         <spark.version>3.4.0</spark.version>
         <spark.version.gpu>3.3.2</spark.version.gpu>
         <scala.version>2.12.17</scala.version>
@@ -45,7 +46,9 @@
         <cudf.version>23.04.0</cudf.version>
         <spark.rapids.version>23.04.1</spark.rapids.version>
         <cudf.classifier>cuda11</cudf.classifier>
-    </properties>
+        <scalatest.version>3.2.16</scalatest.version>
+        <scala-collection-compat.version>2.9.0</scala-collection-compat.version>
+      </properties>
     <repositories>
         <repository>
             <id>central_maven</id>
@@ -71,6 +74,14 @@
             </modules>
         </profile>
 
+        <profile>
+            <id>scala-2.13</id>
+            <properties>
+                <scala.binary.version>2.13</scala.binary.version>
+                <scala.version>2.13.10</scala.version>
+            </properties>
+        </profile>
+
         <!-- gpu profile with both cpu and gpu test suites -->
         <profile>
             <id>gpu</id>
@@ -467,6 +478,7 @@
         </plugins>
     </reporting>
     <dependencies>
+
         <dependency>
             <groupId>com.esotericsoftware</groupId>
             <artifactId>kryo</artifactId>
@@ -483,6 +495,11 @@
             <artifactId>scala-library</artifactId>
             <version>${scala.version}</version>
         </dependency>
+        <dependency>
+          <groupId>org.scala-lang.modules</groupId>
+          <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
+          <version>${scala-collection-compat.version}</version>
+        </dependency>
         <dependency>
             <groupId>commons-logging</groupId>
             <artifactId>commons-logging</artifactId>
@@ -491,13 +508,13 @@
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
-            <version>3.2.16</version>
+            <version>${scalatest.version}</version>
             <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>org.scalactic</groupId>
             <artifactId>scalactic_${scala.binary.version}</artifactId>
-            <version>3.2.15</version>
+            <version>${scalatest.version}</version>
             <scope>test</scope>
         </dependency>
     </dependencies>

diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
@@ -5,10 +5,11 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm_2.12</artifactId>
+        <artifactId>xgboost-jvm</artifactId>
         <version>2.0.0-SNAPSHOT</version>
     </parent>
-    <artifactId>xgboost4j-example_2.12</artifactId>
+    <name>xgboost4j-example</name>
+    <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
     <version>2.0.0-SNAPSHOT</version>
     <packaging>jar</packaging>
     <build>

diff --git a/...t4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala b/...t4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
@@ -73,12 +73,13 @@ object DistTrainWithFlink {
           .map(_.f1.f0)
           .returns(testDataTypeHint)
 
-    val paramMap = mapAsJavaMap(Map(
-      ("eta", "0.1".asInstanceOf[AnyRef]),
-      ("max_depth", "2"),
-      ("objective", "binary:logistic"),
-      ("verbosity", "1")
-    ))
+    val paramMap = Map(
+        ("eta", "0.1".asInstanceOf[AnyRef]),
+        ("max_depth", "2"),
+        ("objective", "binary:logistic"),
+        ("verbosity", "1")
+      )
+      .asJava
 
     // number of iterations
     val round = 2

diff --git a/...t4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/...t4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala
@@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.tuning._
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.types._
-
-import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}
+import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
 
 // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
 
@@ -50,6 +49,13 @@ object SparkMLlibPipeline {
       .appName("XGBoost4J-Spark Pipeline Example")
       .getOrCreate()
 
+    run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers)
+      .show(false)
+  }
+  private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String,
+                         pipelineModelPath: String, treeMethod: String,
+                         numWorkers: Int): DataFrame = {
+
     // Load dataset
     val schema = new StructType(Array(
       StructField("sepal length", DoubleType, true),
@@ -90,11 +96,11 @@ object SparkMLlibPipeline {
     val labelConverter = new IndexToString()
       .setInputCol("prediction")
       .setOutputCol("realLabel")
-      .setLabels(labelIndexer.labels)
+      .setLabels(labelIndexer.labelsArray(0))
 
     val pipeline = new Pipeline()
       .setStages(Array(assembler, labelIndexer, booster, labelConverter))
-    val model = pipeline.fit(training)
+    val model: PipelineModel = pipeline.fit(training)
 
     // Batch prediction
     val prediction = model.transform(test)
@@ -136,6 +142,6 @@ object SparkMLlibPipeline {
 
     // Load a saved model and serving
     val model2 = PipelineModel.load(pipelineModelPath)
-    model2.transform(test).show(false)
+    model2.transform(test)
   }
 }
diff --git a/...gboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala b/...gboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala
@@ -17,9 +17,8 @@
 package ml.dmlc.xgboost4j.scala.example.spark
 
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
-
 import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
 
 // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)
@@ -38,6 +37,12 @@ object SparkTraining {
 
     val spark = SparkSession.builder().getOrCreate()
     val inputPath = args(0)
+    val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers)
+    results.show()
+  }
+
+private[spark] def run(spark: SparkSession, inputPath: String,
+                       treeMethod: String, numWorkers: Int): DataFrame =  {
     val schema = new StructType(Array(
       StructField("sepal length", DoubleType, true),
       StructField("sepal width", DoubleType, true),
@@ -81,7 +86,6 @@ object SparkTraining {
       setFeaturesCol("features").
       setLabelCol("classIndex")
     val xgbClassificationModel = xgbClassifier.fit(train)
-    val results = xgbClassificationModel.transform(test)
-    results.show()
+    xgbClassificationModel.transform(test)
   }
 }
diff --git a/...st4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala b/...st4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala
@@ -0,0 +1,123 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.scala.example.spark
+
+import org.apache.spark.sql.SparkSession
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
+import org.slf4j.LoggerFactory
+
+import java.io.File
+import java.nio.file.{Files, StandardOpenOption}
+import scala.jdk.CollectionConverters._
+import scala.util.{Random, Try}
+
+class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll {
+  private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest])
+  private val random = new Random(42)
+  protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4)
+
+  private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath
+  private var spark: SparkSession = _
+
+  override def beforeAll(): Unit = {
+
+    def generateLine(i: Int): String = {
+      val getIrisName = (int: Int) => {
+        int % 3 match {
+          case 0 => "Iris-versicolor"
+          case 1 => "Iris-virginica"
+          case 2 => "Iris-setosa"
+        }
+      }
+      val generateValue = () => Math.abs(random.nextInt(99) * 0.1)
+      val sepalLength = generateValue()
+      val sepalWidth = generateValue()
+      val petalLength = generateValue()
+      val petalWidth = generateValue()
+      val irisName = getIrisName(Math.abs(random.nextInt()) + i)
+      s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName"
+    }
+
+    if (spark == null) {
+       spark = SparkSession
+        .builder()
+        .appName("XGBoost4J-Spark Pipeline Example")
+        .master(s"local[${numWorkers}]")
+        .config("spark.ui.enabled", value = false)
+        .config("spark.driver.memory", "512m")
+        .config("spark.barrier.sync.timeout", 10)
+        .config("spark.task.cpus", 1)
+        .getOrCreate()
+      spark.sparkContext.setLogLevel("ERROR")
+    }
+    val data = (0 until 150)
+      .map(i => generateLine(i))
+      .toList
+      .asJava
+    Files.write(pathToTestDataset,
+      data,
+      StandardOpenOption.CREATE,
+      StandardOpenOption.WRITE,
+      StandardOpenOption.TRUNCATE_EXISTING)
+    logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}")
+
+  }
+
+  override def afterAll(): Unit = {
+    if (spark != null) {
+      spark.stop()
+      cleanExternalCache(spark.sparkContext.appName)
+      spark = null
+    }
+
+    Try(Files.deleteIfExists(pathToTestDataset))
+      .recover {
+        case e =>
+          logger.warn(
+            s"Could not delete temporary file $pathToTestDataset. Please, remove it manually",
+            e
+          )
+          true
+    }
+  }
+
+  private def cleanExternalCache(prefix: String): Unit = {
+    val dir = new File(".")
+    for (file <- dir.listFiles() if file.getName.startsWith(prefix)) {
+      file.delete()
+    }
+  }
+
+  test("Smoke test for SparkMLlibPipeline example") {
+    SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model",
+      "target/pipeline-model", "auto", 2)
+  }
+
+  test("Smoke test for SparkTraining example") {
+    val spark = SparkSession
+      .builder()
+      .appName("XGBoost4J-Spark Pipeline Example")
+      .master(s"local[${numWorkers}]")
+      .config("spark.ui.enabled", value = false)
+      .config("spark.driver.memory", "512m")
+      .config("spark.barrier.sync.timeout", 10)
+      .config("spark.task.cpus", 1)
+      .getOrCreate()
+
+    SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2)
+  }
+}