From 58d53b7c4cf118cf7e107112b4dd82c12eb3cd1b Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 26 Apr 2022 11:23:49 +0800 Subject: [PATCH 1/2] [jvm-packages] model loading should be compatible with old models Parameters that are no longer used should be dropped when a model is loaded instead of being kept, because keeping them breaks loading of old models. For example, kill_spark_context_on_worker_failure has been deleted, so loading a model saved in 1.6.0 (which still contains the kill_spark_context_on_worker_failure parameter) makes XGBoost throw an exception. --- .../params/DefaultXGBoostParamsReader.scala | 21 +++++++++++---- .../data/.XGBoostClassificationModel.crc | Bin 0 -> 20 bytes .../model/data/XGBoostClassificationModel | Bin 0 -> 1479 bytes .../model/1.6.0/model/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../1.6.0/model/metadata/.part-00000.crc | Bin 0 -> 20 bytes .../model/1.6.0/model/metadata/_SUCCESS | 0 .../model/1.6.0/model/metadata/part-00000 | 1 + .../scala/spark/PersistenceSuite.scala | 25 +++++++++++++++++- 8 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS create mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala index 
bb75bb342cb1..d7d4fca771c5 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,18 +16,22 @@ package ml.dmlc.xgboost4j.scala.spark.params +import ml.dmlc.xgboost4j.scala.spark +import org.apache.commons.logging.LogFactory import org.apache.hadoop.fs.Path import org.json4s.{DefaultFormats, JValue} import org.json4s.JsonAST.JObject import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.apache.spark.SparkContext -import org.apache.spark.ml.param.{Param, Params} +import org.apache.spark.ml.param.Params import org.apache.spark.ml.util.MLReader // This originates from apache-spark DefaultPramsReader copy paste private[spark] object DefaultXGBoostParamsReader { + private val logger = LogFactory.getLog("XGBoostSpark") + private val paramNameCompatibilityMap: Map[String, String] = Map("silent" -> "verbosity") private val paramValueCompatibilityMap: Map[String, Map[Any, Any]] = @@ -126,9 +130,16 @@ private[spark] object DefaultXGBoostParamsReader { metadata.params match { case JObject(pairs) => pairs.foreach { case (paramName, jsonValue) => - val param = instance.getParam(handleBrokenlyChangedName(paramName)) - val value = param.jsonDecode(compact(render(jsonValue))) - instance.set(param, handleBrokenlyChangedValue(paramName, value)) + val finalName = handleBrokenlyChangedName(paramName) + // For the deleted parameters, we'd better to remove it instead of throwing an exception. + // So we need to check if the parameter exists instead of blindly setting it. 
+ if (instance.hasParam(finalName)) { + val param = instance.getParam(finalName) + val value = param.jsonDecode(compact(render(jsonValue))) + instance.set(param, handleBrokenlyChangedValue(paramName, value)) + } else { + logger.warn(s"$finalName is no longer used in ${spark.VERSION}") + } } case _ => throw new IllegalArgumentException( diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc new file mode 100644 index 0000000000000000000000000000000000000000..59fef7327848b1000552a71ad8b66a26b56c1bc5 GIT binary patch literal 20 ccmYc;N@ieSU}9)Vbh`L$OVibR^Dj6607j+=!2kdN literal 0 HcmV?d00001 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel new file mode 100644 index 0000000000000000000000000000000000000000..bcb7f8ddc8230c7dca0c0901bab604ab381a2cf6 GIT binary patch literal 1479 zcmeHH!D<3A5RFT<)`R_roTaQGIMWBDLVF=CD*i68{=XY{-CBOV_#NqfjDQ)=h68X;X4-U18*`b$yY zeKHwHb^k$Y+X>QzIVXJ6O-}qgckFI^r``PS8FCHy6~8atbBE*2ty02jB^{>Z;Bl#uis&d8;-n@HSmo0dLEx~ f3#Ev~VlS<%d3NMZ$qK~YIPn3xeDk4f&QGULLOOPq literal 0 HcmV?d00001 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc new file mode 100644 index 
0000000000000000000000000000000000000000..6fc8d49f8bc2197d0bcca96e3c4f5cf66bf4bf73 GIT binary patch literal 20 ccmYc;N@ieSU}Ctsd)rT?&GUp@n^*G!07pFse*gdg literal 0 HcmV?d00001 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 new file mode 100644 index 000000000000..18f5241bfd7d --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel","timestamp":1650944298068,"sparkVersion":"3.0.1","uid":"xgbc_0c9bc7ccd44f","paramMap":{"numEarlyStoppingRounds":0,"numWorkers":1,"subsample":1.0,"dmlcWorkerConnectRetry":5,"rabitTimeout":-1,"predictionCol":"prediction","eta":0.3,"cacheTrainingSet":false,"allowNonZeroForMissing":true,"minChildWeight":1.0,"sketchEps":0.03,"trackerConf":{"workerConnectionTimeout":0,"trackerImpl":"python","hostIp":"","pythonExec":""},"verbosity":1,"alpha":0.0,"checkpointInterval":-1,"timeoutRequestWorkers":1800000,"killSparkContextOnWorkerFailure":true,"growPolicy":"depthwise","sampleType":"uniform","useExternalMemory":false,"silent":0,"skipDrop":0.0,"lambdaBias":0.0,"lambda":1.0,"customEval":null,"treeLimit":0,"trainTestRatio":1.0,"missing":"NaN","scalePosWeight":1.0,"evalMetric":"logloss","objective":"binary:logistic","seed":0,"gamma":0.0,"rabitRingReduceThreshold":32768,"treeMethod":"hist","maxBin":256,"rateDrop":0.0,"maxDepth":6,"customObj":null,"maxDeltaStep":0.0,"featuresCol":"features","baseScore":0.5,"colsampleBylevel":1.0,"batchSize":32768,"rawPredictionCol":"rawPrediction","nthread":1,"labelCol":"label","normalizeType":"tree","checkpointPath":"","numR
ound":5,"colsampleBytree":1.0,"handleInvalid":"error","probabilityCol":"probability"}} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala index 93b7554017a0..803fb0d4bc96 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala @@ -20,13 +20,15 @@ import java.io.File import java.util.Arrays import ml.dmlc.xgboost4j.scala.DMatrix - import scala.util.Random + import org.apache.spark.ml.feature._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.functions._ import org.scalatest.FunSuite +import org.apache.spark.ml.linalg.Vectors + class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest { test("test persistence of XGBoostClassifier and XGBoostClassificationModel") { @@ -191,5 +193,26 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest { } model.transform(df).show() } + + test("cross-version model loading (1.6.0)") { + // For each release, we should create the cross-version model testing + // Use below code to generate model + + // val paramMap = Map("objective" -> "binary:logistic", "num_round" -> 5, + // "num_workers" -> 1, "tree_method" -> "hist", "missing" -> Float.NaN, + // "allow_non_zero_for_missing" -> true) + // val df = ss.createDataFrame(Seq( + // (1.0, Vectors.dense(1.0)), + // (0.0, Vectors.dense(0.0)))).toDF("label", "features") + // val model = new XGBoostClassifier(paramMap).fit(df) + // model.write.overwrite().save("xgboost4j-spark/src/test/resources/model/1.6.0/model") + + val modelPath = getClass.getResource("/model/1.6.0/model").getPath + val df = ss.createDataFrame(Seq( + (1.0, Vectors.dense(1.0)), + (0.0, Vectors.dense(0.0)))).toDF("label", "features") + val model 
= XGBoostClassificationModel.read.load(modelPath) + model.transform(df).show() + } } From 379b53ae2eeb4c34bd90a621ca849ad0a722b9af Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 26 Apr 2022 15:12:10 +0800 Subject: [PATCH 2/2] rm binary file --- .../data/.XGBoostClassificationModel.crc | Bin 20 -> 0 bytes .../model/data/XGBoostClassificationModel | Bin 1479 -> 0 bytes .../model/1.6.0/model/metadata/._SUCCESS.crc | Bin 8 -> 0 bytes .../1.6.0/model/metadata/.part-00000.crc | Bin 20 -> 0 bytes .../model/1.6.0/model/metadata/_SUCCESS | 0 .../model/1.6.0/model/metadata/part-00000 | 1 - .../scala/spark/PersistenceSuite.scala | 25 +----------------- 7 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS delete mode 100644 jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/.XGBoostClassificationModel.crc deleted file mode 100644 index 59fef7327848b1000552a71ad8b66a26b56c1bc5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 ccmYc;N@ieSU}9)Vbh`L$OVibR^Dj6607j+=!2kdN diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel 
b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/data/XGBoostClassificationModel deleted file mode 100644 index bcb7f8ddc8230c7dca0c0901bab604ab381a2cf6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1479 zcmeHH!D<3A5RFT<)`R_roTaQGIMWBDLVF=CD*i68{=XY{-CBOV_#NqfjDQ)=h68X;X4-U18*`b$yY zeKHwHb^k$Y+X>QzIVXJ6O-}qgckFI^r``PS8FCHy6~8atbBE*2ty02jB^{>Z;Bl#uis&d8;-n@HSmo0dLEx~ f3#Ev~VlS<%d3NMZ$qK~YIPn3xeDk4f&QGULLOOPq diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/.part-00000.crc deleted file mode 100644 index 6fc8d49f8bc2197d0bcca96e3c4f5cf66bf4bf73..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 ccmYc;N@ieSU}Ctsd)rT?&GUp@n^*G!07pFse*gdg diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/_SUCCESS deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 b/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 deleted file mode 100644 index 18f5241bfd7d..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/resources/model/1.6.0/model/metadata/part-00000 +++ /dev/null @@ -1 +0,0 @@ 
-{"class":"ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel","timestamp":1650944298068,"sparkVersion":"3.0.1","uid":"xgbc_0c9bc7ccd44f","paramMap":{"numEarlyStoppingRounds":0,"numWorkers":1,"subsample":1.0,"dmlcWorkerConnectRetry":5,"rabitTimeout":-1,"predictionCol":"prediction","eta":0.3,"cacheTrainingSet":false,"allowNonZeroForMissing":true,"minChildWeight":1.0,"sketchEps":0.03,"trackerConf":{"workerConnectionTimeout":0,"trackerImpl":"python","hostIp":"","pythonExec":""},"verbosity":1,"alpha":0.0,"checkpointInterval":-1,"timeoutRequestWorkers":1800000,"killSparkContextOnWorkerFailure":true,"growPolicy":"depthwise","sampleType":"uniform","useExternalMemory":false,"silent":0,"skipDrop":0.0,"lambdaBias":0.0,"lambda":1.0,"customEval":null,"treeLimit":0,"trainTestRatio":1.0,"missing":"NaN","scalePosWeight":1.0,"evalMetric":"logloss","objective":"binary:logistic","seed":0,"gamma":0.0,"rabitRingReduceThreshold":32768,"treeMethod":"hist","maxBin":256,"rateDrop":0.0,"maxDepth":6,"customObj":null,"maxDeltaStep":0.0,"featuresCol":"features","baseScore":0.5,"colsampleBylevel":1.0,"batchSize":32768,"rawPredictionCol":"rawPrediction","nthread":1,"labelCol":"label","normalizeType":"tree","checkpointPath":"","numRound":5,"colsampleBytree":1.0,"handleInvalid":"error","probabilityCol":"probability"}} diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala index 803fb0d4bc96..93b7554017a0 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala @@ -20,15 +20,13 @@ import java.io.File import java.util.Arrays import ml.dmlc.xgboost4j.scala.DMatrix -import scala.util.Random +import scala.util.Random import org.apache.spark.ml.feature._ import org.apache.spark.ml.{Pipeline, 
PipelineModel} import org.apache.spark.sql.functions._ import org.scalatest.FunSuite -import org.apache.spark.ml.linalg.Vectors - class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest { test("test persistence of XGBoostClassifier and XGBoostClassificationModel") { @@ -193,26 +191,5 @@ class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest { } model.transform(df).show() } - - test("cross-version model loading (1.6.0)") { - // For each release, we should create the cross-version model testing - // Use below code to generate model - - // val paramMap = Map("objective" -> "binary:logistic", "num_round" -> 5, - // "num_workers" -> 1, "tree_method" -> "hist", "missing" -> Float.NaN, - // "allow_non_zero_for_missing" -> true) - // val df = ss.createDataFrame(Seq( - // (1.0, Vectors.dense(1.0)), - // (0.0, Vectors.dense(0.0)))).toDF("label", "features") - // val model = new XGBoostClassifier(paramMap).fit(df) - // model.write.overwrite().save("xgboost4j-spark/src/test/resources/model/1.6.0/model") - - val modelPath = getClass.getResource("/model/1.6.0/model").getPath - val df = ss.createDataFrame(Seq( - (1.0, Vectors.dense(1.0)), - (0.0, Vectors.dense(0.0)))).toDF("label", "features") - val model = XGBoostClassificationModel.read.load(modelPath) - model.transform(df).show() - } }