Fix pyspark parameter. (#9460)
- Don't pass the `use_gpu` parameter to the learner.
- Fix GPU approx with PySpark.
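
In user-facing terms: `tree_method="approx"` now works together with `device="cuda"` in the PySpark estimators, and the PySpark-only `use_gpu` flag is no longer forwarded to the XGBoost learner. A minimal sketch of the resulting behavior (illustrative data; assumes a local Spark install and, for the approx case, a CUDA-capable worker):

    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    from xgboost.spark import SparkXGBClassifier

    spark = SparkSession.builder.master("local[2]").getOrCreate()
    train_df = spark.createDataFrame(
        [(Vectors.dense([1.0, 2.0]), 0), (Vectors.dense([3.0, 4.0]), 1)],
        ["features", "label"],
    )

    # Fixed by this commit: GPU training with the approx tree method.
    clf = SparkXGBClassifier(device="cuda", tree_method="approx")

    # Still rejected, but now with a device-independent message:
    # `exact` cannot run distributed at all.
    try:
        SparkXGBClassifier(device="cuda", tree_method="exact").fit(train_df)
    except ValueError as err:
        print(err)  # The `exact` tree method is not supported for distributed systems.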
trivialfis authored Aug 11, 2023
1 parent 428f6cb commit bdc1a3c
Showing 3 changed files with 16 additions and 9 deletions.
7 changes: 3 additions & 4 deletions python-package/xgboost/spark/core.py
@@ -115,6 +115,7 @@
     "qid_col",
     "repartition_random_shuffle",
     "pred_contrib_col",
+    "use_gpu",
 ]
 
 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -349,11 +350,9 @@ def _validate_params(self) -> None:
             )
 
         tree_method = self.getOrDefault(self.getParam("tree_method"))
-        if (
-            self.getOrDefault(self.use_gpu) or use_cuda(self.getOrDefault(self.device))
-        ) and not _can_use_qdm(tree_method):
+        if tree_method == "exact":
             raise ValueError(
-                f"The `{tree_method}` tree method is not supported on GPU."
+                "The `exact` tree method is not supported for distributed systems."
             )
 
         if self.getOrDefault(self.features_cols):
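Two things happen here. Adding `"use_gpu"` to the PySpark-specific parameter list keeps it out of the keyword arguments forwarded to the XGBoost learner, which is the first bullet of the commit message. And the rewritten guard no longer keys off the device or `_can_use_qdm`: `exact` is rejected unconditionally, since it is unsupported for any distributed run, CPU or GPU. A simplified sketch of the filtering pattern, with a hypothetical subset of the real list in `core.py`:

    # Hypothetical subset of the PySpark-only parameters; the real list
    # lives in python-package/xgboost/spark/core.py.
    _pyspark_specific_params = ["use_gpu", "num_workers", "pred_contrib_col"]

    def booster_kwargs(all_params: dict) -> dict:
        # Drop PySpark-only keys so the learner never sees them.
        return {k: v for k, v in all_params.items() if k not in _pyspark_specific_params}

    print(booster_kwargs({"use_gpu": True, "max_depth": 6}))  # {'max_depth': 6}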
10 changes: 8 additions & 2 deletions tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
@@ -151,12 +151,18 @@ def spark_diabetes_dataset_feature_cols(spark_session_with_gpu):
     return train_df, test_df, data.feature_names
 
 
-def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
+@pytest.mark.parametrize("tree_method", ["hist", "approx"])
+def test_sparkxgb_classifier_with_gpu(tree_method: str, spark_iris_dataset) -> None:
     from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 
-    classifier = SparkXGBClassifier(device="cuda", num_workers=num_workers)
+    classifier = SparkXGBClassifier(
+        device="cuda", num_workers=num_workers, tree_method=tree_method
+    )
     train_df, test_df = spark_iris_dataset
     model = classifier.fit(train_df)
+    config = json.loads(model.get_booster().save_config())
+    ctx = config["learner"]["generic_param"]
+    assert ctx["device"] == "cuda:0"
     pred_result_df = model.transform(test_df)
     evaluator = MulticlassClassificationEvaluator(metricName="f1")
     f1 = evaluator.evaluate(pred_result_df)
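The new assertions verify that training actually ran on the GPU by reading back the booster's resolved configuration: `Booster.save_config()` returns the internal parameters as a JSON string, and in XGBoost 2.0 the resolved device sits under `learner.generic_param.device`. A local illustration of the same inspection (CPU here, so it prints `cpu` rather than the test's `cuda:0`):

    import json

    import numpy as np
    import xgboost as xgb

    X = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([0, 1, 0, 1])
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y), num_boost_round=1)

    # Same lookup as the test: the device the learner actually resolved to.
    config = json.loads(booster.save_config())
    print(config["learner"]["generic_param"]["device"])  # "cpu" locally; "cuda:0" in the GPU test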
8 changes: 5 additions & 3 deletions tests/test_distributed/test_with_spark/test_spark_local.py
@@ -456,7 +456,9 @@ def check_sub_dict_match(
     assert sub_dist[k] == whole_dict[k], f"check on {k} failed"
 
 
-def get_params_map(params_kv: dict, estimator: Type) -> dict:
+def get_params_map(
+    params_kv: dict, estimator: xgb.spark.core._SparkXGBEstimator
+) -> dict:
     return {getattr(estimator, k): v for k, v in params_kv.items()}
 
 
@@ -870,10 +872,10 @@ def test_regressor_model_pipeline_save_load(self, reg_data: RegData) -> None:
 
     def test_device_param(self, reg_data: RegData, clf_data: ClfData) -> None:
         clf = SparkXGBClassifier(device="cuda", tree_method="exact")
-        with pytest.raises(ValueError, match="not supported on GPU"):
+        with pytest.raises(ValueError, match="not supported for distributed"):
            clf.fit(clf_data.cls_df_train)
         regressor = SparkXGBRegressor(device="cuda", tree_method="exact")
-        with pytest.raises(ValueError, match="not supported on GPU"):
+        with pytest.raises(ValueError, match="not supported for distributed"):
            regressor.fit(reg_data.reg_df_train)
 
         reg = SparkXGBRegressor(device="cuda", tree_method="gpu_hist")
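For context, `get_params_map` turns a plain `{name: value}` dict into the `{Param: value}` mapping that Spark ML's `fit()` accepts; the tightened annotation documents that the keys must be attributes of an XGBoost Spark estimator. A usage sketch with illustrative values, reusing `train_df` from the first sketch above:

    from xgboost.spark import SparkXGBRegressor

    reg = SparkXGBRegressor()
    params_kv = {"max_depth": 4, "n_estimators": 10}
    # Keys become the estimator's Spark ML Param objects.
    params_map = {getattr(reg, k): v for k, v in params_kv.items()}
    model = reg.fit(train_df, params_map)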
