From 9027686cac6b91f90e76c70b5aa6511f96c6e2c6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 11 Sep 2023 17:44:51 +0800
Subject: [PATCH] Support pandas 2.1.0. (#9557)

---
 python-package/xgboost/data.py   | 66 +++++++++++++++++++++-----------
 tests/python/test_with_pandas.py |  8 ++--
 2 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 04bdc5739eaa..428e48d1065d 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -317,7 +317,6 @@ def pandas_feature_info(
 ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     """Handle feature info for pandas dataframe."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse
 
     # handle feature names
     if feature_names is None and meta is None:
@@ -332,10 +331,10 @@ def pandas_feature_info(
     if feature_types is None and meta is None:
         feature_types = []
         for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                 feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
             ) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
@@ -345,18 +344,13 @@ def pandas_feature_info(
 
 def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype
 
     is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
     # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)
 
 
 def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -371,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
     )
 
 
+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas category type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import CategoricalDtype
+
+            return isinstance(dtype, CategoricalDtype)
+
+    from pandas.api.types import is_categorical_dtype
+
+    return is_categorical_dtype(dtype)
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Wrapper for testing pandas sparse type."""
+    import pandas as pd
+
+    if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"):
+        Version = pd.util.version.Version
+        if Version(pd.__version__) >= Version("2.1.0"):
+            from pandas import SparseDtype
+
+            return isinstance(dtype, SparseDtype)
+
+    from pandas.api.types import is_sparse
+
+    return is_sparse(dtype)
+
+
 def pandas_cat_null(data: DataFrame) -> DataFrame:
     """Handle categorical dtype and nullable extension types from pandas."""
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype
 
     # handle category codes and nullable.
     cat_columns = []
     nul_columns = []
     # avoid an unnecessary conversion if possible
     for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
             cat_columns.append(col)
         elif is_pa_ext_categorical_dtype(dtype):
             raise ValueError(
@@ -398,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
         transformed = data
 
     def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
             return ser.cat.codes
         assert is_pa_ext_categorical_dtype(ser.dtype)
         # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -454,14 +479,12 @@ def _transform_pandas_df(
     meta: Optional[str] = None,
     meta_type: Optional[NumpyDType] = None,
 ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse
-
     pyarrow_extension = False
     for dtype in data.dtypes:
         if not (
             (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
             or is_pa_ext_dtype(dtype)
         ):
             _invalid_dataframe_dtype(data)
@@ -515,9 +538,8 @@ def _meta_from_pandas_series(
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype("float")
-    from pandas.api.types import is_sparse
 
-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
         data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)
@@ -539,13 +561,11 @@ def _from_pandas_series(
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype
-
     if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
     ):
         _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
         data = data.cat.codes
     return _from_numpy_array(
         data.values.reshape(data.shape[0], 1).astype("float"),
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index f8a21b6ab923..a23a66b63a55 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -211,7 +211,7 @@ def test_pandas_weight(self):
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)
 
         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ def test_cv_as_pandas(self):
 
     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype
 
         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)
 
             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df