Skip to content

Commit

Permalink
[SPARK-43875][PS][TESTS] Enabling Categorical tests for Pandas 2.0.0 and above
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

This PR proposes to enable Categorical tests for pandas 2.0.0 and above. See https://pandas.pydata.org/docs/whatsnew/v2.0.0.html for more detail.

### Why are the changes needed?

To match the behavior of pandas 2.0.0 and above.

### Does this PR introduce _any_ user-facing change?

No, this is test-only.

### How was this patch tested?

Enabled and updated the existing unit tests (UTs).

Closes apache#42530 from itholic/pandas_categorical_test.

Authored-by: itholic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
itholic authored and zhengruifeng committed Aug 18, 2023
1 parent 48faaa8 commit c321b3d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 35 deletions.
26 changes: 4 additions & 22 deletions python/pyspark/pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@ def test_categorical_index(self):
):
ps.CategoricalIndex([1, 2, 3]).all()

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43568): Enable CategoricalIndexTests.test_categories_setter for pandas 2.0.0.",
)
def test_categories_setter(self):
pdf = pd.DataFrame(
{
Expand All @@ -92,20 +88,10 @@ def test_categories_setter(self):
pidx = pdf.index
psidx = psdf.index

pidx.categories = ["z", "y", "x"]
psidx.categories = ["z", "y", "x"]
# pandas deprecated all in-place category-setting behaviors, and since pandas 1.4 the
# categories setter no longer refreshes the dtype. We should clean up this test once
# in-place category setting is removed:
# https://github.com/pandas-dev/pandas/issues/46820
if LooseVersion("1.4") >= LooseVersion(pd.__version__) >= LooseVersion("1.1"):
self.assert_eq(pidx, psidx)
self.assert_eq(pdf, psdf)
else:
pidx = pidx.set_categories(pidx.categories)
pdf.index = pidx
self.assert_eq(pidx, psidx)
self.assert_eq(pdf, psdf)
pidx = pidx.rename_categories(["z", "y", "x"])
psidx = psidx.rename_categories(["z", "y", "x"])
self.assert_eq(pidx, psidx)
self.assert_eq(pdf, psdf)

with self.assertRaises(ValueError):
psidx.categories = [1, 2, 3, 4]
Expand All @@ -122,10 +108,6 @@ def test_add_categories(self):
self.assertRaises(ValueError, lambda: psidx.add_categories(3))
self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4]))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43633): Enable CategoricalIndexTests.test_remove_categories for pandas 2.0.0.",
)
def test_remove_categories(self):
pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
psidx = ps.from_pandas(pidx)
Expand Down
17 changes: 4 additions & 13 deletions python/pyspark/pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,6 @@ def test_astype(self):

self.assert_eq(pscser.astype(str), pcser.astype(str))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43564): Enable CategoricalTests.test_factorize for pandas 2.0.0.",
)
def test_factorize(self):
pser = pd.Series(["a", "b", "c", None], dtype=CategoricalDtype(["c", "a", "d", "b"]))
psser = ps.from_pandas(pser)
Expand All @@ -212,8 +208,8 @@ def test_factorize(self):
self.assert_eq(kcodes.tolist(), pcodes.tolist())
self.assert_eq(kuniques, puniques)

pcodes, puniques = pser.factorize(na_sentinel=-2)
kcodes, kuniques = psser.factorize(na_sentinel=-2)
pcodes, puniques = pser.factorize(use_na_sentinel=-2)
kcodes, kuniques = psser.factorize(use_na_sentinel=-2)

self.assert_eq(kcodes.tolist(), pcodes.tolist())
self.assert_eq(kuniques, puniques)
Expand Down Expand Up @@ -345,11 +341,6 @@ def test_groupby_apply(self):
# psdf.groupby("a").apply(len).sort_index(), pdf.groupby("a").apply(len).sort_index(),
# )

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43813): Enable CategoricalTests.test_groupby_apply_without_shortcut "
"for pandas 2.0.0.",
)
def test_groupby_apply_without_shortcut(self):
with ps.option_context("compute.shortcut_limit", 0):
self.test_groupby_apply()
Expand All @@ -360,8 +351,8 @@ def identity(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
return df

self.assert_eq(
psdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
pdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
psdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
pdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
)

def test_groupby_transform(self):
Expand Down

0 comments on commit c321b3d

Please sign in to comment.