Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: union_categorical supports identical categories with ordered #13763

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,10 @@ will be the union of the categories being combined.

.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.

Apart from the "easy" case of combining categoricals that have identical
categories and the same ``ordered`` setting (i.e. inputs you could also
``append``), ``union_categoricals`` only works with unordered categoricals
and will raise if any are ordered.

Getting Data In/Out
-------------------
Expand Down
61 changes: 53 additions & 8 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,23 +870,26 @@ def test_union_categorical(self):
# new categories ordered by appearance
s = Categorical(['x', 'y', 'z'])
s2 = Categorical(['a', 'b', 'c'])
result = union_categoricals([s, s2]).categories
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_index_equal(result, expected)
result = union_categoricals([s, s2])
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
categories=['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)

# ordered categoricals with identical categories can be combined
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
with tm.assertRaises(TypeError):
union_categoricals([s, s2])
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)

# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
with tm.assertRaises(TypeError):
msg = 'dtype of categories must be the same'
with tm.assertRaisesRegexp(TypeError, msg):
union_categoricals([s, s2])

with tm.assertRaises(ValueError):
msg = 'No Categoricals to union'
with tm.assertRaisesRegexp(ValueError, msg):
union_categoricals([])

def test_union_categoricals_nan(self):
Expand Down Expand Up @@ -942,6 +945,48 @@ def test_union_categoricals_empty(self):
pd.Categorical([])])
tm.assert_categorical_equal(res, nanc)

def test_union_categorical_same_category(self):
    """Union inputs that already share identical categories (fastpath)."""
    # numeric categories; the NaN value in the second input is kept
    numeric_cats = [1, 2, 3, 4]
    left = Categorical([1, 2, 3, 4], categories=numeric_cats)
    right = Categorical([3, 2, 1, np.nan], categories=numeric_cats)
    expected = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                           categories=numeric_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]), expected)

    # string categories, including a category ('y') that is never used
    string_cats = ['x', 'y', 'z']
    left = Categorical(['z', 'z', 'z'], categories=string_cats)
    right = Categorical(['x', 'x', 'x'], categories=string_cats)
    expected = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                           categories=string_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]), expected)

def test_union_categoricals_ordered(self):
    """Check union of ordered categoricals and the errors it raises."""
    ordered = Categorical([1, 2, 3], ordered=True)
    unordered = Categorical([1, 2, 3], ordered=False)

    # mixing ordered and unordered inputs is rejected
    with tm.assertRaisesRegexp(TypeError,
                               'Categorical.ordered must be the same'):
        union_categoricals([ordered, unordered])

    # identical ordered inputs are combined and stay ordered
    expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
    tm.assert_categorical_equal(union_categoricals([ordered, ordered]),
                                expected)

    # same ordered categories, with a NaN among the values
    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
    expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
    tm.assert_categorical_equal(union_categoricals([with_nan, partial]),
                                expected)

    # ordered inputs whose categories are in a different order are rejected
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
    msg = "to union ordered Categoricals, all categories must be the same"
    with tm.assertRaisesRegexp(TypeError, msg):
        union_categoricals([ascending, descending])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
23 changes: 19 additions & 4 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,9 @@ def union_categoricals(to_union):
Raises
------
TypeError
If any of the categoricals are ordered or all do not
have the same dtype
- not all inputs have the same dtype
- not all inputs have the same ordered property
- all inputs are ordered and their categories are not identical
ValueError
Empty list of categoricals passed
"""
Expand All @@ -242,13 +243,27 @@ def union_categoricals(to_union):
raise ValueError('No Categoricals to union')

first = to_union[0]
if any(c.ordered for c in to_union):
raise TypeError("Can only combine unordered Categoricals")

if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
for c in to_union):
raise TypeError("dtype of categories must be the same")

if all(first.is_dtype_equal(other) for other in to_union[1:]):
return Categorical(np.concatenate([c.codes for c in to_union]),
categories=first.categories, ordered=first.ordered,
fastpath=True)
elif all(not c.ordered for c in to_union):
# not ordered
pass
else:
# to show a proper error message
if all(c.ordered for c in to_union):
msg = ("to union ordered Categoricals, "
"all categories must be the same")
raise TypeError(msg)
else:
raise TypeError('Categorical.ordered must be the same')

cats = first.categories
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
categories = Index(unique_cats)
Expand Down