Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API/ENH: union Categorical #13361

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from .pandas_vb_common import *
try:
from pandas.types.concat import union_categoricals
except ImportError:
pass
import string


Expand All @@ -12,6 +16,17 @@ def time_concat_categorical(self):
concat([self.s, self.s])


class union_categorical(object):
    # ASV benchmark: time combining two large Categoricals whose
    # category sets only partially overlap ('b', 'c', 'd' are shared).
    goal_time = 0.2

    def setup(self):
        left = list('aabbcd') * 1000000
        right = list('bbcdjk') * 1000000
        self.a = pd.Categorical(left)
        self.b = pd.Categorical(right)

    def time_union_categorical(self):
        union_categoricals([self.a, self.b])


class categorical_value_counts(object):
goal_time = 1

Expand Down
25 changes: 25 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,31 @@ In this case the categories are not the same and so an error is raised:

The same applies to ``df.append(df_different)``.

.. _categorical.union:

Unioning
~~~~~~~~

.. versionadded:: 0.18.2

If you want to combine categoricals that do not necessarily have
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

versionadded tag here

the same categories, the `union_categoricals` function will
combine a list-like of categoricals. The new categories
will be the union of the categories being combined.

.. ipython:: python

from pandas.types.concat import union_categoricals
a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])
union_categoricals([a, b])

.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.


Getting Data In/Out
-------------------

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Other enhancements

- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)

- A ``union_categoricals`` function has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)

Expand Down
51 changes: 50 additions & 1 deletion pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from pandas import (DataFrame, concat,
read_csv, isnull, Series, date_range,
Index, Panel, MultiIndex, Timestamp,
DatetimeIndex)
DatetimeIndex, Categorical)
from pandas.types.concat import union_categoricals
from pandas.util import testing as tm
from pandas.util.testing import (assert_frame_equal,
makeCustomDataframe as mkdf,
Expand Down Expand Up @@ -919,6 +920,54 @@ def test_concat_keys_with_none(self):
keys=['b', 'c', 'd', 'e'])
tm.assert_frame_equal(result, expected)

def test_union_categorical(self):
    # GH 13361
    def _check(left, right, expected):
        # union of two plain categoricals must equal the categorical
        # built from the concatenated raw values, codes included
        result = union_categoricals([Categorical(left), Categorical(right)])
        tm.assert_categorical_equal(result, Categorical(expected),
                                    check_category_order=True)

    _check(list('abc'), list('abd'), list('abcabd'))
    _check([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4])
    _check([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4])

    _check(pd.date_range('2014-01-01', '2014-01-05'),
           pd.date_range('2014-01-06', '2014-01-07'),
           pd.date_range('2014-01-01', '2014-01-07'))

    _check(pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
           pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
           pd.date_range('2014-01-01', '2014-01-07', tz='US/Central'))

    _check(pd.period_range('2014-01-01', '2014-01-05'),
           pd.period_range('2014-01-06', '2014-01-07'),
           pd.period_range('2014-01-01', '2014-01-07'))

    # new categories ordered by appearance
    result = union_categoricals([Categorical(['x', 'y', 'z']),
                                 Categorical(['a', 'b', 'c'])]).categories
    tm.assert_index_equal(result, Index(['x', 'y', 'z', 'a', 'b', 'c']))

    # can't be ordered
    with tm.assertRaises(TypeError):
        union_categoricals([Categorical([0, 1.2, 2], ordered=True),
                            Categorical([0, 1.2, 2], ordered=True)])

    # must exactly match types
    with tm.assertRaises(TypeError):
        union_categoricals([Categorical([0, 1.2, 2]),
                            Categorical([2, 3, 4])])

    # empty input is rejected outright
    with tm.assertRaises(ValueError):
        union_categoricals([])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
51 changes: 51 additions & 0 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,57 @@ def convert_categorical(x):
return Categorical(concatted, rawcats)


def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.18.2

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they
       appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # New categories are ordered by first appearance across the inputs.
    # .unique() may return a ndarray for numeric/object indexes, so wrap
    # the result back into an Index before computing indexers.
    cats = first.categories
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    # Remap each input's codes onto the combined categories.
    new_codes = []
    for c in to_union:
        indexer = categories.get_indexer(c.categories)
        new_code = indexer.take(c.codes)
        # take() treats the missing-value sentinel -1 as "last element";
        # restore -1 explicitly so NaN entries survive the union.
        new_code[c.codes == -1] = -1
        new_codes.append(new_code)
    codes = np.concatenate(new_codes)
    return Categorical(codes, categories=categories, ordered=False,
                       fastpath=True)


def _concat_datetime(to_concat, axis=0, typs=None):
"""
provide concatenation of an datetimelike array of arrays each of which is a
Expand Down
36 changes: 31 additions & 5 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,14 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''):


def assert_categorical_equal(left, right, check_dtype=True,
                             obj='Categorical', check_category_order=True):
    """Test that categoricals are equivalent

    Parameters
    ----------
    left, right : Categorical
        Categoricals to compare
    check_dtype : bool, default True
        Check that integer dtype of the codes are the same
    obj : str, default 'Categorical'
        Specify object name being compared, internally used to show
        appropriate assertion message
    check_category_order : bool, default True
        Whether the order of the categories should be compared, which
        implies identical integer codes. If False, only the resulting
        values are compared. The ordered attribute is
        checked regardless.
    """
    assertIsInstance(left, pd.Categorical, '[Categorical] ')
    assertIsInstance(right, pd.Categorical, '[Categorical] ')

    if check_category_order:
        # Exact match: category order and therefore the integer codes
        # must be identical.
        assert_index_equal(left.categories, right.categories,
                           obj='{0}.categories'.format(obj))
        assert_numpy_array_equal(left.codes, right.codes,
                                 check_dtype=check_dtype,
                                 obj='{0}.codes'.format(obj))
    else:
        # Order-insensitive: compare the category sets (sorted), then the
        # realized values each code sequence maps to.
        assert_index_equal(left.categories.sort_values(),
                           right.categories.sort_values(),
                           obj='{0}.categories'.format(obj))
        assert_index_equal(left.categories.take(left.codes),
                           right.categories.take(right.codes),
                           obj='{0}.values'.format(obj))

    assert_attr_equal('ordered', left, right, obj=obj)

Expand Down