From 9cadc4e84172cd89effa0b02dc188c91d8314110 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 23 Jul 2016 12:10:21 +0900 Subject: [PATCH] ENH: union_categorical supports identical categories with ordered --- doc/source/categorical.rst | 7 ++-- pandas/tools/tests/test_concat.py | 61 +++++++++++++++++++++++++++---- pandas/types/concat.py | 23 ++++++++++-- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index f0e01ddc3fc2d..da9c707e07552 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -669,9 +669,10 @@ will be the union of the categories being combined. .. note:: - `union_categoricals` only works with unordered categoricals - and will raise if any are ordered. - + Ordered categoricals can be combined only in the "easy" case where all + inputs have identical categories and order information (i.e. inputs you + could also ``append``). Otherwise, ``union_categoricals`` only works with + unordered categoricals and will raise if any are ordered. 
Getting Data In/Out ------------------- diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 13c6b72ade27b..84cbe59508b04 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -870,23 +870,26 @@ def test_union_categorical(self): # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) s2 = Categorical(['a', 'b', 'c']) - result = union_categoricals([s, s2]).categories - expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) - tm.assert_index_equal(result, expected) + result = union_categoricals([s, s2]) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) - # can't be ordered s = Categorical([0, 1.2, 2], ordered=True) s2 = Categorical([0, 1.2, 2], ordered=True) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) + tm.assert_categorical_equal(result, expected) # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) - with tm.assertRaises(TypeError): + msg = 'dtype of categories must be the same' + with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([s, s2]) - with tm.assertRaises(ValueError): + msg = 'No Categoricals to union' + with tm.assertRaisesRegexp(ValueError, msg): union_categoricals([]) def test_union_categoricals_nan(self): @@ -942,6 +945,48 @@ def test_union_categoricals_empty(self): pd.Categorical([])]) tm.assert_categorical_equal(res, nanc) + def test_union_categorical_same_category(self): + # check fastpath + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], + categories=[1, 2, 3, 4]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical(['z', 'z', 'z'], 
categories=['x', 'y', 'z']) + c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + res = union_categoricals([c1, c2]) + exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], + categories=['x', 'y', 'z']) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_ordered(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + res = union_categoricals([c1, c1]) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index c8af0ec62db86..e860ba3e201e9 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -231,8 +231,9 @@ def union_categoricals(to_union): Raises ------ TypeError - If any of the categoricals are ordered or all do not - have the same dtype + - all inputs do not have the same dtype + - all inputs do not have the same ordered property + - all inputs are ordered and their categories are not identical ValueError Emmpty list of categoricals passed """ @@ -242,13 +243,27 @@ def union_categoricals(to_union): raise ValueError('No Categoricals to union') first = to_union[0] - if any(c.ordered for c in to_union): - raise TypeError("Can only combine unordered 
Categoricals") if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") + if all(first.is_dtype_equal(other) for other in to_union[1:]): + return Categorical(np.concatenate([c.codes for c in to_union]), + categories=first.categories, ordered=first.ordered, + fastpath=True) + elif all(not c.ordered for c in to_union): + # not ordered + pass + else: + # to show a proper error message + if all(c.ordered for c in to_union): + msg = ("to union ordered Categoricals, " + "all categories must be the same") + raise TypeError(msg) + else: + raise TypeError('Categorical.ordered must be the same') + cats = first.categories unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats)