ENH union_categoricals supports ignore_order GH13410

xref #13410 (ignore_order portion) Author: Justin Solinsky <[email protected]> Closes #15219 from js3711/GH13410-ENHunion_categoricals and squashes the following commits: e9d00de [Justin Solinsky] GH15219 Documentation fixes based on feedback d278d62 [Justin Solinsky] ENH union_categoricals supports ignore_order GH13410 9b827ef [Justin Solinsky] ENH union_categoricals supports ignore_order GH13410
pandas-dev · Feb 22, 2017 · 14fee4f · 14fee4f
1 parent 486e384
commit 14fee4f
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 4 deletions.
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -693,6 +693,17 @@ The below raises ``TypeError`` because the categories are ordered and not identi
    Out[3]:
    TypeError: to union ordered Categoricals, all categories must be the same
 
+.. versionadded:: 0.20.0
+
+Ordered categoricals with different categories or orderings can be combined by
+using the ``ignore_order=True`` argument.
+
+.. ipython:: python
+
+    a = pd.Categorical(["a", "b", "c"], ordered=True)
+    b = pd.Categorical(["c", "b", "a"], ordered=True)
+    union_categoricals([a, b], ignore_order=True)
+
 ``union_categoricals`` also works with a ``CategoricalIndex``, or ``Series`` containing
 categorical data, but note that the resulting array will always be a plain ``Categorical``
 

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -156,9 +156,11 @@ Other enhancements
 - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`)
 - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`)
 - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
+- ``pd.types.concat.union_categoricals`` gained the ``ignore_order`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
+
 .. _whatsnew_0200.api_breaking:
 
 

diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py
@@ -1662,6 +1662,60 @@ def test_union_categoricals_ordered(self):
         with tm.assertRaisesRegexp(TypeError, msg):
             union_categoricals([c1, c2])
 
+    def test_union_categoricals_ignore_order(self):
+        # GH 15219
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], ordered=False)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        msg = 'Categorical.ordered must be the same'
+        with tm.assertRaisesRegexp(TypeError, msg):
+            union_categoricals([c1, c2], ignore_order=False)
+
+        res = union_categoricals([c1, c1], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals([c1, c1], ignore_order=False)
+        exp = Categorical([1, 2, 3, 1, 2, 3],
+                          categories=[1, 2, 3], ordered=True)
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
+        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, np.nan, 3, 2])
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals([c2, c1], ignore_order=True,
+                                 sort_categories=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([4, 5, 6], ordered=True)
+        result = union_categoricals([c1, c2], ignore_order=True)
+        expected = Categorical([1, 2, 3, 4, 5, 6])
+        tm.assert_categorical_equal(result, expected)
+
+        msg = "to union ordered Categoricals, all categories must be the same"
+        with tm.assertRaisesRegexp(TypeError, msg):
+            union_categoricals([c1, c2], ignore_order=False)
+
+        with tm.assertRaisesRegexp(TypeError, msg):
+            union_categoricals([c1, c2])
+
     def test_union_categoricals_sort(self):
         # GH 13846
         c1 = Categorical(['x', 'y', 'z'])

diff --git a/pandas/types/concat.py b/pandas/types/concat.py
@@ -208,7 +208,7 @@ def _concat_asobject(to_concat):
     return _concat_asobject(to_concat)
 
 
-def union_categoricals(to_union, sort_categories=False):
+def union_categoricals(to_union, sort_categories=False, ignore_order=False):
     """
     Combine list-like of Categorical-like, unioning categories. All
     categories must have the same dtype.
@@ -222,6 +222,11 @@ def union_categoricals(to_union, sort_categories=False):
     sort_categories : boolean, default False
         If true, resulting categories will be lexsorted, otherwise
         they will be ordered as they appear in the data.
+    ignore_order : boolean, default False
+        If true, the ordered attribute of the Categoricals will be ignored.
+        Results in an unordered categorical.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -235,7 +240,7 @@ def union_categoricals(to_union, sort_categories=False):
         - all inputs are ordered and their categories are not identical
         - sort_categories=True and Categoricals are ordered
     ValueError
-        Emmpty list of categoricals passed
+        Empty list of categoricals passed
     """
     from pandas import Index, Categorical, CategoricalIndex, Series
 
@@ -264,15 +269,15 @@ def _maybe_unwrap(x):
         ordered = first.ordered
         new_codes = np.concatenate([c.codes for c in to_union])
 
-        if sort_categories and ordered:
+        if sort_categories and not ignore_order and ordered:
             raise TypeError("Cannot use sort_categories=True with "
                             "ordered Categoricals")
 
         if sort_categories and not categories.is_monotonic_increasing:
             categories = categories.sort_values()
             indexer = categories.get_indexer(first.categories)
             new_codes = take_1d(indexer, new_codes, fill_value=-1)
-    elif all(not c.ordered for c in to_union):
+    elif ignore_order or all(not c.ordered for c in to_union):
         # different categories - union and recode
         cats = first.categories.append([c.categories for c in to_union[1:]])
         categories = Index(cats.unique())
@@ -297,6 +302,9 @@ def _maybe_unwrap(x):
         else:
             raise TypeError('Categorical.ordered must be the same')
 
+    if ignore_order:
+        ordered = False
+
     return Categorical(new_codes, categories=categories, ordered=ordered,
                        fastpath=True)