Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API/ENH: union Categorical #13361

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from .pandas_vb_common import *
try:
from pandas.types.concat import union_categoricals
except ImportError:
pass
import string


Expand All @@ -12,6 +16,17 @@ def time_concat_categorical(self):
concat([self.s, self.s])


class union_categorical(object):
    # ASV benchmark: time combining two large Categoricals whose
    # category sets only partially overlap ('b', 'c', 'd' are shared).
    goal_time = 0.2

    def setup(self):
        left = list('aabbcd') * 1000000
        right = list('bbcdjk') * 1000000
        self.a = pd.Categorical(left)
        self.b = pd.Categorical(right)

    def time_union_categorical(self):
        union_categoricals([self.a, self.b])


class categorical_value_counts(object):
goal_time = 1

Expand Down
25 changes: 25 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,31 @@ In this case the categories are not the same and so an error is raised:

The same applies to ``df.append(df_different)``.

.. _categorical.union:

Unioning
~~~~~~~~

.. versionadded:: 0.18.2

If you want to combine categoricals that do not necessarily have
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

versionadded tag here

the same categories, the `union_categoricals` function will
combine a list-like of categoricals. The new categories
will be the union of the categories being combined.

.. ipython:: python

from pandas.types.concat import union_categoricals
a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])
union_categoricals([a, b])

.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.


Getting Data In/Out
-------------------

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Other enhancements

- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)

- A ``union_categoricals`` function has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)

Expand Down
51 changes: 50 additions & 1 deletion pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from pandas import (DataFrame, concat,
read_csv, isnull, Series, date_range,
Index, Panel, MultiIndex, Timestamp,
DatetimeIndex)
DatetimeIndex, Categorical)
from pandas.types.concat import union_categoricals
from pandas.util import testing as tm
from pandas.util.testing import (assert_frame_equal,
makeCustomDataframe as mkdf,
Expand Down Expand Up @@ -919,6 +920,54 @@ def test_concat_keys_with_none(self):
keys=['b', 'c', 'd', 'e'])
tm.assert_frame_equal(result, expected)

def test_union_categorical(self):
    # GH 13361
    def _check(left, right, expected):
        # union of two plain categoricals must equal the categorical
        # built from the concatenated raw values, codes included
        result = union_categoricals([Categorical(left), Categorical(right)])
        tm.assert_categorical_equal(result, Categorical(expected),
                                    check_category_order=True)

    _check(list('abc'), list('abd'), list('abcabd'))
    _check([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4])
    _check([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4])

    _check(pd.date_range('2014-01-01', '2014-01-05'),
           pd.date_range('2014-01-06', '2014-01-07'),
           pd.date_range('2014-01-01', '2014-01-07'))

    _check(pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
           pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
           pd.date_range('2014-01-01', '2014-01-07', tz='US/Central'))

    _check(pd.period_range('2014-01-01', '2014-01-05'),
           pd.period_range('2014-01-06', '2014-01-07'),
           pd.period_range('2014-01-01', '2014-01-07'))

    # new categories ordered by appearance
    result = union_categoricals([Categorical(['x', 'y', 'z']),
                                 Categorical(['a', 'b', 'c'])]).categories
    tm.assert_index_equal(result, Index(['x', 'y', 'z', 'a', 'b', 'c']))

    # can't be ordered
    with tm.assertRaises(TypeError):
        union_categoricals([Categorical([0, 1.2, 2], ordered=True),
                            Categorical([0, 1.2, 2], ordered=True)])

    # must exactly match types
    with tm.assertRaises(TypeError):
        union_categoricals([Categorical([0, 1.2, 2]),
                            Categorical([2, 3, 4])])

    # empty input is rejected outright
    with tm.assertRaises(ValueError):
        union_categoricals([])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
51 changes: 51 additions & 0 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,57 @@ def convert_categorical(x):
return Categorical(concatted, rawcats)


def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.18.2

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they
       appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # New categories are ordered by first appearance across the inputs.
    # .unique() may return a ndarray for numeric/object indexes, so wrap
    # the result back into an Index before computing indexers.
    cats = first.categories
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    # Remap each input's codes onto the combined categories.
    new_codes = []
    for c in to_union:
        indexer = categories.get_indexer(c.categories)
        new_code = indexer.take(c.codes)
        # take() treats the missing-value sentinel -1 as "last element";
        # restore -1 explicitly so NaN entries survive the union.
        new_code[c.codes == -1] = -1
        new_codes.append(new_code)
    codes = np.concatenate(new_codes)
    return Categorical(codes, categories=categories, ordered=False,
                       fastpath=True)


def _concat_datetime(to_concat, axis=0, typs=None):
"""
provide concatenation of an datetimelike array of arrays each of which is a
Expand Down
36 changes: 31 additions & 5 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,14 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''):


def assert_categorical_equal(left, right, check_dtype=True,
                             obj='Categorical', check_category_order=True):
    """Test that categoricals are equivalent

    Parameters
    ----------
    left, right : Categorical
        Categoricals to compare
    check_dtype : bool, default True
        Check that integer dtype of the codes are the same
    obj : str, default 'Categorical'
        Specify object name being compared, internally used to show
        appropriate assertion message
    check_category_order : bool, default True
        Whether the order of the categories should be compared, which
        implies identical integer codes. If False, only the resulting
        values are compared. The ordered attribute is
        checked regardless.
    """
    assertIsInstance(left, pd.Categorical, '[Categorical] ')
    assertIsInstance(right, pd.Categorical, '[Categorical] ')

    if check_category_order:
        # Exact match: category order and therefore the integer codes
        # must be identical.
        assert_index_equal(left.categories, right.categories,
                           obj='{0}.categories'.format(obj))
        assert_numpy_array_equal(left.codes, right.codes,
                                 check_dtype=check_dtype,
                                 obj='{0}.codes'.format(obj))
    else:
        # Order-insensitive: compare the category sets (sorted), then the
        # realized values each code sequence maps to.
        assert_index_equal(left.categories.sort_values(),
                           right.categories.sort_values(),
                           obj='{0}.categories'.format(obj))
        assert_index_equal(left.categories.take(left.codes),
                           right.categories.take(right.codes),
                           obj='{0}.values'.format(obj))

    assert_attr_equal('ordered', left, right, obj=obj)

Expand Down