From ccaeb76fbd6a99eb54bc855750abc6059a9ecba4 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 3 Jun 2016 21:39:06 -0500 Subject: [PATCH 1/6] API/ENH: union Categorical --- pandas/core/algorithms.py | 34 ++++++ pandas/hashtable.pyx | 200 +++++++++++++++++++++++++++++++ pandas/tests/test_categorical.py | 32 +++++ pandas/util/testing.py | 11 +- 4 files changed, 274 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4b40bce79cbb5..ee8984397f416 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -573,6 +573,34 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] +def union_categoricals(to_concat): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. + + Makes no guarantee about the ordering of the new categories + """ + from pandas.core.categorical import Categorical + + if any(c.ordered for c in to_concat): + raise TypeError("Can only combine unordered Categoricals") + + first = to_concat[0] + if not all(com.is_dtype_equal(c.categories, first.categories) + for c in to_concat): + raise TypeError("dtype of categories must be the same") + + new_size = sum(len(c.codes) for c in to_concat) + recode_size = max(len(c.codes) for c in to_concat) + codes = [com._ensure_int64(c.codes) for c in to_concat] + + algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner) + f, _ = algo_getter(first) + categories = [algo_getter(c)[1] for c in to_concat] + new_codes, new_categories = f(codes, categories, new_size, recode_size) + return Categorical.from_codes(new_codes, new_categories) + + def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] @@ -612,6 +640,12 @@ def _hashtable_algo(f, dtype, return_dtype=None): 'generic': (htable.PyObjectHashTable, htable.ObjectVector) } +_categorical_combiner = { + 'float64': htable.recategorize_float64, + 'int64': htable.recategorize_int64, + 'generic': htable.recategorize_object +} + def _get_data_algo(values, func_map): if com.is_float_dtype(values): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f718c1ab0b8da..f0aae9a778a94 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1114,6 +1114,206 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): kh_destroy_int64(table) return out +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_int64(list codes, list cats, int N, int recode_size): + cdef: + kh_int64_t *table = kh_init_int64() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + int64_t[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + with nogil: + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + kh_resize_int64(table, n_cats) + # first pass dump directly in to table since uniqueness + # is guaranteed + for j in range(n_cats): + k = kh_put_int64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + # reuse codes + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_int64(table, current_categories[j]) + + # if a new category, add 
to the master hash table + if k == table.n_buckets: + k = kh_put_int64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + # add to the recode table, mapping from + # orig catgory -> master_category + recode[j] = table.vals[k] + + for j in range(n_codes): + # continue filing new codes, this pass + # looking up in recode table + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='int64') + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_int64(table) + return np.asarray(new_codes), np.asarray(new_categories) + +# this could be fused with the int version +# but no great way to work with hash table +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_float64(list codes, list cats, int N, int recode_size): + cdef: + kh_float64_t *table = kh_init_float64() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + float64_t[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + with nogil: + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + # first pass dump directly in, since uniqueness is guaranteed + # and don't need to recode + kh_resize_float64(table, n_cats) + for j in range(n_cats): + k = kh_put_float64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_float64(table, current_categories[j]) + + # if a new category, add to the master hash table + if k == table.n_buckets: + k = kh_put_float64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + + # add to the recode table, mapping from + # orig_catgory -> master_category + recode[j] = table.vals[k] + + for j in range(n_codes): + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='float64') + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_float64(table) + return np.asarray(new_codes), np.asarray(new_categories) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_object(list codes, list cats, int N, int recode_size): + cdef: + kh_pymap_t *table = kh_init_pymap() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + object[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + kh_resize_pymap(table, n_cats) + # first pass dump directly in to table since uniqueness + # is guaranteed and don't need to recode + for j 
in range(n_cats): + k = kh_put_pymap(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + with nogil: + # reuse codes + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_pymap(table, current_categories[j]) + + # if a new category, add to the master hash table + if k == table.n_buckets: + k = kh_put_pymap(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + + # add to the recode table, mapping from + # orig catgory -> master_category + recode[j] = table.vals[k] + + with nogil: + for j in range(n_codes): + # continue filing new codes, this pass + # looking up in recode table + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='object') + for k in range(table.n_buckets): + if kh_exist_pymap(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_pymap(table) + return np.asarray(new_codes), np.asarray(new_categories) + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cff5bbe14f1eb..2013d6c5f9ef3 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3943,6 +3943,38 @@ def f(): 'category', categories=list('cab'))}) tm.assert_frame_equal(result, expected) + def test_union(self): + from pandas.core.algorithms import union_categoricals + + s = Categorical(list('abc')) + s2 = Categorical(list('abd')) + result = union_categoricals([s, s2]) + expected = Categorical(list('abcabd')) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0,1,2]) + s2 = Categorical([2,3,4]) + result = union_categoricals([s, s2]) + expected = Categorical([0,1,2,2,3,4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0,1.2,2]) + s2 = Categorical([2,3.4,4]) + result = union_categoricals([s, s2]) + expected = Categorical([0,1.2,2,2,3.4,4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + # can't be ordered + s = Categorical([0,1.2,2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0,1.2,2]) + s2 = Categorical([2,3,4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + def test_categorical_index_preserver(self): a = Series(np.arange(6, dtype='int64')) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 03ccfcab24f58..2f0dc17897a2f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical'): + obj='Categorical', ignore_order=False): assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(left.categories, right.categories, - obj='{0}.categories'.format(obj)) + if ignore_order: + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), + obj='{0}.categories'.format(obj)) + else: + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, obj='{0}.codes'.format(obj)) From 7b37c34ba2cd61e503ee285ab6707cc0af59d27b Mon Sep 17 00:00:00 2001 
From: Chris Date: Sat, 4 Jun 2016 16:27:32 -0500 Subject: [PATCH 2/6] cleanup impl, add asv --- asv_bench/benchmarks/categoricals.py | 15 ++ pandas/core/algorithms.py | 34 ----- pandas/hashtable.pyx | 200 --------------------------- pandas/tests/test_categorical.py | 2 +- pandas/types/concat.py | 40 ++++++ 5 files changed, 56 insertions(+), 235 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 244af3a577fe2..bf1e1b3f40ab0 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,8 @@ from .pandas_vb_common import * +try: + from pandas.types.concat import union_categoricals +except ImportError: + pass import string @@ -12,6 +16,17 @@ def time_concat_categorical(self): concat([self.s, self.s]) +class union_categorical(object): + goal_time = 0.2 + + def setup(self): + self.a = pd.Categorical((list('aabbcd') * 1000000)) + self.b = pd.Categorical((list('bbcdjk') * 1000000)) + + def time_union_categorical(self): + union_categoricals([self.a, self.b]) + + class categorical_value_counts(object): goal_time = 1 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ee8984397f416..4b40bce79cbb5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -573,34 +573,6 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] -def union_categoricals(to_concat): - """ - Combine list-like of Categoricals, unioning categories. All - must have the same dtype, and none can be ordered. - - Makes no guarantee about the ordering of the new categories - """ - from pandas.core.categorical import Categorical - - if any(c.ordered for c in to_concat): - raise TypeError("Can only combine unordered Categoricals") - - first = to_concat[0] - if not all(com.is_dtype_equal(c.categories, first.categories) - for c in to_concat): - raise TypeError("dtype of categories must be the same") - - new_size = sum(len(c.codes) for c in to_concat) - recode_size = max(len(c.codes) for c in to_concat) - codes = [com._ensure_int64(c.codes) for c in to_concat] - - algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner) - f, _ = algo_getter(first) - categories = [algo_getter(c)[1] for c in to_concat] - new_codes, new_categories = f(codes, categories, new_size, recode_size) - return Categorical.from_codes(new_codes, new_categories) - - def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] @@ -640,12 +612,6 @@ def _hashtable_algo(f, dtype, return_dtype=None): 'generic': (htable.PyObjectHashTable, htable.ObjectVector) } -_categorical_combiner = { - 'float64': htable.recategorize_float64, - 'int64': htable.recategorize_int64, - 'generic': htable.recategorize_object -} - def _get_data_algo(values, func_map): if com.is_float_dtype(values): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f0aae9a778a94..f718c1ab0b8da 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1114,206 +1114,6 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): kh_destroy_int64(table) return out -@cython.wraparound(False) -@cython.boundscheck(False) -def recategorize_int64(list codes, list cats, int N, int recode_size): - cdef: - kh_int64_t *table = kh_init_int64() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - int64_t[:] new_categories, current_categories - Py_ssize_t cat_id, j, 
n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - with nogil: - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - kh_resize_int64(table, n_cats) - # first pass dump directly in to table since uniqueness - # is guaranteed - for j in range(n_cats): - k = kh_put_int64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - # reuse codes - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_int64(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_int64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - # add to the recode table, mapping from - # orig catgory -> master_category - recode[j] = table.vals[k] - - for j in range(n_codes): - # continue filing new codes, this pass - # looking up in recode table - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='int64') - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_int64(table) - return np.asarray(new_codes), np.asarray(new_categories) - -# this could be fused with the int version -# but no great way to work with hash table -@cython.wraparound(False) -@cython.boundscheck(False) -def recategorize_float64(list codes, list cats, int N, int recode_size): - cdef: - kh_float64_t *table = kh_init_float64() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - float64_t[:] new_categories, current_categories - Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - with nogil: - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - # first pass dump directly in, since uniqueness is guaranteed - # and don't need to recode - kh_resize_float64(table, n_cats) - for j in range(n_cats): - k = kh_put_float64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_float64(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_float64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - - # add to the recode table, mapping from - # orig_catgory -> master_category - recode[j] = table.vals[k] - - for j in range(n_codes): - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='float64') - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_float64(table) - return np.asarray(new_codes), np.asarray(new_categories) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def 
recategorize_object(list codes, list cats, int N, int recode_size): - cdef: - kh_pymap_t *table = kh_init_pymap() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - object[:] new_categories, current_categories - Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - kh_resize_pymap(table, n_cats) - # first pass dump directly in to table since uniqueness - # is guaranteed and don't need to recode - for j in range(n_cats): - k = kh_put_pymap(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - with nogil: - # reuse codes - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_pymap(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_pymap(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - - # add to the recode table, mapping from - # orig catgory -> master_category - recode[j] = table.vals[k] - - with nogil: - for j in range(n_codes): - # continue filing new codes, this pass - # looking up in recode table - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='object') - for k in range(table.n_buckets): - if kh_exist_pymap(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_pymap(table) - return np.asarray(new_codes), np.asarray(new_categories) - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2013d6c5f9ef3..8096290e82666 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3944,7 +3944,7 @@ def f(): tm.assert_frame_equal(result, expected) def test_union(self): - from pandas.core.algorithms import union_categoricals + from pandas.types.concat import union_categoricals s = Categorical(list('abc')) s2 = Categorical(list('abd')) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5cd7abb6889b7..4d6c89826bcb8 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -201,6 +201,46 @@ def convert_categorical(x): return Categorical(concatted, rawcats) +def union_categoricals(to_union): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. 
+ + Parameters + ---------- + to_union : list like of Categorical + + Returns + ------- + Categorical + A single array, categories will be ordered as they + appear in the list + """ + from pandas import Index, Categorical + + if any(c.ordered for c in to_union): + raise TypeError("Can only combine unordered Categoricals") + + first = to_union[0] + if not all(com.is_dtype_equal(c.categories, first.categories) + for c in to_union): + raise TypeError("dtype of categories must be the same") + + for i, c in enumerate(to_union): + if i == 0: + cats = c.categories.tolist() + else: + cats = cats + c.categories.difference(Index(cats)).tolist() + + cats = Index(cats) + new_codes = [] + for c in to_union: + indexer = cats.get_indexer(c.categories) + new_codes.append(indexer.take(c.codes)) + codes = np.concatenate(new_codes) + return Categorical.from_codes(codes, cats) + + def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a From 77e7963a1146ca5049bdadbba3f5a1b36c5e6c09 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 5 Jun 2016 10:13:57 -0500 Subject: [PATCH 3/6] doc notes --- doc/source/categorical.rst | 23 +++++++++++++++++++++++ doc/source/whatsnew/v0.18.2.txt | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index b518bc947c2da..6f6f82e2229ea 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -648,6 +648,29 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +.. _categorical.union: + +Unioning +~~~~~~~~ + +If you want to combine categoricals that do not necessarily have +the same categories, the `union_categorical` function will +combine a list-like of categoricals. The new categories +will be the union of the categories being combined. + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +.. note:: + + `union_categoricals` only works with unordered categoricals + and will raise if any are orderd. + + Getting Data In/Out ------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 7493150370e9f..c45a1704e228a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -90,7 +90,7 @@ Other enhancements - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - +- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. 
(:issue:`12388`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) From 4499cdad576a3ee7097c98e99959cc7d552254d7 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 6 Jun 2016 21:39:09 -0500 Subject: [PATCH 4/6] move tests, adress feedback --- pandas/tests/test_categorical.py | 32 ----------------------------- pandas/tools/tests/test_concat.py | 34 ++++++++++++++++++++++++++++++- pandas/types/concat.py | 15 ++++++-------- 3 files changed, 39 insertions(+), 42 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8096290e82666..cff5bbe14f1eb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3943,38 +3943,6 @@ def f(): 'category', categories=list('cab'))}) tm.assert_frame_equal(result, expected) - def test_union(self): - from pandas.types.concat import union_categoricals - - s = Categorical(list('abc')) - s2 = Categorical(list('abd')) - result = union_categoricals([s, s2]) - expected = Categorical(list('abcabd')) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0,1,2]) - s2 = Categorical([2,3,4]) - result = union_categoricals([s, s2]) - expected = Categorical([0,1,2,2,3,4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0,1.2,2]) - s2 = Categorical([2,3.4,4]) - result = union_categoricals([s, s2]) - expected = Categorical([0,1.2,2,2,3.4,4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - # can't be ordered - s = Categorical([0,1.2,2], ordered=True) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) - - # must exactly match types - s = Categorical([0,1.2,2]) - s2 = Categorical([2,3,4]) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) - def test_categorical_index_preserver(self): a = Series(np.arange(6, dtype='int64')) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..fa94f085c03c5 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -9,7 +9,8 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex) + DatetimeIndex, Categorical) +from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, @@ -919,6 +920,37 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) + def test_union_categorical(self): + # GH 13361 + s = Categorical(list('abc')) + s2 = Categorical(list('abd')) + result = union_categoricals([s, s2]) + expected = Categorical(list('abcabd')) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0, 1, 2]) + s2 = Categorical([2, 3, 4]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1, 2, 2, 3, 4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3.4, 4]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 2, 3.4, 4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + # can't be ordered + s = Categorical([0, 1.2, 2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 
4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 4d6c89826bcb8..a3549ae3a0dff 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -216,7 +216,7 @@ def union_categoricals(to_union): A single array, categories will be ordered as they appear in the list """ - from pandas import Index, Categorical + from pandas import Index, Categorical, unique if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") @@ -226,19 +226,16 @@ def union_categoricals(to_union): for c in to_union): raise TypeError("dtype of categories must be the same") - for i, c in enumerate(to_union): - if i == 0: - cats = c.categories.tolist() - else: - cats = cats + c.categories.difference(Index(cats)).tolist() + unique_cats = unique(np.concatenate([c.categories for c in to_union])) + categories = Index(unique_cats) - cats = Index(cats) new_codes = [] for c in to_union: - indexer = cats.get_indexer(c.categories) + indexer = categories.get_indexer(c.categories) new_codes.append(indexer.take(c.codes)) codes = np.concatenate(new_codes) - return Categorical.from_codes(codes, cats) + return Categorical(codes, categories=categories, ordered=False, + fastpath=True) def _concat_datetime(to_concat, axis=0, typs=None): From 17209f92330c5e949934aec9dea039b35faf6e40 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 7 Jun 2016 18:16:26 -0500 Subject: [PATCH 5/6] Doc updates; use Index.append --- doc/source/categorical.rst | 2 +- pandas/tools/tests/test_concat.py | 48 ++++++++++++++++++++----------- pandas/types/concat.py | 17 ++++++++--- pandas/util/testing.py | 35 +++++++++++++++++----- 4 files changed, 73 insertions(+), 29 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 6f6f82e2229ea..c90453c346537 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -668,7 +668,7 @@ will be the union of the categories being combined. .. note:: `union_categoricals` only works with unordered categoricals - and will raise if any are orderd. + and will raise if any are ordered. 
Getting Data In/Out diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index fa94f085c03c5..84978aa1f0643 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -922,26 +922,40 @@ def test_concat_keys_with_none(self): def test_union_categorical(self): # GH 13361 - s = Categorical(list('abc')) - s2 = Categorical(list('abd')) - result = union_categoricals([s, s2]) - expected = Categorical(list('abcabd')) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0, 1, 2]) - s2 = Categorical([2, 3, 4]) - result = union_categoricals([s, s2]) - expected = Categorical([0, 1, 2, 2, 3, 4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0, 1.2, 2]) - s2 = Categorical([2, 3.4, 4]) - result = union_categoricals([s, s2]) - expected = Categorical([0, 1.2, 2, 2, 3.4, 4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + result = union_categoricals([Categorical(a), Categorical(b)]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]).categories + expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_index_equal(result, expected) # can't be ordered s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) with tm.assertRaises(TypeError): union_categoricals([s, s2]) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a3549ae3a0dff..688f29f58d4dc 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -206,27 +206,36 @@ def union_categoricals(to_union): Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. + .. 
versionadded 0.18.2 + Parameters ---------- - to_union : list like of Categorical + to_union : list-like of Categoricals Returns ------- Categorical A single array, categories will be ordered as they appear in the list + + Raises + ------ + TypeError + If any of the categoricals are ordered or all do not + have the same dtype """ - from pandas import Index, Categorical, unique + from pandas import Index, Categorical if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") first = to_union[0] - if not all(com.is_dtype_equal(c.categories, first.categories) + if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") - unique_cats = unique(np.concatenate([c.categories for c in to_union])) + cats = first.categories + unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) new_codes = [] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2f0dc17897a2f..d13873fcf2c84 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,19 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical', ignore_order=False): + obj='Categorical', check_category_order=True): + """Test that categoricals are eqivalent + + Parameters + ---------- + left, right : Categorical + Categoricals to compare + check_dtype : bool, default True + Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. + """ assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - if ignore_order: - assert_index_equal(left.categories.sort_values(), - right.categories.sort_values(), + if check_category_order: + assert_index_equal(left.categories, right.categories, obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, + check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) else: - assert_index_equal(left.categories, right.categories, + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), obj='{0}.categories'.format(obj)) - assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, - obj='{0}.codes'.format(obj)) + assert_index_equal(left.categories.take(left.codes), + right.categories.take(right.codes), + obj='{0}.values'.format(obj)) assert_attr_equal('ordered', left, right, obj=obj) From 568784f51f999bd797a6a8c14bbda406aa19ff06 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 7 Jun 2016 19:57:10 -0500 Subject: [PATCH 6/6] versionadded; empty case --- doc/source/categorical.rst | 2 ++ pandas/tools/tests/test_concat.py | 3 +++ pandas/types/concat.py | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c90453c346537..e971f1f28903f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -653,6 +653,8 @@ The same applies to ``df.append(df_different)``. Unioning ~~~~~~~~ +.. 
versionadded:: 0.18.2 + + If you want to combine categoricals that do not necessarily have the same categories, the `union_categorical` function will combine a list-like of categoricals. The new categories diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 84978aa1f0643..a8c86657a48cc 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -965,6 +965,9 @@ def test_union_categorical(self): with tm.assertRaises(TypeError): union_categoricals([s, s2]) + with tm.assertRaises(ValueError): + union_categoricals([]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 688f29f58d4dc..53db9ddf79a5c 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -223,13 +223,18 @@ def union_categoricals(to_union): TypeError If any of the categoricals are ordered or all do not have the same dtype + ValueError + Empty list of categoricals passed """ from pandas import Index, Categorical + if len(to_union) == 0: + raise ValueError('No Categoricals to union') + + first = to_union[0] if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") - first = to_union[0] if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same")
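
For reference, a minimal usage sketch of the `union_categoricals` function this series adds. It is an illustration only, assuming pandas is built with all six patches applied; the import path and the expected outputs in the comments follow the final state of pandas/types/concat.py shown above.

    # Illustrative session, not part of the patch series.
    import pandas as pd
    from pandas.types.concat import union_categoricals

    a = pd.Categorical(["b", "c"])
    b = pd.Categorical(["a", "b"])

    # Categories are unioned and ordered by first appearance.
    result = union_categoricals([a, b])
    print(result.categories)  # Index(['b', 'c', 'a'], dtype='object')
    print(list(result))       # ['b', 'c', 'a', 'b']

    # Ordered categoricals are rejected.
    try:
        union_categoricals([pd.Categorical([1, 2], ordered=True),
                            pd.Categorical([2, 3], ordered=True)])
    except TypeError as err:
        print(err)  # Can only combine unordered Categoricals

    # As of PATCH 6/6, an empty list raises.
    try:
        union_categoricals([])
    except ValueError as err:
        print(err)  # No Categoricals to union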