From ccaeb76fbd6a99eb54bc855750abc6059a9ecba4 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 3 Jun 2016 21:39:06 -0500 Subject: [PATCH 1/6] API/ENH: union Categorical --- pandas/core/algorithms.py | 34 ++++++ pandas/hashtable.pyx | 200 +++++++++++++++++++++++++++++++ pandas/tests/test_categorical.py | 32 +++++ pandas/util/testing.py | 11 +- 4 files changed, 274 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4b40bce79cbb5..ee8984397f416 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -573,6 +573,34 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] +def union_categoricals(to_concat): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. + + Makes no guarantee about the ordering of the new categories + """ + from pandas.core.categorical import Categorical + + if any(c.ordered for c in to_concat): + raise TypeError("Can only combine unordered Categoricals") + + first = to_concat[0] + if not all(com.is_dtype_equal(c.categories, first.categories) + for c in to_concat): + raise TypeError("dtype of categories must be the same") + + new_size = sum(len(c.codes) for c in to_concat) + recode_size = max(len(c.codes) for c in to_concat) + codes = [com._ensure_int64(c.codes) for c in to_concat] + + algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner) + f, _ = algo_getter(first) + categories = [algo_getter(c)[1] for c in to_concat] + new_codes, new_categories = f(codes, categories, new_size, recode_size) + return Categorical.from_codes(new_codes, new_categories) + + def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] @@ -612,6 +640,12 @@ def _hashtable_algo(f, dtype, return_dtype=None): 'generic': (htable.PyObjectHashTable, htable.ObjectVector) } +_categorical_combiner = { + 'float64': htable.recategorize_float64, + 'int64': htable.recategorize_int64, + 'generic': htable.recategorize_object +} + def _get_data_algo(values, func_map): if com.is_float_dtype(values): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f718c1ab0b8da..f0aae9a778a94 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1114,6 +1114,206 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): kh_destroy_int64(table) return out +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_int64(list codes, list cats, int N, int recode_size): + cdef: + kh_int64_t *table = kh_init_int64() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + int64_t[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + with nogil: + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + kh_resize_int64(table, n_cats) + # first pass dump directly in to table since uniqueness + # is guaranteed + for j in range(n_cats): + k = kh_put_int64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + # reuse codes + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_int64(table, current_categories[j]) + + # if a new category, add 
to the master hash table + if k == table.n_buckets: + k = kh_put_int64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + # add to the recode table, mapping from + # orig catgory -> master_category + recode[j] = table.vals[k] + + for j in range(n_codes): + # continue filing new codes, this pass + # looking up in recode table + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='int64') + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_int64(table) + return np.asarray(new_codes), np.asarray(new_categories) + +# this could be fused with the int version +# but no great way to work with hash table +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_float64(list codes, list cats, int N, int recode_size): + cdef: + kh_float64_t *table = kh_init_float64() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + float64_t[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + with nogil: + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + # first pass dump directly in, since uniqueness is guaranteed + # and don't need to recode + kh_resize_float64(table, n_cats) + for j in range(n_cats): + k = kh_put_float64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_float64(table, current_categories[j]) + + # if a new category, add to the master hash table + if k == table.n_buckets: + k = kh_put_float64(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + + # add to the recode table, mapping from + # orig_catgory -> master_category + recode[j] = table.vals[k] + + for j in range(n_codes): + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='float64') + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_float64(table) + return np.asarray(new_codes), np.asarray(new_categories) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def recategorize_object(list codes, list cats, int N, int recode_size): + cdef: + kh_pymap_t *table = kh_init_pymap() + int64_t[:] new_codes = np.empty(N, dtype='int64') + int64_t[:] recode = np.empty(recode_size, dtype='int64') + int64_t[:] current_codes + object[:] new_categories, current_categories + Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 + int ret = 0 + int64_t current_code = 0 + khiter_t k + + for cat_id in range(len(codes)): + current_codes = codes[cat_id] + current_categories = cats[cat_id] + + n_cats = current_categories.shape[0] + n_codes = current_codes.shape[0] + if cat_id == 0: + kh_resize_pymap(table, n_cats) + # first pass dump directly in to table since uniqueness + # is guaranteed and don't need to recode + for j 
in range(n_cats): + k = kh_put_pymap(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + with nogil: + # reuse codes + for j in range(n_codes): + new_codes[i] = current_codes[j] + i += 1 + else: + for j in range(n_cats): + k = kh_get_pymap(table, current_categories[j]) + + # if a new category, add to the master hash table + if k == table.n_buckets: + k = kh_put_pymap(table, current_categories[j], &ret) + table.vals[k] = current_code + current_code += 1 + + # add to the recode table, mapping from + # orig catgory -> master_category + recode[j] = table.vals[k] + + with nogil: + for j in range(n_codes): + # continue filing new codes, this pass + # looking up in recode table + if current_codes[j] == -1: + new_codes[i] = -1 + else: + new_codes[i] = recode[current_codes[j]] + i += 1 + + # fill in new categories from hash table + i = 0 + new_categories = np.zeros(table.n_occupied, dtype='object') + for k in range(table.n_buckets): + if kh_exist_pymap(table, k): + new_categories[i] = table.keys[k] + i += 1 + kh_destroy_pymap(table) + return np.asarray(new_codes), np.asarray(new_categories) + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cff5bbe14f1eb..2013d6c5f9ef3 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3943,6 +3943,38 @@ def f(): 'category', categories=list('cab'))}) tm.assert_frame_equal(result, expected) + def test_union(self): + from pandas.core.algorithms import union_categoricals + + s = Categorical(list('abc')) + s2 = Categorical(list('abd')) + result = union_categoricals([s, s2]) + expected = Categorical(list('abcabd')) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0,1,2]) + s2 = Categorical([2,3,4]) + result = union_categoricals([s, s2]) + expected = Categorical([0,1,2,2,3,4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0,1.2,2]) + s2 = Categorical([2,3.4,4]) + result = union_categoricals([s, s2]) + expected = Categorical([0,1.2,2,2,3.4,4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + # can't be ordered + s = Categorical([0,1.2,2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0,1.2,2]) + s2 = Categorical([2,3,4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + def test_categorical_index_preserver(self): a = Series(np.arange(6, dtype='int64')) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 03ccfcab24f58..2f0dc17897a2f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical'): + obj='Categorical', ignore_order=False): assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(left.categories, right.categories, - obj='{0}.categories'.format(obj)) + if ignore_order: + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), + obj='{0}.categories'.format(obj)) + else: + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, obj='{0}.codes'.format(obj)) From 7b37c34ba2cd61e503ee285ab6707cc0af59d27b Mon Sep 17 00:00:00 2001 
From: Chris Date: Sat, 4 Jun 2016 16:27:32 -0500 Subject: [PATCH 2/6] cleanup impl, add asv --- asv_bench/benchmarks/categoricals.py | 15 ++ pandas/core/algorithms.py | 34 ----- pandas/hashtable.pyx | 200 --------------------------- pandas/tests/test_categorical.py | 2 +- pandas/types/concat.py | 40 ++++++ 5 files changed, 56 insertions(+), 235 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 244af3a577fe2..bf1e1b3f40ab0 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,8 @@ from .pandas_vb_common import * +try: + from pandas.types.concat import union_categoricals +except ImportError: + pass import string @@ -12,6 +16,17 @@ def time_concat_categorical(self): concat([self.s, self.s]) +class union_categorical(object): + goal_time = 0.2 + + def setup(self): + self.a = pd.Categorical((list('aabbcd') * 1000000)) + self.b = pd.Categorical((list('bbcdjk') * 1000000)) + + def time_union_categorical(self): + union_categoricals([self.a, self.b]) + + class categorical_value_counts(object): goal_time = 1 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ee8984397f416..4b40bce79cbb5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -573,34 +573,6 @@ def select_n(series, n, keep, method): return dropped.iloc[inds] -def union_categoricals(to_concat): - """ - Combine list-like of Categoricals, unioning categories. All - must have the same dtype, and none can be ordered. - - Makes no guarantee about the ordering of the new categories - """ - from pandas.core.categorical import Categorical - - if any(c.ordered for c in to_concat): - raise TypeError("Can only combine unordered Categoricals") - - first = to_concat[0] - if not all(com.is_dtype_equal(c.categories, first.categories) - for c in to_concat): - raise TypeError("dtype of categories must be the same") - - new_size = sum(len(c.codes) for c in to_concat) - recode_size = max(len(c.codes) for c in to_concat) - codes = [com._ensure_int64(c.codes) for c in to_concat] - - algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner) - f, _ = algo_getter(first) - categories = [algo_getter(c)[1] for c in to_concat] - new_codes, new_categories = f(codes, categories, new_size, recode_size) - return Categorical.from_codes(new_codes, new_categories) - - def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] @@ -640,12 +612,6 @@ def _hashtable_algo(f, dtype, return_dtype=None): 'generic': (htable.PyObjectHashTable, htable.ObjectVector) } -_categorical_combiner = { - 'float64': htable.recategorize_float64, - 'int64': htable.recategorize_int64, - 'generic': htable.recategorize_object -} - def _get_data_algo(values, func_map): if com.is_float_dtype(values): diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f0aae9a778a94..f718c1ab0b8da 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1114,206 +1114,6 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): kh_destroy_int64(table) return out -@cython.wraparound(False) -@cython.boundscheck(False) -def recategorize_int64(list codes, list cats, int N, int recode_size): - cdef: - kh_int64_t *table = kh_init_int64() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - int64_t[:] new_categories, current_categories - Py_ssize_t cat_id, j, 
n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - with nogil: - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - kh_resize_int64(table, n_cats) - # first pass dump directly in to table since uniqueness - # is guaranteed - for j in range(n_cats): - k = kh_put_int64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - # reuse codes - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_int64(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_int64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - # add to the recode table, mapping from - # orig catgory -> master_category - recode[j] = table.vals[k] - - for j in range(n_codes): - # continue filing new codes, this pass - # looking up in recode table - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='int64') - with nogil: - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_int64(table) - return np.asarray(new_codes), np.asarray(new_categories) - -# this could be fused with the int version -# but no great way to work with hash table -@cython.wraparound(False) -@cython.boundscheck(False) -def recategorize_float64(list codes, list cats, int N, int recode_size): - cdef: - kh_float64_t *table = kh_init_float64() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - float64_t[:] new_categories, current_categories - Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - with nogil: - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - # first pass dump directly in, since uniqueness is guaranteed - # and don't need to recode - kh_resize_float64(table, n_cats) - for j in range(n_cats): - k = kh_put_float64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_float64(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_float64(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - - # add to the recode table, mapping from - # orig_catgory -> master_category - recode[j] = table.vals[k] - - for j in range(n_codes): - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='float64') - with nogil: - for k in range(table.n_buckets): - if kh_exist_float64(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_float64(table) - return np.asarray(new_codes), np.asarray(new_categories) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def 
recategorize_object(list codes, list cats, int N, int recode_size): - cdef: - kh_pymap_t *table = kh_init_pymap() - int64_t[:] new_codes = np.empty(N, dtype='int64') - int64_t[:] recode = np.empty(recode_size, dtype='int64') - int64_t[:] current_codes - object[:] new_categories, current_categories - Py_ssize_t cat_id, j, n_codes, n_cats, i = 0 - int ret = 0 - int64_t current_code = 0 - khiter_t k - - for cat_id in range(len(codes)): - current_codes = codes[cat_id] - current_categories = cats[cat_id] - - n_cats = current_categories.shape[0] - n_codes = current_codes.shape[0] - if cat_id == 0: - kh_resize_pymap(table, n_cats) - # first pass dump directly in to table since uniqueness - # is guaranteed and don't need to recode - for j in range(n_cats): - k = kh_put_pymap(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - with nogil: - # reuse codes - for j in range(n_codes): - new_codes[i] = current_codes[j] - i += 1 - else: - for j in range(n_cats): - k = kh_get_pymap(table, current_categories[j]) - - # if a new category, add to the master hash table - if k == table.n_buckets: - k = kh_put_pymap(table, current_categories[j], &ret) - table.vals[k] = current_code - current_code += 1 - - # add to the recode table, mapping from - # orig catgory -> master_category - recode[j] = table.vals[k] - - with nogil: - for j in range(n_codes): - # continue filing new codes, this pass - # looking up in recode table - if current_codes[j] == -1: - new_codes[i] = -1 - else: - new_codes[i] = recode[current_codes[j]] - i += 1 - - # fill in new categories from hash table - i = 0 - new_categories = np.zeros(table.n_occupied, dtype='object') - for k in range(table.n_buckets): - if kh_exist_pymap(table, k): - new_categories[i] = table.keys[k] - i += 1 - kh_destroy_pymap(table) - return np.asarray(new_codes), np.asarray(new_categories) - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2013d6c5f9ef3..8096290e82666 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3944,7 +3944,7 @@ def f(): tm.assert_frame_equal(result, expected) def test_union(self): - from pandas.core.algorithms import union_categoricals + from pandas.types.concat import union_categoricals s = Categorical(list('abc')) s2 = Categorical(list('abd')) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5cd7abb6889b7..4d6c89826bcb8 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -201,6 +201,46 @@ def convert_categorical(x): return Categorical(concatted, rawcats) +def union_categoricals(to_union): + """ + Combine list-like of Categoricals, unioning categories. All + must have the same dtype, and none can be ordered. 
+ + Parameters + ---------- + to_union : list like of Categorical + + Returns + ------- + Categorical + A single array, categories will be ordered as they + appear in the list + """ + from pandas import Index, Categorical + + if any(c.ordered for c in to_union): + raise TypeError("Can only combine unordered Categoricals") + + first = to_union[0] + if not all(com.is_dtype_equal(c.categories, first.categories) + for c in to_union): + raise TypeError("dtype of categories must be the same") + + for i, c in enumerate(to_union): + if i == 0: + cats = c.categories.tolist() + else: + cats = cats + c.categories.difference(Index(cats)).tolist() + + cats = Index(cats) + new_codes = [] + for c in to_union: + indexer = cats.get_indexer(c.categories) + new_codes.append(indexer.take(c.codes)) + codes = np.concatenate(new_codes) + return Categorical.from_codes(codes, cats) + + def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a From 77e7963a1146ca5049bdadbba3f5a1b36c5e6c09 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 5 Jun 2016 10:13:57 -0500 Subject: [PATCH 3/6] doc notes --- doc/source/categorical.rst | 23 +++++++++++++++++++++++ doc/source/whatsnew/v0.18.2.txt | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index b518bc947c2da..6f6f82e2229ea 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -648,6 +648,29 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +.. _categorical.union: + +Unioning +~~~~~~~~ + +If you want to combine categoricals that do not necessarily have +the same categories, the `union_categorical` function will +combine a list-like of categoricals. The new categories +will be the union of the categories being combined. + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +.. note:: + + `union_categoricals` only works with unordered categoricals + and will raise if any are orderd. + + Getting Data In/Out ------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 7493150370e9f..c45a1704e228a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -90,7 +90,7 @@ Other enhancements - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - +- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. 
(:issue:`12388`) - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) From 4499cdad576a3ee7097c98e99959cc7d552254d7 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 6 Jun 2016 21:39:09 -0500 Subject: [PATCH 4/6] move tests, adress feedback --- pandas/tests/test_categorical.py | 32 ----------------------------- pandas/tools/tests/test_concat.py | 34 ++++++++++++++++++++++++++++++- pandas/types/concat.py | 15 ++++++-------- 3 files changed, 39 insertions(+), 42 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8096290e82666..cff5bbe14f1eb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3943,38 +3943,6 @@ def f(): 'category', categories=list('cab'))}) tm.assert_frame_equal(result, expected) - def test_union(self): - from pandas.types.concat import union_categoricals - - s = Categorical(list('abc')) - s2 = Categorical(list('abd')) - result = union_categoricals([s, s2]) - expected = Categorical(list('abcabd')) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0,1,2]) - s2 = Categorical([2,3,4]) - result = union_categoricals([s, s2]) - expected = Categorical([0,1,2,2,3,4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0,1.2,2]) - s2 = Categorical([2,3.4,4]) - result = union_categoricals([s, s2]) - expected = Categorical([0,1.2,2,2,3.4,4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - # can't be ordered - s = Categorical([0,1.2,2], ordered=True) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) - - # must exactly match types - s = Categorical([0,1.2,2]) - s2 = Categorical([2,3,4]) - with tm.assertRaises(TypeError): - union_categoricals([s, s2]) - def test_categorical_index_preserver(self): a = Series(np.arange(6, dtype='int64')) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..fa94f085c03c5 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -9,7 +9,8 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex) + DatetimeIndex, Categorical) +from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, @@ -919,6 +920,37 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) + def test_union_categorical(self): + # GH 13361 + s = Categorical(list('abc')) + s2 = Categorical(list('abd')) + result = union_categoricals([s, s2]) + expected = Categorical(list('abcabd')) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0, 1, 2]) + s2 = Categorical([2, 3, 4]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1, 2, 2, 3, 4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3.4, 4]) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 2, 3.4, 4]) + tm.assert_categorical_equal(result, expected, ignore_order=True) + + # can't be ordered + s = Categorical([0, 1.2, 2], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 
4]) + with tm.assertRaises(TypeError): + union_categoricals([s, s2]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 4d6c89826bcb8..a3549ae3a0dff 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -216,7 +216,7 @@ def union_categoricals(to_union): A single array, categories will be ordered as they appear in the list """ - from pandas import Index, Categorical + from pandas import Index, Categorical, unique if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") @@ -226,19 +226,16 @@ def union_categoricals(to_union): for c in to_union): raise TypeError("dtype of categories must be the same") - for i, c in enumerate(to_union): - if i == 0: - cats = c.categories.tolist() - else: - cats = cats + c.categories.difference(Index(cats)).tolist() + unique_cats = unique(np.concatenate([c.categories for c in to_union])) + categories = Index(unique_cats) - cats = Index(cats) new_codes = [] for c in to_union: - indexer = cats.get_indexer(c.categories) + indexer = categories.get_indexer(c.categories) new_codes.append(indexer.take(c.codes)) codes = np.concatenate(new_codes) - return Categorical.from_codes(codes, cats) + return Categorical(codes, categories=categories, ordered=False, + fastpath=True) def _concat_datetime(to_concat, axis=0, typs=None): From 17209f92330c5e949934aec9dea039b35faf6e40 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 7 Jun 2016 18:16:26 -0500 Subject: [PATCH 5/6] Doc updates; use Index.append --- doc/source/categorical.rst | 2 +- pandas/tools/tests/test_concat.py | 48 ++++++++++++++++++++----------- pandas/types/concat.py | 17 ++++++++--- pandas/util/testing.py | 35 +++++++++++++++++----- 4 files changed, 73 insertions(+), 29 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 6f6f82e2229ea..c90453c346537 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -668,7 +668,7 @@ will be the union of the categories being combined. .. note:: `union_categoricals` only works with unordered categoricals - and will raise if any are orderd. + and will raise if any are ordered. 
Getting Data In/Out diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index fa94f085c03c5..84978aa1f0643 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -922,26 +922,40 @@ def test_concat_keys_with_none(self): def test_union_categorical(self): # GH 13361 - s = Categorical(list('abc')) - s2 = Categorical(list('abd')) - result = union_categoricals([s, s2]) - expected = Categorical(list('abcabd')) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0, 1, 2]) - s2 = Categorical([2, 3, 4]) - result = union_categoricals([s, s2]) - expected = Categorical([0, 1, 2, 2, 3, 4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) - - s = Categorical([0, 1.2, 2]) - s2 = Categorical([2, 3.4, 4]) - result = union_categoricals([s, s2]) - expected = Categorical([0, 1.2, 2, 2, 3.4, 4]) - tm.assert_categorical_equal(result, expected, ignore_order=True) + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + result = union_categoricals([Categorical(a), Categorical(b)]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]).categories + expected = Index(['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_index_equal(result, expected) # can't be ordered s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) with tm.assertRaises(TypeError): union_categoricals([s, s2]) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index a3549ae3a0dff..688f29f58d4dc 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -206,27 +206,36 @@ def union_categoricals(to_union): Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. + .. 
versionadded 0.18.2 + Parameters ---------- - to_union : list like of Categorical + to_union : list-like of Categoricals Returns ------- Categorical A single array, categories will be ordered as they appear in the list + + Raises + ------ + TypeError + If any of the categoricals are ordered or all do not + have the same dtype """ - from pandas import Index, Categorical, unique + from pandas import Index, Categorical if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") first = to_union[0] - if not all(com.is_dtype_equal(c.categories, first.categories) + if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") - unique_cats = unique(np.concatenate([c.categories for c in to_union])) + cats = first.categories + unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) new_codes = [] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 2f0dc17897a2f..d13873fcf2c84 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -963,19 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical', ignore_order=False): + obj='Categorical', check_category_order=True): + """Test that categoricals are eqivalent + + Parameters + ---------- + left, right : Categorical + Categoricals to compare + check_dtype : bool, default True + Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. + """ assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - if ignore_order: - assert_index_equal(left.categories.sort_values(), - right.categories.sort_values(), + if check_category_order: + assert_index_equal(left.categories, right.categories, obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, + check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) else: - assert_index_equal(left.categories, right.categories, + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), obj='{0}.categories'.format(obj)) - assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, - obj='{0}.codes'.format(obj)) + assert_index_equal(left.categories.take(left.codes), + right.categories.take(right.codes), + obj='{0}.values'.format(obj)) assert_attr_equal('ordered', left, right, obj=obj) From 568784f51f999bd797a6a8c14bbda406aa19ff06 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 7 Jun 2016 19:57:10 -0500 Subject: [PATCH 6/6] versionadded; empty case --- doc/source/categorical.rst | 2 ++ pandas/tools/tests/test_concat.py | 3 +++ pandas/types/concat.py | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c90453c346537..e971f1f28903f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -653,6 +653,8 @@ The same applies to ``df.append(df_different)``. Unioning ~~~~~~~~ +.. 
versionadded:: 0.18.2 + + If you want to combine categoricals that do not necessarily have the same categories, the `union_categorical` function will combine a list-like of categoricals. The new categories diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 84978aa1f0643..a8c86657a48cc 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -965,6 +965,9 @@ def test_union_categorical(self): with tm.assertRaises(TypeError): union_categoricals([s, s2]) + with tm.assertRaises(ValueError): + union_categoricals([]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 688f29f58d4dc..53db9ddf79a5c 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -223,13 +223,18 @@ def union_categoricals(to_union): TypeError If any of the categoricals are ordered or all do not have the same dtype + ValueError + Empty list of categoricals passed """ from pandas import Index, Categorical + if len(to_union) == 0: + raise ValueError('No Categoricals to union') + + first = to_union[0] if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") - first = to_union[0] if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same")
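
For reference, a minimal usage sketch of the `union_categoricals` function this series adds. It is an illustration only, assuming pandas is built with all six patches applied; the import path and the expected outputs in the comments follow the final state of pandas/types/concat.py shown above.

    # Illustrative session, not part of the patch series.
    import pandas as pd
    from pandas.types.concat import union_categoricals

    a = pd.Categorical(["b", "c"])
    b = pd.Categorical(["a", "b"])

    # Categories are unioned and ordered by first appearance.
    result = union_categoricals([a, b])
    print(result.categories)  # Index(['b', 'c', 'a'], dtype='object')
    print(list(result))       # ['b', 'c', 'a', 'b']

    # Ordered categoricals are rejected.
    try:
        union_categoricals([pd.Categorical([1, 2], ordered=True),
                            pd.Categorical([2, 3], ordered=True)])
    except TypeError as err:
        print(err)  # Can only combine unordered Categoricals

    # As of PATCH 6/6, an empty list raises.
    try:
        union_categoricals([])
    except ValueError as err:
        print(err)  # No Categoricals to union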