
Commit

ENH: concat and append now can handle unordered categories
sinhrks committed Sep 6, 2016
1 parent 8023029 commit 589d88d
Showing 11 changed files with 653 additions and 183 deletions.
55 changes: 50 additions & 5 deletions doc/source/categorical.rst
@@ -675,12 +675,57 @@ be lexsorted, use ``sort_categories=True`` argument.
   union_categoricals([a, b], sort_categories=True)

.. note::

   ``union_categoricals`` also works with the "easy" case of combining two
   categoricals of the same categories and order information
   (i.e. categoricals you could also ``append``).

   .. ipython:: python

      a = pd.Categorical(["a", "b"], ordered=True)
      b = pd.Categorical(["a", "b", "a"], ordered=True)
      union_categoricals([a, b])
The following raises ``TypeError`` because the categories are ordered and not identical.

.. code-block:: ipython

   In [1]: a = pd.Categorical(["a", "b"], ordered=True)

   In [2]: b = pd.Categorical(["a", "b", "c"], ordered=True)

   In [3]: union_categoricals([a, b])
   Out[3]:
   TypeError: to union ordered Categoricals, all categories must be the same
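The ``TypeError`` above can be worked around by dropping the ordering information first. A minimal sketch, assuming a current pandas where ``union_categoricals`` is importable from ``pandas.api.types`` (in 0.19 it lived in ``pandas.types.concat``):

```python
import pandas as pd
from pandas.api.types import union_categoricals

# Ordered categoricals with different categories cannot be unioned directly.
a = pd.Categorical(["a", "b"], ordered=True)
b = pd.Categorical(["a", "b", "c"], ordered=True)
try:
    union_categoricals([a, b])
except TypeError as exc:
    print(exc)  # all categories must be the same

# Dropping the ordering information first makes the union succeed.
combined = union_categoricals([a.as_unordered(), b.as_unordered()])
print(list(combined.categories))  # ['a', 'b', 'c']
```

The result is unordered; if the ordering matters, it has to be reapplied afterwards (e.g. via ``as_ordered``).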
.. _categorical.concat:

Concatenation
~~~~~~~~~~~~~

This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects<merging.concat>` for a general description.

By default, ``Series`` or ``DataFrame`` concatenation which contains different
categories results in ``object`` dtype.

.. ipython:: python

   s1 = pd.Series(['a', 'b'], dtype='category')
   s2 = pd.Series(['b', 'c'], dtype='category')
   pd.concat([s1, s2])
Specifying ``union_categoricals=True`` allows concatenation following the
``union_categoricals`` rules.

.. ipython:: python

   pd.concat([s1, s2], union_categoricals=True)
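Note that the ``union_categoricals=True`` keyword is introduced by this commit and may not exist in your pandas build. A hedged sketch of the same outcome using only the stable standalone function (importable from ``pandas.api.types`` in current pandas):

```python
import pandas as pd
from pandas.api.types import union_categoricals

s1 = pd.Series(["a", "b"], dtype="category")
s2 = pd.Series(["b", "c"], dtype="category")

# Default concat of differing categories falls back to object dtype.
print(pd.concat([s1, s2]).dtype)  # object

# The standalone function unions the categories instead.
unioned = union_categoricals([s1.values, s2.values])
result = pd.Series(unioned)
print(result.dtype)                 # category
print(list(result.cat.categories))  # ['a', 'b', 'c']
```

``s1.values`` on a ``category`` Series returns the underlying ``Categorical``, which is what ``union_categoricals`` expects.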
The following table summarizes the results of ``category``-related concatenations.
In addition to the "easy" case of combining two categoricals with identical
categories and order information (i.e. categoricals you could also ``append``),
``union_categoricals=True`` only unions unordered categoricals; if either side
is ordered and the categories differ, the result falls back to ``object`` dtype.
========  ======================================================  ==========================  ===========================
arg1      arg2                                                    default                     ``union_categoricals=True``
========  ======================================================  ==========================  ===========================
category  category (identical categories)                         category                    category
category  category (different categories, both not ordered)       object (dtype is inferred)  category
category  category (different categories, either one is ordered)  object (dtype is inferred)  object (dtype is inferred)
category  not category                                            object (dtype is inferred)  object (dtype is inferred)
========  ======================================================  ==========================  ===========================
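The "default" column of the table can be checked against any released pandas, since it describes behavior that predates the keyword added in this commit:

```python
import pandas as pd

s1 = pd.Series(["a", "b"], dtype="category")
s2 = pd.Series(["a", "b"], dtype="category")  # identical categories
s3 = pd.Series(["b", "c"], dtype="category")  # different categories
s4 = pd.Series(["b", "c"])                    # not category

print(pd.concat([s1, s2]).dtype)  # category
print(pd.concat([s1, s3]).dtype)  # object
print(pd.concat([s1, s4]).dtype)  # object
```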
Getting Data In/Out
-------------------
32 changes: 19 additions & 13 deletions doc/source/merging.rst
@@ -78,34 +78,40 @@ some configurable handling of "what to do with the other axes":
::

    pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
              keys=None, levels=None, names=None, verify_integrity=False,
              union_categoricals=False, copy=True)

- ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a
dict is passed, the sorted keys will be used as the `keys` argument, unless
it is passed, in which case the values will be selected (see below). Any None
objects will be dropped silently unless they are all None in which case a
ValueError will be raised.
- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along.
- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on
other axis(es). Outer for union and inner for intersection.
- ``ignore_index`` : boolean, default False. If True, do not use the index
values on the concatenation axis. The resulting axis will be labeled 0, ...,
n - 1. This is useful if you are concatenating objects where the
concatenation axis does not have meaningful indexing information. Note
the index values on the other axes are still respected in the join.
- ``join_axes`` : list of Index objects. Specific indexes to use for the other
n - 1 axes instead of performing inner/outer set logic.
- ``keys`` : sequence, default None. Construct hierarchical index using the
passed keys as the outermost level. If multiple levels passed, should
contain tuples.
- ``levels`` : list of sequences, default None. Specific levels (unique values)
to use for constructing a MultiIndex. Otherwise they will be inferred from the
keys.
- ``names`` : list, default None. Names for the levels in the resulting
hierarchical index.
- ``verify_integrity`` : boolean, default False. Check whether the new
concatenated axis contains duplicates. This can be very expensive relative
to the actual data concatenation.
- ``union_categoricals`` : boolean, default False.
  If True, use the ``union_categoricals`` rules to concatenate ``category``
  dtypes. If False, ``category`` dtype is kept only if both sets of categories
  are identical; otherwise the result is ``object`` dtype.
  See :ref:`Categoricals Concatenation<categorical.concat>` for details.
- ``copy`` : boolean, default True. If False, do not copy data unnecessarily.
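A short sketch of two of the parameters above, using only keywords available in released pandas:

```python
import pandas as pd

s1 = pd.Series([0, 1], index=["a", "b"])
s2 = pd.Series([2, 3], index=["a", "b"])

# ignore_index discards the original labels and renumbers 0..n-1.
print(list(pd.concat([s1, s2], ignore_index=True).index))  # [0, 1, 2, 3]

# keys builds a hierarchical index with one level per input object.
stacked = pd.concat([s1, s2], keys=["x", "y"])
print(stacked.loc[("y", "b")])  # 3
```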

Without a little bit of context and example many of these arguments don't make
34 changes: 33 additions & 1 deletion doc/source/whatsnew/v0.19.0.txt
@@ -15,6 +15,8 @@ Highlights include:

- :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0190.enhancements.asof_merge>`
- ``.rolling()`` is now time-series aware, see :ref:`here <whatsnew_0190.enhancements.rolling_ts>`
- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here <whatsnew_0190.enhancements.read_csv_categorical>`
- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`here <whatsnew_0190.enhancements.union_categoricals>`
- pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
- ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here <whatsnew_0190.api.period>`
- Sparse data structures now gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here <whatsnew_0190.sparse>`
@@ -277,6 +279,37 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
df['col3']

.. _whatsnew_0190.enhancements.union_categoricals:

Categorical Concatenation
^^^^^^^^^^^^^^^^^^^^^^^^^

- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`, :issue:`13763`, :issue:`13846`)

  .. ipython:: python

     from pandas.types.concat import union_categoricals
     a = pd.Categorical(["b", "c"])
     b = pd.Categorical(["a", "b"])
     union_categoricals([a, b])
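The import path shown is the 0.19-era one; in current pandas the function lives in ``pandas.api.types``. A sketch of how the resulting categories are ordered, which is worth knowing when relying on the union:

```python
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])

# Categories are combined in order of appearance across the inputs...
print(list(union_categoricals([a, b]).categories))  # ['b', 'c', 'a']

# ...unless sort_categories=True requests a lexsorted result.
print(list(union_categoricals([a, b], sort_categories=True).categories))  # ['a', 'b', 'c']
```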

- ``concat`` and ``append`` can now concatenate unordered ``category`` dtypes
  using ``union_categoricals`` internally. (:issue:`13524`)

  By default, differing categories result in ``object`` dtype.

  .. ipython:: python

     s1 = pd.Series(['a', 'b'], dtype='category')
     s2 = pd.Series(['b', 'c'], dtype='category')
     pd.concat([s1, s2])

  Specifying ``union_categoricals=True`` allows concatenation following the
  ``union_categoricals`` rules.

  .. ipython:: python

     pd.concat([s1, s2], union_categoricals=True)

.. _whatsnew_0190.enhancements.semi_month_offsets:

Semi-Month Offsets
@@ -448,7 +481,6 @@ Other enhancements
- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`)
- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
- ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
10 changes: 8 additions & 2 deletions pandas/core/frame.py
@@ -4322,7 +4322,8 @@ def infer(x):
# ----------------------------------------------------------------------
# Merging / joining methods

    def append(self, other, ignore_index=False, verify_integrity=False,
               union_categoricals=False):
"""
Append rows of `other` to the end of this frame, returning a new
object. Columns not in this frame are added as new columns.
@@ -4335,6 +4336,10 @@ def append(self, other, ignore_index=False, verify_integrity=False):
If True, do not use the index labels.
verify_integrity : boolean, default False
If True, raise ValueError on creating index with duplicates.
        union_categoricals : bool, default False
            If True, use the ``union_categoricals`` rules to concatenate
            ``category`` dtypes. If False, ``category`` dtype is kept only if
            both sets of categories are identical; otherwise the result is
            ``object`` dtype.
Returns
-------
@@ -4411,7 +4416,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
else:
to_concat = [self, other]
return concat(to_concat, ignore_index=ignore_index,
                      verify_integrity=verify_integrity,
                      union_categoricals=union_categoricals)

def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
25 changes: 16 additions & 9 deletions pandas/core/internals.py
@@ -1144,7 +1144,7 @@ def get_result(other):
return self._try_coerce_result(result)

# error handler if we have an issue operating with the function
        def handle_error(detail):

if raise_on_error:
raise TypeError('Could not operate %s with block values %s' %
@@ -1165,7 +1165,7 @@ def handle_error():
except ValueError as detail:
raise
except Exception as detail:
            result = handle_error(detail)

# technically a broadcast error in numpy can 'work' by returning a
# boolean False
@@ -4771,7 +4771,8 @@ def _putmask_smart(v, m, n):
return nv


def concatenate_block_managers(mgrs_indexers, axes, concat_axis,
                               copy, union_categoricals=False):
"""
Concatenate block managers into one.
@@ -4781,16 +4782,20 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
axes : list of Index
concat_axis : int
copy : bool
union_categoricals : bool, default False
If True, use union_categoricals rule to concat CategoricalBlock.
If False, CategoricalBlock is kept if both categories are
identical, otherwise results in ObjectBlock.
"""
concat_plan = combine_concat_plans(
[get_mgr_concatenation_plan(mgr, indexers)
for mgr, indexers in mgrs_indexers], concat_axis)

    blocks = [make_block(
        concatenate_join_units(join_units, concat_axis, copy=copy,
                               union_categoricals=union_categoricals),
        placement=placement) for placement, join_units in concat_plan]

return BlockManager(blocks, axes)

@@ -4875,7 +4880,8 @@ def get_empty_dtype_and_na(join_units):
raise AssertionError("invalid dtype determination in get_concat_dtype")


def concatenate_join_units(join_units, concat_axis, copy,
                           union_categoricals=False):
"""
Concatenate values from several join units along selected axis.
"""
@@ -4895,7 +4901,8 @@ def concatenate_join_units(join_units, concat_axis, copy):
if copy and concat_values.base is not None:
concat_values = concat_values.copy()
else:
        concat_values = _concat._concat_compat(
            to_concat, axis=concat_axis, union_categoricals=union_categoricals)

return concat_values

10 changes: 8 additions & 2 deletions pandas/core/series.py
@@ -1525,7 +1525,8 @@ def searchsorted(self, v, side='left', sorter=None):
# -------------------------------------------------------------------
# Combination

    def append(self, to_append, ignore_index=False, verify_integrity=False,
               union_categoricals=False):
"""
Concatenate two or more Series.
@@ -1539,6 +1540,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False):
verify_integrity : boolean, default False
If True, raise Exception on creating index with duplicates
        union_categoricals : bool, default False
            If True, use the ``union_categoricals`` rules to concatenate
            ``category`` dtypes. If False, ``category`` dtype is kept only if
            both sets of categories are identical; otherwise the result is
            ``object`` dtype.
Returns
-------
@@ -1592,7 +1597,8 @@ def append(self, to_append, ignore_index=False, verify_integrity=False):
else:
to_concat = [self, to_append]
return concat(to_concat, ignore_index=ignore_index,
                      verify_integrity=verify_integrity,
                      union_categoricals=union_categoricals)

def _binop(self, other, func, level=None, fill_value=None):
"""
4 changes: 2 additions & 2 deletions pandas/tests/series/test_combine_concat.py
@@ -185,9 +185,9 @@ def test_concat_empty_series_dtypes(self):
'category')
self.assertEqual(pd.concat([Series(dtype='category'),
Series(dtype='float64')]).dtype,
                         'float64')
        self.assertEqual(pd.concat([Series(dtype='category'),
                                    Series(dtype='object')]).dtype, 'object')

# sparse
result = pd.concat([Series(dtype='float64').to_sparse(), Series(
