Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]API: CategoricalType for specifying categoricals #14698

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from pandas.core.algorithms import factorize, match, unique, value_counts
from pandas.types.missing import isnull, notnull
from pandas.core.categorical import Categorical
from pandas.core.categorical import Categorical, CategoricalType
from pandas.core.groupby import Grouper
from pandas.formats.format import set_eng_float_format
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
Expand Down
45 changes: 45 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2066,3 +2066,48 @@ def _factorize_from_iterables(iterables):
# For consistency, it should return a list of 2 lists.
return [[], []]
return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))


class CategoricalType(CategoricalDtype):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be in with the rest of the dtypes

To be honest, I wouldn't create this; just add it optionally into the existing CategoricalDtype.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that OK with the caching and stuff that is done on CategoricalDtype? This is on CategoricalDtype:

    def __new__(cls):

        try:
            return cls._cache[cls.name]
        except KeyError:
            c = object.__new__(cls)
            cls._cache[cls.name] = c
            return c

I haven't messed with extension types much. We could make the keys of that internal dict reflect the categories and ordered attributes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you would add optional attributes and cache based on them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

K, this seems to be working. Just have to convert the categories to a tuple so that they're hashable. Thanks.

Type for categorical data with the categories and orderedness,
but not the values

Parameters
----------
categories : list or None
ordered : bool, default False

Notes
-----
`categories=None` means the categories are inferred by whatever
operation the type is passed to.

Examples
--------
>>> t = CategoricalType(categories=['b', 'a'], ordered=True)
>>> s = Series(['a', 'a', 'b', 'b', 'a'])
>>> s.astype(t)
0 a
1 a
2 b
3 b
4 a
dtype: category
Categories (2, object): [b < a]
"""
dtype = 'category'
name = 'category'

def __new__(cls, categories=None, ordered=False):
    """
    Build a new instance holding the given categories and orderedness.

    Parameters
    ----------
    categories : list or None
        Stored as-is on the instance.
    ordered : bool, default False
        Stored as-is on the instance.
    """
    instance = object.__new__(cls)
    instance.ordered = ordered
    instance.categories = categories
    # XXX: this is just for the repr, will move to base type
    repr_helper = Categorical(None, categories=categories, ordered=ordered)
    instance._categorical = repr_helper
    return instance

def __repr__(self):
    """Return ``<CategoricalType ...>`` wrapping the categories repr."""
    # The helper Categorical stored on the instance renders the categories.
    categories_repr = self._categorical._repr_categories()
    return "<CategoricalType {}>".format(categories_repr)
6 changes: 6 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
# may need to convert to categorical
# this is only called for non-categoricals
if self.is_categorical_astype(dtype):
kwargs = kwargs.copy()
categories = getattr(dtype, 'categories', None)
ordered = getattr(dtype, 'ordered', False)
# should we raise if CategoricalType and passed in kwargs?
kwargs.setdefault('categories', categories)
kwargs.setdefault('ordered', ordered)
return self.make_block(Categorical(self.values, **kwargs))

# astype processing
Expand Down
8 changes: 7 additions & 1 deletion pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from numpy import nan
import numpy as np

from pandas import Series
from pandas import Series, CategoricalType, Categorical
from pandas.tseries.index import Timestamp
from pandas.tseries.tdi import Timedelta

Expand Down Expand Up @@ -149,6 +149,12 @@ def test_astype_dict(self):
self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str})
self.assertRaises(KeyError, s.astype, {0: str})

def test_astype_categorical(self):
    """astype with a CategoricalType applies its categories and order."""
    s = Series(['a', 'b', 'a'])

    # categories given in the same order inference would produce
    result = s.astype(CategoricalType(['a', 'b'], ordered=True))
    expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
    assert_series_equal(result, expected)

    # categories given in a non-inferred order: this only passes when
    # astype really forwards the requested categories to Categorical
    result = s.astype(CategoricalType(['b', 'a'], ordered=True))
    expected = Series(Categorical(['a', 'b', 'a'], categories=['b', 'a'],
                                  ordered=True))
    assert_series_equal(result, expected)

def test_complexx(self):
# GH4819
# complex access for ndarray compat
Expand Down