Skip to content

Commit

Permalink
Represent pandas ordered categoricals as ordinal data (#2522)
Browse files Browse the repository at this point in the history
* Represent pandas ordered categoricals as ordinal data

* Move new test to v5 from v4

* Add notes about categorical sorting to the docs

* Note that specifying the type explicitly removes the autodetection of the order

* Remove automatic sort order of categorical data if a non-ordinal type is specified

* Update altair/utils/core.py

Co-authored-by: Mattijn van Hoek <[email protected]>
  • Loading branch information
joelostblom and mattijn authored Jan 25, 2023
1 parent 1f6d1c9 commit 97ff1eb
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 18 deletions.
11 changes: 8 additions & 3 deletions altair/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,6 @@ def infer_vegalite_type(data):
# Otherwise, infer based on the dtype of the input
typ = infer_dtype(data)

# TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py

if typ in [
"floating",
"mixed-integer-float",
Expand All @@ -203,6 +201,8 @@ def infer_vegalite_type(data):
"complex",
]:
return "quantitative"
elif typ == "categorical" and data.cat.ordered:
return ("ordinal", data.cat.categories.tolist())
elif typ in ["string", "bytes", "categorical", "boolean", "mixed", "unicode"]:
return "nominal"
elif typ in [
Expand Down Expand Up @@ -316,8 +316,9 @@ def to_list_if_array(val):

for col_name, dtype in df.dtypes.items():
if str(dtype) == "category":
# XXXX: work around bug in to_json for categorical types
# Work around bug in to_json for categorical types in older versions of pandas
# https://github.com/pydata/pandas/issues/10778
# https://github.com/altair-viz/altair/pull/2170
col = df[col_name].astype(object)
df[col_name] = col.where(col.notnull(), None)
elif str(dtype) == "string":
Expand Down Expand Up @@ -527,6 +528,10 @@ def parse_shorthand(
if isinstance(data, pd.DataFrame) and "type" not in attrs:
if "field" in attrs and attrs["field"] in data.columns:
attrs["type"] = infer_vegalite_type(data[attrs["field"]])
# ordered categorical dataframe columns return the type and sort order as a tuple
if isinstance(attrs["type"], tuple):
attrs["sort"] = attrs["type"][1]
attrs["type"] = attrs["type"][0]
return attrs


Expand Down
8 changes: 8 additions & 0 deletions altair/utils/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,14 @@ def to_dict(self, validate=True, ignore=None, context=None):
# parsed_shorthand is removed from context if it exists so that it is
# not passed to child to_dict function calls
parsed_shorthand = context.pop("parsed_shorthand", {})
# Prevent pandas categorical data from being automatically sorted
# when a non-ordinal data type is specified manually
if "sort" in parsed_shorthand and kwds["type"] not in [
"ordinal",
Undefined,
]:
parsed_shorthand.pop("sort")

kwds.update(
{
k: v
Expand Down
12 changes: 7 additions & 5 deletions doc/user_guide/encodings/channels.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ We map the ``symbol`` variable to ``detail`` to use them to group lines.
Order
~~~~~

The `order` option and :class:`Order` channel can sort how marks are drawn on the chart.
The ``order`` option and :class:`Order` channel can sort how marks are drawn on the chart.

For stacked marks, this controls the order of components of the stack. Here, the elements of each bar are sorted alphabetically by the name of the nominal data in the color channel.

Expand Down Expand Up @@ -144,9 +144,6 @@ The order can be reversed by changing the sort option to `descending`.
order=alt.Order("site", sort="descending")
)

If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment <https://github.com/altair-viz/altair/issues/245#issuecomment-748443434>`_, although there might be edge cases where this is not fully supported. This also makes the order of the segments align with the order colors shows up in a legend that uses custom sorting for the color domain.


The same approach works for other mark types, like stacked area charts.

.. altair-plot::
Expand All @@ -163,7 +160,12 @@ The same approach works for other mark types, like stacked areas charts.
order=alt.Order("site", sort="ascending")
)

For line marks, the `order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes.
Note that unlike the ``sort`` parameter to positional encoding channels,
the :class:`Order` channel cannot take a list of values to sort by
and is not automatically sorted when an ordered pandas categorical column is passed.
If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment <https://github.com/altair-viz/altair/issues/245#issuecomment-748443434>`_, although there might be edge cases where this is not fully supported. This workaround also makes the order of the segments align with the order in which the colors show up in a legend that uses custom sorting for the color domain.

For line marks, the :class:`Order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes.

.. altair-plot::

Expand Down
16 changes: 10 additions & 6 deletions doc/user_guide/encodings/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -318,18 +318,22 @@ Sort Option
~~~~~~~~~~~

Some channels accept a :class:`sort` option which determines the
order of the scale being used for the channel. There are a number of different
sort options available:
order of the scale being used for the channel.
By default the scale is sorted in ascending alphabetical order,
unless an `ordered pandas categorical column <https://pandas.pydata.org/docs/user_guide/categorical.html?highlight=categorical#sorting-and-order>`_ is passed (without an explicit type specification)
in which case Altair will use the column's inherent order to sort the scale.
There are a number of different
options available to change the sort order:

- ``sort='ascending'`` (Default) will sort the field's value in ascending order.
for string data, this uses standard alphabetical order.
For string data, this uses standard alphabetical order.
- ``sort='descending'`` will sort the field's value in descending order
- passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for
- Passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for
sorting by that channel. An optional minus prefix can be used for a descending
sort. For example ``sort='-x'`` would sort by the x channel in descending order.
- passing a list to ``sort`` allows you to explicitly set the order in which
- Passing a list to ``sort`` allows you to explicitly set the order in which
you would like the encoding to appear
- passing a :class:`EncodingSortField` class to ``sort`` allows you to sort
- Passing an :class:`EncodingSortField` object to ``sort`` allows you to sort
an axis by the value of some other field in the dataset.

Here is an example of applying these five different sort approaches on the
Expand Down
22 changes: 18 additions & 4 deletions tests/vegalite/v5/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def test_chart_infer_types():
"x": pd.date_range("2012", periods=10, freq="Y"),
"y": range(10),
"c": list("abcabcabca"),
"s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True),
}
)

Expand All @@ -134,32 +135,45 @@ def _check_encodings(chart):
assert dct["encoding"]["y"]["field"] == "y"
assert dct["encoding"]["color"]["type"] == "nominal"
assert dct["encoding"]["color"]["field"] == "c"
assert dct["encoding"]["size"]["type"] == "ordinal"
assert dct["encoding"]["size"]["field"] == "s"
assert dct["encoding"]["size"]["sort"] == [2, 1]

# Pass field names by keyword
chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c")
chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s")
_check_encodings(chart)

# pass Channel objects by keyword
chart = (
alt.Chart(data)
.mark_point()
.encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"))
.encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s"))
)
_check_encodings(chart)

# pass Channel objects by value
chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c"))
chart = (
alt.Chart(data)
.mark_point()
.encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s"))
)
_check_encodings(chart)

# override default types
chart = (
alt.Chart(data)
.mark_point()
.encode(alt.X("x", type="nominal"), alt.Y("y", type="ordinal"))
.encode(
alt.X("x", type="nominal"),
alt.Y("y", type="ordinal"),
alt.Size("s", type="nominal"),
)
)
dct = chart.to_dict()
assert dct["encoding"]["x"]["type"] == "nominal"
assert dct["encoding"]["y"]["type"] == "ordinal"
assert dct["encoding"]["size"]["type"] == "nominal"
assert "sort" not in dct["encoding"]["size"]


@pytest.mark.parametrize(
Expand Down
8 changes: 8 additions & 0 deletions tools/schemapi/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,14 @@ def to_dict(self, validate=True, ignore=None, context=None):
# parsed_shorthand is removed from context if it exists so that it is
# not passed to child to_dict function calls
parsed_shorthand = context.pop("parsed_shorthand", {})
# Prevent pandas categorical data from being automatically sorted
# when a non-ordinal data type is specified manually
if "sort" in parsed_shorthand and kwds["type"] not in [
"ordinal",
Undefined,
]:
parsed_shorthand.pop("sort")

kwds.update(
{
k: v
Expand Down

0 comments on commit 97ff1eb

Please sign in to comment.