Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Represent pandas ordered categoricals as ordinal data #2522

Merged
merged 7 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions altair/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,6 @@ def infer_vegalite_type(data):
# Otherwise, infer based on the dtype of the input
typ = infer_dtype(data)

# TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this refers to ordinal data. I could not find these specific tests, so I updated the one test that seemed relevant.


if typ in [
"floating",
"mixed-integer-float",
Expand All @@ -203,6 +201,8 @@ def infer_vegalite_type(data):
"complex",
]:
return "quantitative"
elif typ == "categorical" and data.cat.ordered:
return ("ordinal", data.cat.categories.tolist())
elif typ in ["string", "bytes", "categorical", "boolean", "mixed", "unicode"]:
return "nominal"
elif typ in [
Expand Down Expand Up @@ -316,8 +316,9 @@ def to_list_if_array(val):

for col_name, dtype in df.dtypes.items():
if str(dtype) == "category":
# XXXX: work around bug in to_json for categorical types
# Work around bug in to_json for categorical types in older versions of pandas
# https://github.com/pydata/pandas/issues/10778
# https://github.com/altair-viz/altair/pull/2170
col = df[col_name].astype(object)
df[col_name] = col.where(col.notnull(), None)
elif str(dtype) == "string":
Expand Down Expand Up @@ -527,6 +528,10 @@ def parse_shorthand(
if isinstance(data, pd.DataFrame) and "type" not in attrs:
if "field" in attrs and attrs["field"] in data.columns:
attrs["type"] = infer_vegalite_type(data[attrs["field"]])
# ordered categorical dataframe columns return the type and sort order as a tuple
if isinstance(attrs["type"], tuple):
attrs["sort"] = attrs["type"][1]
attrs["type"] = attrs["type"][0]
return attrs


Expand Down
8 changes: 8 additions & 0 deletions altair/utils/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,14 @@ def to_dict(self, validate=True, ignore=None, context=None):
# parsed_shorthand is removed from context if it exists so that it is
# not passed to child to_dict function calls
parsed_shorthand = context.pop("parsed_shorthand", {})
# Prevent pandas categorical data from being automatically sorted
# when a non-ordinal data type is specified manually
if "sort" in parsed_shorthand and kwds["type"] not in [
"ordinal",
Undefined,
]:
parsed_shorthand.pop("sort")

kwds.update(
{
k: v
Expand Down
12 changes: 7 additions & 5 deletions doc/user_guide/encodings/channels.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ We map the ``symbol`` variable to ``detail`` to use them to group lines.
Order
~~~~~

The `order` option and :class:`Order` channel can sort how marks are drawn on the chart.
The ``order`` option and :class:`Order` channel can sort how marks are drawn on the chart.

For stacked marks, this controls the order of components of the stack. Here, the elements of each bar are sorted alphabetically by the name of the nominal data in the color channel.

Expand Down Expand Up @@ -144,9 +144,6 @@ The order can be reversed by changing the sort option to `descending`.
order=alt.Order("site", sort="descending")
)

If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment <https://github.com/altair-viz/altair/issues/245#issuecomment-748443434>`_, although there might be edge cases where this is not fully supported. This also makes the order of the segments align with the order colors shows up in a legend that uses custom sorting for the color domain.


The same approach works for other mark types, like stacked area charts.

.. altair-plot::
Expand All @@ -163,7 +160,12 @@ The same approach works for other mark types, like stacked areas charts.
order=alt.Order("site", sort="ascending")
)

For line marks, the `order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes.
Note that unlike the ``sort`` parameter to positional encoding channels,
the :class:`Order` channel cannot take a list of values to sort by
and is not automatically sorted when an ordered pandas categorical column is passed.
If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment <https://github.com/altair-viz/altair/issues/245#issuecomment-748443434>`_, although there might be edge cases where this is not fully supported. This workaround also makes the order of the segments align with the order in which the colors show up in a legend that uses custom sorting for the color domain.

For line marks, the :class:`Order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes.

.. altair-plot::

Expand Down
16 changes: 10 additions & 6 deletions doc/user_guide/encodings/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -318,18 +318,22 @@ Sort Option
~~~~~~~~~~~

Some channels accept a ``sort`` option which determines the
order of the scale being used for the channel. There are a number of different
sort options available:
order of the scale being used for the channel.
By default, the scale is sorted in ascending alphabetical order,
unless an `ordered pandas categorical column <https://pandas.pydata.org/docs/user_guide/categorical.html?highlight=categorical#sorting-and-order>`_ is passed (without an explicit type specification),
in which case Altair will use the column's inherent order to sort the scale.
There are a number of different
options available to change the sort order:

- ``sort='ascending'`` (Default) will sort the field's value in ascending order.
for string data, this uses standard alphabetical order.
For string data, this uses standard alphabetical order.
- ``sort='descending'`` will sort the field's value in descending order
- passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for
- Passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for
sorting by that channel. An optional minus prefix can be used for a descending
sort. For example ``sort='-x'`` would sort by the x channel in descending order.
- passing a list to ``sort`` allows you to explicitly set the order in which
- Passing a list to ``sort`` allows you to explicitly set the order in which
you would like the encoding to appear
- passing a :class:`EncodingSortField` class to ``sort`` allows you to sort
- Passing an :class:`EncodingSortField` object to ``sort`` allows you to sort
an axis by the value of some other field in the dataset.

Here is an example of applying these five different sort approaches on the
Expand Down
22 changes: 18 additions & 4 deletions tests/vegalite/v5/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def test_chart_infer_types():
"x": pd.date_range("2012", periods=10, freq="Y"),
"y": range(10),
"c": list("abcabcabca"),
"s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True),
}
)

Expand All @@ -134,32 +135,45 @@ def _check_encodings(chart):
assert dct["encoding"]["y"]["field"] == "y"
assert dct["encoding"]["color"]["type"] == "nominal"
assert dct["encoding"]["color"]["field"] == "c"
assert dct["encoding"]["size"]["type"] == "ordinal"
assert dct["encoding"]["size"]["field"] == "s"
assert dct["encoding"]["size"]["sort"] == [2, 1]

# Pass field names by keyword
chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c")
chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s")
_check_encodings(chart)

# pass Channel objects by keyword
chart = (
alt.Chart(data)
.mark_point()
.encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"))
.encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s"))
)
_check_encodings(chart)

# pass Channel objects by value
chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c"))
chart = (
alt.Chart(data)
.mark_point()
.encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s"))
)
_check_encodings(chart)

# override default types
chart = (
alt.Chart(data)
.mark_point()
.encode(alt.X("x", type="nominal"), alt.Y("y", type="ordinal"))
.encode(
alt.X("x", type="nominal"),
alt.Y("y", type="ordinal"),
alt.Size("s", type="nominal"),
)
)
dct = chart.to_dict()
assert dct["encoding"]["x"]["type"] == "nominal"
assert dct["encoding"]["y"]["type"] == "ordinal"
assert dct["encoding"]["size"]["type"] == "nominal"
assert "sort" not in dct["encoding"]["size"]


@pytest.mark.parametrize(
Expand Down
8 changes: 8 additions & 0 deletions tools/schemapi/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,14 @@ def to_dict(self, validate=True, ignore=None, context=None):
# parsed_shorthand is removed from context if it exists so that it is
# not passed to child to_dict function calls
parsed_shorthand = context.pop("parsed_shorthand", {})
# Prevent pandas categorical data from being automatically sorted
# when a non-ordinal data type is specified manually
if "sort" in parsed_shorthand and kwds["type"] not in [
"ordinal",
Undefined,
]:
parsed_shorthand.pop("sort")

kwds.update(
{
k: v
Expand Down