diff --git a/altair/utils/core.py b/altair/utils/core.py index 53785b174..c47b9a04d 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -193,8 +193,6 @@ def infer_vegalite_type(data): # Otherwise, infer based on the dtype of the input typ = infer_dtype(data) - # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py - if typ in [ "floating", "mixed-integer-float", @@ -203,6 +201,8 @@ def infer_vegalite_type(data): "complex", ]: return "quantitative" + elif typ == "categorical" and data.cat.ordered: + return ("ordinal", data.cat.categories.tolist()) elif typ in ["string", "bytes", "categorical", "boolean", "mixed", "unicode"]: return "nominal" elif typ in [ @@ -316,8 +316,9 @@ def to_list_if_array(val): for col_name, dtype in df.dtypes.items(): if str(dtype) == "category": - # XXXX: work around bug in to_json for categorical types + # Work around bug in to_json for categorical types in older versions of pandas # https://github.com/pydata/pandas/issues/10778 + # https://github.com/altair-viz/altair/pull/2170 col = df[col_name].astype(object) df[col_name] = col.where(col.notnull(), None) elif str(dtype) == "string": @@ -527,6 +528,10 @@ def parse_shorthand( if isinstance(data, pd.DataFrame) and "type" not in attrs: if "field" in attrs and attrs["field"] in data.columns: attrs["type"] = infer_vegalite_type(data[attrs["field"]]) + # ordered categorical dataframe columns return the type and sort order as a tuple + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] return attrs diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b9f8bae9a..fab24a4ad 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -364,6 +364,14 @@ def to_dict(self, validate=True, ignore=None, context=None): # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) + # Prevent that pandas categorical data is automatically sorted + # when a non-ordinal data type is specifed manually + if "sort" in parsed_shorthand and kwds["type"] not in [ + "ordinal", + Undefined, + ]: + parsed_shorthand.pop("sort") + kwds.update( { k: v diff --git a/doc/user_guide/encodings/channels.rst b/doc/user_guide/encodings/channels.rst index 057db9c71..073580b21 100644 --- a/doc/user_guide/encodings/channels.rst +++ b/doc/user_guide/encodings/channels.rst @@ -110,7 +110,7 @@ We map the ``symbol`` variable to ``detail`` to use them to group lines. Order ~~~~~ -The `order` option and :class:`Order` channel can sort how marks are drawn on the chart. +The ``order`` option and :class:`Order` channel can sort how marks are drawn on the chart. For stacked marks, this controls the order of components of the stack. Here, the elements of each bar are sorted alphabetically by the name of the nominal data in the color channel. @@ -144,9 +144,6 @@ The order can be reversed by changing the sort option to `descending`. order=alt.Order("site", sort="descending") ) -If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment `_, although there might be edge cases where this is not fully supported. This also makes the order of the segments align with the order colors shows up in a legend that uses custom sorting for the color domain. - - The same approach works for other mark types, like stacked areas charts. .. altair-plot:: @@ -163,7 +160,12 @@ The same approach works for other mark types, like stacked areas charts. order=alt.Order("site", sort="ascending") ) -For line marks, the `order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes. +Note that unlike the ``sort`` parameter to positional encoding channels, +the :class:`Order` channel cannot take a list of values to sort by +and is not automatically sorted when an ordered pandas categorical column is passed. +If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment `_, although there might be edge cases where this is not fully supported. This workaround also makes the order of the segments align with the order that the colors shows up in a legend that uses custom sorting for the color domain. + +For line marks, the :class:`Order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes. .. altair-plot:: diff --git a/doc/user_guide/encodings/index.rst b/doc/user_guide/encodings/index.rst index b46acf6b6..32073259e 100644 --- a/doc/user_guide/encodings/index.rst +++ b/doc/user_guide/encodings/index.rst @@ -318,18 +318,22 @@ Sort Option ~~~~~~~~~~~ Some channels accept a :class:`sort` option which determines the -order of the scale being used for the channel. There are a number of different -sort options available: +order of the scale being used for the channel. +By default the scale is sorted in ascending alphabetical order, +unless an `ordered pandas categorical column `_ is passed (without an explicit type specification) +in which case Altair will use the column's inherent order to sort the scale. +There are a number of different +options available to change the sort order: - ``sort='ascending'`` (Default) will sort the field's value in ascending order. - for string data, this uses standard alphabetical order. + For string data, this uses standard alphabetical order. - ``sort='descending'`` will sort the field's value in descending order -- passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for +- Passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for sorting by that channel. An optional minus prefix can be used for a descending sort. For example ``sort='-x'`` would sort by the x channel in descending order. -- passing a list to ``sort`` allows you to explicitly set the order in which +- Passing a list to ``sort`` allows you to explicitly set the order in which you would like the encoding to appear -- passing a :class:`EncodingSortField` class to ``sort`` allows you to sort +- Passing a :class:`EncodingSortField` class to ``sort`` allows you to sort an axis by the value of some other field in the dataset. Here is an example of applying these five different sort approaches on the diff --git a/tests/vegalite/v5/tests/test_api.py b/tests/vegalite/v5/tests/test_api.py index b2e5aa159..12e8b6668 100644 --- a/tests/vegalite/v5/tests/test_api.py +++ b/tests/vegalite/v5/tests/test_api.py @@ -123,6 +123,7 @@ def test_chart_infer_types(): "x": pd.date_range("2012", periods=10, freq="Y"), "y": range(10), "c": list("abcabcabca"), + "s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True), } ) @@ -134,32 +135,45 @@ def _check_encodings(chart): assert dct["encoding"]["y"]["field"] == "y" assert dct["encoding"]["color"]["type"] == "nominal" assert dct["encoding"]["color"]["field"] == "c" + assert dct["encoding"]["size"]["type"] == "ordinal" + assert dct["encoding"]["size"]["field"] == "s" + assert dct["encoding"]["size"]["sort"] == [2, 1] # Pass field names by keyword - chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c") + chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s") _check_encodings(chart) # pass Channel objects by keyword chart = ( alt.Chart(data) .mark_point() - .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c")) + .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s")) ) _check_encodings(chart) # pass Channel objects by value - chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c")) + chart = ( + alt.Chart(data) + .mark_point() + .encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s")) + ) _check_encodings(chart) # override default types chart = ( alt.Chart(data) .mark_point() - .encode(alt.X("x", type="nominal"), alt.Y("y", type="ordinal")) + .encode( + alt.X("x", type="nominal"), + alt.Y("y", type="ordinal"), + alt.Size("s", type="nominal"), + ) ) dct = chart.to_dict() assert dct["encoding"]["x"]["type"] == "nominal" assert dct["encoding"]["y"]["type"] == "ordinal" + assert dct["encoding"]["size"]["type"] == "nominal" + assert "sort" not in dct["encoding"]["size"] @pytest.mark.parametrize( diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index aeaf156cf..d04bdf2df 100644 --- a/tools/schemapi/schemapi.py +++ b/tools/schemapi/schemapi.py @@ -362,6 +362,14 @@ def to_dict(self, validate=True, ignore=None, context=None): # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) + # Prevent that pandas categorical data is automatically sorted + # when a non-ordinal data type is specifed manually + if "sort" in parsed_shorthand and kwds["type"] not in [ + "ordinal", + Undefined, + ]: + parsed_shorthand.pop("sort") + kwds.update( { k: v