From a74a60508edd844ef425637f27598f1d8c5385a2 Mon Sep 17 00:00:00 2001 From: oliverhiggs Date: Sun, 8 Sep 2024 05:13:56 +1000 Subject: [PATCH] Support additional dtypes in `resample` (#9413) * Support additional dtypes to resample pandas.BaseOffset, pandas.Timedelta, datetime.timedelta, and BaseCFTimeOffset are now all supported datatypes for resampling. * Update whats-new * Fix types * Add unit test * Fix test * Support more dtypes for CFTimeIndex resampling * Tidy resample type hints * Fix some mypy bugs * Fixes * Fix tests * WIP * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update doc/whats-new.rst * Apply suggestions from code review Co-authored-by: Spencer Clark * Fix mypy error * Fix bad edit --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian Co-authored-by: Spencer Clark --- doc/whats-new.rst | 4 ++ xarray/coding/cftime_offsets.py | 42 +++++++++++++-- xarray/core/common.py | 16 +++--- xarray/core/dataarray.py | 9 ++-- xarray/core/dataset.py | 9 ++-- xarray/core/resample_cftime.py | 4 +- xarray/core/types.py | 2 + xarray/groupers.py | 20 +++++-- xarray/tests/test_groupby.py | 95 +++++++++++++++++++++++++++++---- 9 files changed, 168 insertions(+), 33 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 54fea2b73ea..868478260f4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -78,6 +78,10 @@ Bug fixes - Fix deprecation warning that was raised when calling ``np.array`` on an ``xr.DataArray`` in NumPy 2.0 (:issue:`9312`, :pull:`9393`) By `Andrew Scherer `_. +- Fix support for using ``pandas.DateOffset``, ``pandas.Timedelta``, and + ``datetime.timedelta`` objects as ``resample`` frequencies + (:issue:`9408`, :pull:`9413`). + By `Oliver Higgs `_. 
Performance ~~~~~~~~~~~ diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index f7bed2c13ef..0167119e98e 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -47,7 +47,7 @@ from collections.abc import Mapping from datetime import datetime, timedelta from functools import partial -from typing import TYPE_CHECKING, ClassVar, Literal +from typing import TYPE_CHECKING, ClassVar, Literal, TypeVar import numpy as np import pandas as pd @@ -80,6 +80,7 @@ DayOption: TypeAlias = Literal["start", "end"] +T_FreqStr = TypeVar("T_FreqStr", str, None) def _nanosecond_precision_timestamp(*args, **kwargs): @@ -772,11 +773,18 @@ def _emit_freq_deprecation_warning(deprecated_freq): emit_user_level_warning(message, FutureWarning) -def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffset: +def to_offset( + freq: BaseCFTimeOffset | str | timedelta | pd.Timedelta | pd.DateOffset, + warn: bool = True, +) -> BaseCFTimeOffset: """Convert a frequency string to the appropriate subclass of BaseCFTimeOffset.""" if isinstance(freq, BaseCFTimeOffset): return freq + if isinstance(freq, timedelta | pd.Timedelta): + return delta_to_tick(freq) + if isinstance(freq, pd.DateOffset): + freq = _legacy_to_new_freq(freq.freqstr) match = re.match(_PATTERN, freq) if match is None: @@ -791,6 +799,34 @@ def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffs return _FREQUENCIES[freq](n=multiples) +def delta_to_tick(delta: timedelta | pd.Timedelta) -> Tick: + """Adapted from pandas.tslib.delta_to_tick""" + if isinstance(delta, pd.Timedelta) and delta.nanoseconds != 0: + # pandas.Timedelta has nanoseconds, but these are not supported + raise ValueError( + "Unable to convert 'pandas.Timedelta' object with non-zero " + "nanoseconds to 'CFTimeOffset' object" + ) + if delta.microseconds == 0: + if delta.seconds == 0: + return Day(n=delta.days) + else: + seconds = delta.days * 86400 + delta.seconds + if 
seconds % 3600 == 0: + return Hour(n=seconds // 3600) + elif seconds % 60 == 0: + return Minute(n=seconds // 60) + else: + return Second(n=seconds) + else: + # NOTE(review): only the sub-second ``microseconds`` component is used here, + # so any days/seconds in ``delta`` are dropped from ``n`` -- confirm intended + if delta.microseconds % 1_000 == 0: + return Millisecond(n=delta.microseconds // 1_000) + else: + return Microsecond(n=delta.microseconds) + + def to_cftime_datetime(date_str_or_date, calendar=None): if cftime is None: raise ModuleNotFoundError("No module named 'cftime'") @@ -1332,7 +1368,7 @@ def _new_to_legacy_freq(freq): return freq -def _legacy_to_new_freq(freq): +def _legacy_to_new_freq(freq: T_FreqStr) -> T_FreqStr: # to avoid internal deprecation warnings when freq is determined using pandas < 2.2 # TODO: remove once requiring pandas >= 2.2 diff --git a/xarray/core/common.py b/xarray/core/common.py index 74c03f9baf5..1ed1398746f 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import warnings from collections.abc import Callable, Hashable, Iterable, Iterator, Mapping from contextlib import suppress @@ -13,6 +14,7 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs +from xarray.core.types import ResampleCompatible from xarray.core.utils import ( Frozen, either_dict_or_kwargs, @@ -32,8 +34,6 @@ if TYPE_CHECKING: - import datetime - from numpy.typing import DTypeLike from xarray.core.dataarray import DataArray @@ -891,14 +891,14 @@ def rolling_exp( def _resample( self, resample_cls: type[T_Resample], - indexer: Mapping[Hashable, str | Resampler] | None, + indexer: Mapping[Hashable, ResampleCompatible | Resampler] | None, skipna: bool | None, closed: SideOptions | None, label: SideOptions | None, offset: pd.Timedelta | datetime.timedelta | str | None, origin: str | DatetimeLike,
restore_coord_dims: bool | None, - **indexer_kwargs: str | Resampler, + **indexer_kwargs: ResampleCompatible | Resampler, ) -> T_Resample: """Returns a Resample object for performing resampling operations. @@ -1078,14 +1078,18 @@ def _resample( ) grouper: Resampler - if isinstance(freq, str): + if isinstance(freq, ResampleCompatible): grouper = TimeResampler( freq=freq, closed=closed, label=label, origin=origin, offset=offset ) elif isinstance(freq, Resampler): grouper = freq else: - raise ValueError("freq must be a str or a Resampler object") + raise ValueError( + "freq must be an object of type 'str', 'datetime.timedelta', " + "'pandas.Timedelta', 'pandas.DateOffset', or 'TimeResampler'. " + f"Received {type(freq)} instead." + ) rgrouper = ResolvedGrouper(grouper, group, self) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e3ba92c21cd..a0e34e8f9cc 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -111,6 +111,7 @@ QueryEngineOptions, QueryParserOptions, ReindexMethodOptions, + ResampleCompatible, Self, SideOptions, T_ChunkDimFreq, @@ -7269,7 +7270,7 @@ def coarsen( @_deprecate_positional_args("v2024.07.0") def resample( self, - indexer: Mapping[Hashable, str | Resampler] | None = None, + indexer: Mapping[Hashable, ResampleCompatible | Resampler] | None = None, *, skipna: bool | None = None, closed: SideOptions | None = None, @@ -7277,7 +7278,7 @@ def resample( offset: pd.Timedelta | datetime.timedelta | str | None = None, origin: str | DatetimeLike = "start_day", restore_coord_dims: bool | None = None, - **indexer_kwargs: str | Resampler, + **indexer_kwargs: ResampleCompatible | Resampler, ) -> DataArrayResample: """Returns a Resample object for performing resampling operations. 
@@ -7288,7 +7289,7 @@ def resample( Parameters ---------- - indexer : Mapping of Hashable to str, optional + indexer : Mapping of Hashable to str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler, optional Mapping from the dimension name to resample frequency [1]_. The dimension must be datetime-like. skipna : bool, optional @@ -7312,7 +7313,7 @@ def resample( restore_coord_dims : bool, optional If True, also restore the dimension order of multi-dimensional coordinates. - **indexer_kwargs : str + **indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler The keyword arguments form of ``indexer``. One of indexer or indexer_kwargs must be provided. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a7b52dc0185..671df273759 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -163,6 +163,7 @@ QueryEngineOptions, QueryParserOptions, ReindexMethodOptions, + ResampleCompatible, SideOptions, T_ChunkDimFreq, T_DatasetPadConstantValues, @@ -10710,7 +10711,7 @@ def coarsen( @_deprecate_positional_args("v2024.07.0") def resample( self, - indexer: Mapping[Any, str | Resampler] | None = None, + indexer: Mapping[Any, ResampleCompatible | Resampler] | None = None, *, skipna: bool | None = None, closed: SideOptions | None = None, @@ -10718,7 +10719,7 @@ def resample( offset: pd.Timedelta | datetime.timedelta | str | None = None, origin: str | DatetimeLike = "start_day", restore_coord_dims: bool | None = None, - **indexer_kwargs: str | Resampler, + **indexer_kwargs: ResampleCompatible | Resampler, ) -> DatasetResample: """Returns a Resample object for performing resampling operations. @@ -10729,7 +10730,7 @@ def resample( Parameters ---------- - indexer : Mapping of Hashable to str, optional + indexer : Mapping of Hashable to str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler, optional Mapping from the dimension name to resample frequency [1]_. The dimension must be datetime-like. 
skipna : bool, optional @@ -10753,7 +10754,7 @@ def resample( restore_coord_dims : bool, optional If True, also restore the dimension order of multi-dimensional coordinates. - **indexer_kwargs : str + **indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler The keyword arguments form of ``indexer``. One of indexer or indexer_kwargs must be provided. diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 2149a62dfb5..c084640e763 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -58,7 +58,7 @@ from xarray.core.types import SideOptions if typing.TYPE_CHECKING: - from xarray.core.types import CFTimeDatetime + from xarray.core.types import CFTimeDatetime, ResampleCompatible class CFTimeGrouper: @@ -75,7 +75,7 @@ class CFTimeGrouper: def __init__( self, - freq: str | BaseCFTimeOffset, + freq: ResampleCompatible | BaseCFTimeOffset, closed: SideOptions | None = None, label: SideOptions | None = None, origin: str | CFTimeDatetime = "start_day", diff --git a/xarray/core/types.py b/xarray/core/types.py index a9c2771cb9f..34b6029ee15 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -318,3 +318,5 @@ def copy( Bins = Union[ int, Sequence[int], Sequence[float], Sequence[pd.Timestamp], np.ndarray, pd.Index ] + +ResampleCompatible: TypeAlias = str | datetime.timedelta | pd.Timedelta | pd.DateOffset diff --git a/xarray/groupers.py b/xarray/groupers.py index 6a47c609422..9c24a96077f 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -14,14 +14,20 @@ import numpy as np import pandas as pd -from xarray.coding.cftime_offsets import _new_to_legacy_freq +from xarray.coding.cftime_offsets import BaseCFTimeOffset, _new_to_legacy_freq from xarray.core import duck_array_ops from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.groupby import T_Group, _DummyGroup from xarray.core.indexes import safe_cast_to_index from 
xarray.core.resample_cftime import CFTimeGrouper -from xarray.core.types import Bins, DatetimeLike, GroupIndices, SideOptions +from xarray.core.types import ( + Bins, + DatetimeLike, + GroupIndices, + ResampleCompatible, + SideOptions, +) from xarray.core.variable import Variable __all__ = [ @@ -336,7 +342,7 @@ class TimeResampler(Resampler): Attributes ---------- - freq : str + freq : str, datetime.timedelta, pandas.Timedelta, or pandas.DateOffset Frequency to resample to. See `Pandas frequency aliases `_ for a list of possible values. @@ -358,7 +364,7 @@ class TimeResampler(Resampler): An offset timedelta added to the origin. """ - freq: str + freq: ResampleCompatible closed: SideOptions | None = field(default=None) label: SideOptions | None = field(default=None) origin: str | DatetimeLike = field(default="start_day") @@ -388,6 +394,12 @@ def _init_properties(self, group: T_Group) -> None: offset=offset, ) else: + if isinstance(self.freq, BaseCFTimeOffset): + raise ValueError( + "'BaseCFTimeOffset' resample frequencies are only supported " + "when resampling a 'CFTimeIndex'" + ) + self.index_grouper = pd.Grouper( # TODO remove once requiring pandas >= 2.2 freq=_new_to_legacy_freq(self.freq), diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index a6de4697b8a..906a015544b 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import operator import warnings from unittest import mock @@ -13,7 +14,7 @@ from xarray import DataArray, Dataset, Variable from xarray.core.alignment import broadcast from xarray.core.groupby import _consolidate_slices -from xarray.core.types import InterpOptions +from xarray.core.types import InterpOptions, ResampleCompatible from xarray.groupers import ( BinGrouper, EncodedGroups, @@ -757,7 +758,6 @@ def test_groupby_none_group_name() -> None: def test_groupby_getitem(dataset) -> None: - assert_identical(dataset.sel(x=["a"]),
dataset.groupby("x")["a"]) assert_identical(dataset.sel(z=[1]), dataset.groupby("z")[1]) assert_identical(dataset.foo.sel(x=["a"]), dataset.foo.groupby("x")["a"]) @@ -1773,7 +1773,21 @@ def test_groupby_fastpath_for_monotonic(self, use_flox: bool) -> None: class TestDataArrayResample: @pytest.mark.parametrize("use_cftime", [True, False]) - def test_resample(self, use_cftime: bool) -> None: + @pytest.mark.parametrize( + "resample_freq", + [ + "24h", + "123456s", + "1234567890us", + pd.Timedelta(hours=2), + pd.offsets.MonthBegin(), + pd.offsets.Second(123456), + datetime.timedelta(days=1, hours=6), + ], + ) + def test_resample( + self, use_cftime: bool, resample_freq: ResampleCompatible + ) -> None: if use_cftime and not has_cftime: pytest.skip() times = xr.date_range( @@ -1795,23 +1809,23 @@ def resample_as_pandas(array, *args, **kwargs): array = DataArray(np.arange(10), [("time", times)]) - actual = array.resample(time="24h").mean() - expected = resample_as_pandas(array, "24h") + actual = array.resample(time=resample_freq).mean() + expected = resample_as_pandas(array, resample_freq) assert_identical(expected, actual) - actual = array.resample(time="24h").reduce(np.mean) + actual = array.resample(time=resample_freq).reduce(np.mean) assert_identical(expected, actual) - actual = array.resample(time="24h", closed="right").mean() - expected = resample_as_pandas(array, "24h", closed="right") + actual = array.resample(time=resample_freq, closed="right").mean() + expected = resample_as_pandas(array, resample_freq, closed="right") assert_identical(expected, actual) with pytest.raises(ValueError, match=r"Index must be monotonic"): - array[[2, 0, 1]].resample(time="1D") + array[[2, 0, 1]].resample(time=resample_freq) reverse = array.isel(time=slice(-1, None, -1)) with pytest.raises(ValueError): - reverse.resample(time="1D").mean() + reverse.resample(time=resample_freq).mean() @pytest.mark.parametrize("use_cftime", [True, False]) def test_resample_doctest(self, use_cftime: 
bool) -> None: @@ -2206,6 +2220,67 @@ def test_resample_origin(self) -> None: class TestDatasetResample: + @pytest.mark.parametrize("use_cftime", [True, False]) + @pytest.mark.parametrize( + "resample_freq", + [ + "24h", + "123456s", + "1234567890us", + pd.Timedelta(hours=2), + pd.offsets.MonthBegin(), + pd.offsets.Second(123456), + datetime.timedelta(days=1, hours=6), + ], + ) + def test_resample( + self, use_cftime: bool, resample_freq: ResampleCompatible + ) -> None: + if use_cftime and not has_cftime: + pytest.skip() + times = xr.date_range( + "2000-01-01", freq="6h", periods=10, use_cftime=use_cftime + ) + + def resample_as_pandas(ds, *args, **kwargs): + ds_ = ds.copy(deep=True) + if use_cftime: + ds_["time"] = times.to_datetimeindex() + result = Dataset.from_dataframe( + ds_.to_dataframe().resample(*args, **kwargs).mean() + ) + if use_cftime: + result = result.convert_calendar( + calendar="standard", use_cftime=use_cftime + ) + return result + + ds = Dataset( + { + "foo": ("time", np.random.randint(1, 1000, 10)), + "bar": ("time", np.random.randint(1, 1000, 10)), + "time": times, + } + ) + + actual = ds.resample(time=resample_freq).mean() + expected = resample_as_pandas(ds, resample_freq) + assert_identical(expected, actual) + + actual = ds.resample(time=resample_freq).reduce(np.mean) + assert_identical(expected, actual) + + actual = ds.resample(time=resample_freq, closed="right").mean() + expected = resample_as_pandas(ds, resample_freq, closed="right") + assert_identical(expected, actual) + + with pytest.raises(ValueError, match=r"Index must be monotonic"): + ds.isel(time=[2, 0, 1]).resample(time=resample_freq) + + reverse = ds.isel(time=slice(-1, None, -1)) + with pytest.raises(ValueError): + reverse.resample(time=resample_freq).mean() + def test_resample_and_first(self) -> None: times = pd.date_range("2000-01-01", freq="6h", periods=10) ds = Dataset(