Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor base indicator classes #1446

Merged
merged 10 commits into from
Aug 31, 2023
8 changes: 6 additions & 2 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ New features and enhancements
* Added ``ensembles.hawkins_sutton`` method to partition the uncertainty sources in a climate projection ensemble. (:issue:`771`, :pull:`1262`).
* New function ``xclim.core.calendar.convert_doy`` to transform day-of-year data between calendars. Also accessible from ``convert_calendar`` with ``doy=True``. (:issue:`1283`, :pull:`1406`).
* Add support for setting optional variables through the `ds` argument. (:issue:`1432`, :pull:`1435`).
* New ``xclim.core.calendar.offset_is_divisor`` to test if a given freq divides another one evenly (:pull:`1446`).
* Missing value objects now support input timeseries of quarterly and yearly frequencies (:pull:`1446`).
* Missing value checks enabled for all "generic" indicators (``return_level``, ``fit`` and ``stats``) (:pull:`1446`).

Bug fixes
^^^^^^^^^
* Fix `kldiv` docstring so the math formula renders to HTML. (:issue:`1408`, :pull:`1409`).
* Fix ``kldiv`` docstring so the math formula renders to HTML. (:issue:`1408`, :pull:`1409`).
* Fix the registry entries of "generic" indicators. (:issue:`1423`, :pull:`1424`).
* Fix `jetstream_metric_woollings` so it uses the `vertical` coordinate identified by `cf-xarray`, instead of `pressure`. (:issue:`1421`, :pull:`1422`). Add logic to handle coordinates in decreasing order, or for longitudes defined from 0-360 instead of -180 to 180. (:issue:`1429`, :pull:`1430`).
* Fix ``jetstream_metric_woollings`` so it uses the `vertical` coordinate identified by `cf-xarray`, instead of `pressure`. (:issue:`1421`, :pull:`1422`). Add logic to handle coordinates in decreasing order, or for longitudes defined from 0-360 instead of -180 to 180. (:issue:`1429`, :pull:`1430`).
* Fix virtual indicator attribute assignment causing individual indicator's realm to be ignored. (:issue:`1425`, :pull:`1426`).

Internal changes
Expand All @@ -25,6 +28,7 @@ Internal changes
* Increased the guess of number of quantiles needed in ExtremeValues. (:pull:`1413`).
* Tolerance thresholds for error in ``test_processing::test_adapt_freq`` have been relaxed to allow for more variation in the results. (:issue:`1417`, :pull:`1418`).
* Added 'streamflow' to the list of known variables (:pull:`1431`).
* Refactor base indicator classes and fix misleading inheritance of ``return_level`` (:issue:`1263`, :pull:`1446`).

v0.44.0 (2023-06-23)
--------------------
Expand Down
14 changes: 11 additions & 3 deletions tests/test_generic_indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,18 @@ def test_simple(self, pr_ndseries, random):
ts = generic.stats(pr, freq="YS", op="max")
p = generic.fit(ts, dist="gumbel_r")
assert p.attrs["estimator"] == "Maximum likelihood"
assert "time" not in p.dims

def test_nan(self, pr_series, random):
r = random.random(22)
r[0] = np.nan
pr = pr_series(r)

out = generic.fit(pr, dist="norm")
assert not np.isnan(out.values[0])
assert np.isnan(out.values[0])
with set_options(check_missing="skip"):
out = generic.fit(pr, dist="norm")
assert not np.isnan(out.values[0])

def test_ndim(self, pr_ndseries, random):
pr = pr_ndseries(random.random((100, 1, 2)))
Expand All @@ -28,6 +32,9 @@ def test_ndim(self, pr_ndseries, random):

def test_options(self, q_series, random):
q = q_series(random.random(19))
out = generic.fit(q, dist="norm")
np.testing.assert_array_equal(out.isnull(), False)

with set_options(missing_options={"at_least_n": {"n": 10}}):
out = generic.fit(q, dist="norm")
np.testing.assert_array_equal(out.isnull(), False)
Expand Down Expand Up @@ -87,8 +94,9 @@ def test_ndq(self, ndq_series):
assert out.attrs["units"] == "m3 s-1"

def test_missing(self, ndq_series):
a = ndq_series
a = ndq_series.where(~((a.time.dt.dayofyear == 5) * (a.time.dt.year == 1902)))
a = ndq_series.where(
~((ndq_series.time.dt.dayofyear == 5) & (ndq_series.time.dt.year == 1902))
)
assert a.shape == (5000, 2, 3)
out = generic.stats(a, op="max", month=1)

Expand Down
22 changes: 22 additions & 0 deletions tests/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,21 @@ def test_monthly_input(self, random):
mb = missing.MissingBase(ts, freq="AS", src_timestep="M", season="JJA")
assert mb.count == 3

def test_seasonal_input(self, random):
"""Creating array with 11 seasons."""
n = 11
time = xr.cftime_range(start="2002-04-01", periods=n, freq="QS-JAN")
ts = xr.DataArray(random.random(n), dims="time", coords={"time": time})
mb = missing.MissingBase(ts, freq="YS", src_timestep="QS-JAN")
# Make sure count is 12, because we're requesting a YS freq.
np.testing.assert_array_equal(mb.count, [4, 4, 4, 1])

with pytest.raises(
NotImplementedError,
match="frequency that is not aligned with the source timestep.",
):
missing.MissingBase(ts, freq="YS", src_timestep="QS-DEC")


class TestMissingAnyFills:
def test_missing_days(self, tas_series):
Expand Down Expand Up @@ -144,6 +159,13 @@ def test_hourly(self, pr_hr_series):
out = missing.missing_any(pr, freq="MS")
np.testing.assert_array_equal(out, [True, False, True])

def test_seasonal(self, random):
n = 11
time = xr.cftime_range(start="2002-01-01", periods=n, freq="QS-JAN")
ts = xr.DataArray(random.random(n), dims="time", coords={"time": time})
out = missing.missing_any(ts, freq="YS")
np.testing.assert_array_equal(out, [False, False, True])


class TestMissingWMO:
def test_missing_days(self, tas_series):
Expand Down
53 changes: 53 additions & 0 deletions xclim/core/calendar.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"get_calendar",
"interp_calendar",
"max_doy",
"offset_is_divisor",
"parse_offset",
"percentile_doy",
"resample_doy",
Expand Down Expand Up @@ -834,6 +835,58 @@ def construct_offset(mult: int, base: str, start_anchored: bool, anchor: str | N
)


def offset_is_divisor(freqA: str, freqB: str):
aulemahal marked this conversation as resolved.
Show resolved Hide resolved
"""Check that freqA is a divisor of freqB.

A frequency is a "divisor" of another if a whole number of periods of the
former fit within a single period of the latter.

Parameters
----------
freqA: str
The divisor frequency.
freqB: str
The large frequency.

Returns
-------
bool

Examples
--------
>>> offset_is_divisor("QS-Jan", "YS")
True
>>> offset_is_divisor("QS-DEC", "AS-JUL")
False
>>> offset_is_divisor("D", "M")
True
"""
if compare_offsets(freqA, ">", freqB):
return False
# Reconstruct offsets anchored at the start of the period
# to have comparable quantities, also get "offset" objects
mA, bA, sA, aA = parse_offset(freqA)
offAs = pd.tseries.frequencies.to_offset(construct_offset(mA, bA, True, aA))

mB, bB, sB, aB = parse_offset(freqB)
offBs = pd.tseries.frequencies.to_offset(construct_offset(mB, bB, True, aB))
tB = pd.date_range("1970-01-01T00:00:00", freq=offBs, periods=13)

if bA in "WDHTLUN" or bB in "WDHTLUN":
# Simple length comparison is sufficient for submonthly freqs
# In case one of bA or bB is > W, we test many to be sure.
tA = pd.date_range("1970-01-01T00:00:00", freq=offAs, periods=13)
return np.all(
(np.diff(tB)[:, np.newaxis] / np.diff(tA)[np.newaxis, :]) % 1 == 0
)

# else, we test alignment with some real dates
# If both fall on offAs, then is means freqA is aligned with freqB at those dates
# if N=13 is True, then it is always True
# As freqA <= freqB, this means freqA is a "divisor" of freqB.
return all(offAs.is_on_offset(d) for d in tB)


def _interpolate_doy_calendar(
source: xr.DataArray, doy_max: int, doy_min: int = 1
) -> xr.DataArray:
Expand Down
117 changes: 76 additions & 41 deletions xclim/core/indicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1355,11 +1355,11 @@ def _show_deprecation_warning(self):
)


class ResamplingIndicator(Indicator):
"""Indicator that performs a resampling computation.
class CheckMissingIndicator(Indicator):
"""Class for indicators that completely reduce the time dimension, adding missing value checks.

Compared to the base Indicator, this adds the handling of missing data,
and the check of allowed periods.
A full reduction of the "time" dimension is expected by default: the missing step will fail if the output still has a time dimension.
To enable resampling, the :py:meth:`_get_missing_freq` method can be subclassed to return the resampling frequency. The method is always called with the indicator parameters.
aulemahal marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
Expand All @@ -1368,24 +1368,10 @@ class ResamplingIndicator(Indicator):
None, this will be determined by the global configuration (see `xclim.set_options`). Defaults to "from_context".
missing_options : dict, optional
Arguments to pass to the `missing` function. If None, this will be determined by the global configuration.
allowed_periods : Sequence[str], optional
A list of allowed periods, i.e. base parts of the `freq` parameter. For example, indicators meant to be
computed annually only will have `allowed_periods=["A"]`. `None` means "any period" or that the
indicator doesn't take a `freq` argument.
"""

missing = "from_context"
missing_options: dict | None = None
allowed_periods: list[str] | None = None

@classmethod
def _ensure_correct_parameters(cls, parameters):
if "freq" not in parameters:
raise ValueError(
"ResamplingIndicator require a 'freq' argument, use the base Indicator"
" class if your computation doesn't perform any resampling."
)
return super()._ensure_correct_parameters(parameters)

def __init__(self, **kwds):
if self.missing == "from_context" and self.missing_options is not None:
Expand All @@ -1401,23 +1387,6 @@ def __init__(self, **kwds):

super().__init__(**kwds)

def _preprocess_and_checks(self, das, params):
"""Perform parent's checks and also check if freq is allowed."""
das, params = super()._preprocess_and_checks(das, params)

# Check if the period is allowed:
if (
self.allowed_periods is not None
and parse_offset(params["freq"])[1] not in self.allowed_periods
):
raise ValueError(
f"Resampling frequency {params['freq']} is not allowed for indicator "
f"{self.identifier} (needs something equivalent to one "
f"of {self.allowed_periods})."
)

return das, params

def _history_string(self, **kwargs):
if self.missing == "from_context":
missing = OPTIONS[CHECK_MISSING]
Expand All @@ -1432,6 +1401,10 @@ def _history_string(self, **kwargs):

return super()._history_string(**kwargs) + opt_str

def _get_missing_freq(self, params):
"""Return the resampling frequency to be used in the missing values check."""
return None

def _postprocess(self, outs, das, params):
"""Masking of missing values."""
outs = super()._postprocess(outs, das, params)
Expand All @@ -1445,25 +1418,79 @@ def _postprocess(self, outs, das, params):

# We flag periods according to the missing method. skip variables without a time coordinate.
src_freq = self.src_freq if isinstance(self.src_freq, str) else None
freq = self._get_missing_freq(params)
miss = (
self._missing(
da, params["freq"], src_freq, options, params.get("indexer", {})
)
self._missing(da, freq, src_freq, options, params.get("indexer", {}))
for da in das.values()
if "time" in da.coords
)
# Reduce by or and broadcast to ensure the same length in time
# When indexing is used and there are no valid points in the last period, mask will not include it
mask = reduce(np.logical_or, miss)
if isinstance(mask, DataArray) and mask.time.size < outs[0].time.size:
if (
isinstance(mask, DataArray)
and "time" in mask.dims
and mask.time.size < outs[0].time.size
):
mask = mask.reindex(time=outs[0].time, fill_value=True)
outs = [out.where(~mask) for out in outs]

return outs


class ResamplingIndicatorWithIndexing(ResamplingIndicator):
"""Resampling indicator that also injects "indexer" kwargs to subset the inputs before computation."""
class ResamplingIndicator(CheckMissingIndicator):
"""Indicator that performs a resampling computation.

Compared to the base Indicator, this adds the handling of missing data,
and the check of allowed periods.

Parameters
----------
missing : {any, wmo, pct, at_least_n, skip, from_context}
The name of the missing value method. See `xclim.core.missing.MissingBase` to create new custom methods. If
None, this will be determined by the global configuration (see `xclim.set_options`). Defaults to "from_context".
missing_options : dict, optional
Arguments to pass to the `missing` function. If None, this will be determined by the global configuration.
allowed_periods : Sequence[str], optional
A list of allowed periods, i.e. base parts of the `freq` parameter. For example, indicators meant to be
computed annually only will have `allowed_periods=["A"]`. `None` means "any period" or that the
indicator doesn't take a `freq` argument.
"""

allowed_periods: list[str] | None = None

@classmethod
def _ensure_correct_parameters(cls, parameters):
if "freq" not in parameters:
raise ValueError(
"ResamplingIndicator require a 'freq' argument, use the base Indicator"
" class if your computation doesn't perform any resampling."
)
return super()._ensure_correct_parameters(parameters)

def _get_missing_freq(self, params):
return params["freq"]

def _preprocess_and_checks(self, das, params):
"""Perform parent's checks and also check if freq is allowed."""
das, params = super()._preprocess_and_checks(das, params)

# Check if the period is allowed:
if (
self.allowed_periods is not None
and parse_offset(params["freq"])[1] not in self.allowed_periods
):
raise ValueError(
f"Resampling frequency {params['freq']} is not allowed for indicator "
f"{self.identifier} (needs something equivalent to one "
f"of {self.allowed_periods})."
)

return das, params


class IndexingIndicator(Indicator):
"""Indicator that also injects "indexer" kwargs to subset the inputs before computation."""

@classmethod
def _injected_parameters(cls):
Expand Down Expand Up @@ -1492,6 +1519,12 @@ def _preprocess_and_checks(self, das: dict[str, DataArray], params: dict[str, An
return das, params


class ResamplingIndicatorWithIndexing(ResamplingIndicator, IndexingIndicator):
"""Resampling indicator that also injects "indexer" kwargs to subset the inputs before computation."""

pass


class Daily(ResamplingIndicator):
"""Class for daily inputs and resampling computes."""

Expand All @@ -1505,6 +1538,8 @@ class Hourly(ResamplingIndicator):


base_registry["Indicator"] = Indicator
base_registry["CheckMissingIndicator"] = CheckMissingIndicator
base_registry["IndexingIndicator"] = IndexingIndicator
base_registry["ResamplingIndicator"] = ResamplingIndicator
base_registry["ResamplingIndicatorWithIndexing"] = ResamplingIndicatorWithIndexing
base_registry["Hourly"] = Hourly
Expand Down
21 changes: 17 additions & 4 deletions xclim/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,13 @@
import numpy as np
import xarray as xr

from .calendar import date_range, get_calendar, select_time
from .calendar import (
date_range,
get_calendar,
offset_is_divisor,
parse_offset,
select_time,
)
from .options import (
CHECK_MISSING,
MISSING_METHODS,
Expand Down Expand Up @@ -106,8 +112,8 @@ def prepare(self, da, freq, src_timestep, **indexer):
freq : str
Resampling frequency defining the periods defined in
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling.
src_timestep : {"D", "H"}
Expected input frequency.
src_timestep : str
Expected input frequency. See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling.
\*\*indexer : {dim: indexer}, optional
Time attribute and values over which to subset the array. For example, use season='DJF' to select winter
values, month=1 to select January, or month=[6,7,8] to select summer months. If not indexer is given,
Expand Down Expand Up @@ -142,7 +148,14 @@ def prepare(self, da, freq, src_timestep, **indexer):
start_time = i[:1]
end_time = i[-1:]

if indexer or "M" in src_timestep:
if freq is not None and not offset_is_divisor(src_timestep, freq):
raise NotImplementedError(
"Missing checks not implemented for timeseries resampled to a frequency that is not "
f"aligned with the source timestep. {src_timestep} is not a divisor of {freq}."
)

offset = parse_offset(src_timestep)
if indexer or offset[1] in "YAQM":
# Create a full synthetic time series and compare the number of days with the original series.
t = date_range(
start_time[0],
Expand Down
Loading