From 789134c05b301751984844af006ec6ac921ca825 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 17 Jan 2018 00:27:38 +0900 Subject: [PATCH 01/73] Rolling_window for np.ndarray --- xarray/core/indexing.py | 13 ++++++++++++ xarray/core/nputils.py | 40 ++++++++++++++++++++++++++++++++++++ xarray/tests/test_nputils.py | 19 ++++++++++++++++- 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e06b045ad88..7a6f2245970 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -426,6 +426,11 @@ def __array__(self, dtype=None): key = BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) + def rolling(self, axis, window): + raise NotImplementedError('Rolling for {} is not implemented.' + 'Load your data first with ' + '.load() or .compute()'.format(type(self))) + def unwrap_explicit_indexer(key, target, allow): """Unwrap an explicit key into a tuple.""" @@ -815,6 +820,11 @@ def __setitem__(self, key, value): array, key = self._indexing_array_and_key(key) array[key] = value + def rolling(self, axis, window): + """ + """ + return nputils.rolling() + class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a dask array to support explicit indexing.""" @@ -923,3 +933,6 @@ def __getitem__(self, indexer): def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) + + def rolling(self, axis, window): + return NumpyIndexingAdapter(self.array.values).rolling(axis, window) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 8ac04752e85..543390609d9 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -133,3 +133,43 @@ def __setitem__(self, key, value): mixed_positions, vindex_positions = _advanced_indexer_subspaces(key) self._array[key] = np.moveaxis(value, vindex_positions, mixed_positions) + + +def rolling_window(a, window): + """ + Make an ndarray with a rolling window of the last dimension + + Parameters + ---------- + a : array_like + Array to add rolling window to + window : int + Size of rolling window + + Returns + ------- + Array that is a view of the original array with a added dimension + of size w. + + Examples + -------- + >>> x=np.arange(10).reshape((2,5)) + >>> np.rolling_window(x, 3) + array([[[0, 1, 2], [1, 2, 3], [2, 3, 4]], + [[5, 6, 7], [6, 7, 8], [7, 8, 9]]]) + + Calculate rolling mean of last dimension: + >>> np.mean(np.rolling_window(x, 3), -1) + array([[ 1., 2., 3.], + [ 6., 7., 8.]]) + + This function is taken from https://github.com/numpy/numpy/pull/31 + """ + if window < 1: + raise ValueError( + "`window` must be at least 1. Given : {}".format(window)) + if window > a.shape[-1]: + raise ValueError("`window` is too long. 
Given : {}".format(window)) + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 83445e4639f..24dcbf0bb0a 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -1,7 +1,9 @@ import numpy as np from numpy.testing import assert_array_equal +import pytest -from xarray.core.nputils import _is_contiguous, NumpyVIndexAdapter +from xarray.core.nputils import (_is_contiguous, NumpyVIndexAdapter, + rolling_window) def test_is_contiguous(): @@ -28,3 +30,18 @@ def test_vindex(): vindex[[0, 1], [0, 1], :] = vindex[[0, 1], [0, 1], :] vindex[[0, 1], :, [0, 1]] = vindex[[0, 1], :, [0, 1]] vindex[:, [0, 1], [0, 1]] = vindex[:, [0, 1], [0, 1]] + + +def test_rolling(): + x = np.array([0, 1, 2, 3, 4], dtype=float) + + actual = rolling_window(x, window=3) + expected = np.array([[0, 1, 2], + [1, 2, 3], + [2, 3, 4]], dtype=float) + assert_array_equal(actual, expected) + + x = np.stack([x, x * 1.1]) + actual = rolling_window(x, window=3) + expected = np.stack([expected, expected * 1.1], axis=0) + assert_array_equal(actual, expected) From fa4e85709a84922bea521bf28862f54ed6ca73a8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 17 Jan 2018 15:41:24 +0900 Subject: [PATCH 02/73] Add pad method to Variable --- xarray/core/indexing.py | 18 +++++++---- xarray/core/variable.py | 59 +++++++++++++++++++++++++++++++++++ xarray/tests/test_variable.py | 13 ++++++++ 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7a6f2245970..54021f1cb9f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -426,8 +426,8 @@ def __array__(self, dtype=None): key = BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) - def rolling(self, axis, window): - raise NotImplementedError('Rolling for {} is not implemented.' + def rolling_window(self, axis, window): + raise NotImplementedError('rolling_windows for {} is not implemented.' 'Load your data first with ' '.load() or .compute()'.format(type(self))) @@ -820,10 +820,15 @@ def __setitem__(self, key, value): array, key = self._indexing_array_and_key(key) array[key] = value - def rolling(self, axis, window): + def rolling_window(self, axis, window): """ + Make an ndarray with a rolling window of axis-th dimension. + The rolling dimension will be placed at the first dimension. """ - return nputils.rolling() + axis = nputils._validate_axis(self.array, axis) + return np.swap_dims( + nputils.rolling_window(np.swap_dims(self.array, axis, -1)), + -1, axis) class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): @@ -934,5 +939,6 @@ def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) - def rolling(self, axis, window): - return NumpyIndexingAdapter(self.array.values).rolling(axis, window) + def rolling_window(self, axis, window): + return NumpyIndexingAdapter(self.array.values).rolling_window( + axis, window) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d4863014f59..87d303ad7f6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,6 +936,37 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result + def _pad(self, value=np.nan, **pad_widths): + """ + Return a new Variable with paddings. 
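Note: the helper added in the first patch above is the core of the whole series. `as_strided` appends a window dimension as a zero-copy view by reusing the stride of the last axis. A self-contained restatement of that idea in plain NumPy (the name `rolling_view` is illustrative, not part of the patch):

    import numpy as np

    def rolling_view(a, window):
        # Append a window axis by repeating the last-axis stride; every
        # window is a view into `a`, so no data is copied.
        shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
        strides = a.strides + (a.strides[-1],)
        return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

    x = np.arange(10.0).reshape(2, 5)
    print(rolling_view(x, 3).shape)          # (2, 3, 3)
    print(rolling_view(x, 3).mean(axis=-1))  # [[1. 2. 3.] [6. 7. 8.]]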
+ + Parameters + ---------- + **pad_width: keyword arguments of the form {dim: (before, after)} + Number of values padded to the edges of each dimension. + + value: + Values to set the padded value. + """ + if isinstance(self.data, dask_array_type): + array = self.data + for d, pad in pad_widths.items(): + axis = self.get_axis_num(d) + before_shape = tuple(self.shape) + before_shape[axis] = pad[0] + after_shape = tuple(self.shape) + after_shape[axis] = pad[1] + array = da.concatenate([da.full(before_shape, value), + array, + da.full(after_shape, value)], + axis) + return array + else: + pads = [(0, 0) if d not in pad_widths else pad_widths[d] + for d in self.dims] + return np.pad(self.data, pads, mode='constant', + constant_values=value) + def _roll_one_dim(self, dim, count): axis = self.get_axis_num(dim) @@ -1456,6 +1487,34 @@ def rank(self, dim, pct=False): ranked /= count return Variable(self.dims, ranked) + def rolling_window(self, dim, window, window_dim): + """ + Make a rolling_window along dim and add a new_dim to the first place. + + Parameters + ---------- + dim: str + Dimension over which to compute rolling_window + window: int + Window size of the rolling + window_dim: str + New name of the rolling dimension. + + Returns + ------- + Variable that is a view of the original array with a added dimension of + size w + + Examples + -------- + >>> v=Variable(('a', 'b'), np.arange(10).reshape((2,5))) + >>> v.rolling_window(x, 'b', 3, 'window_dim') + + array([[[0, 1, 2], [1, 2, 3], [2, 3, 4]], + [[5, 6, 7], [6, 7, 8], [7, 8, 9]]]) + """ + + @property def real(self): return type(self)(self.dims, self.data.real, self._attrs) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5a89627a0f9..2e2bbe8611e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1585,6 +1585,19 @@ def assert_assigned_2d(array, key_x, key_y, values): expected = Variable(['x', 'y'], [[2, 3], [3, 4], [4, 5]]) assert_identical(v, expected) + def test_pad(self): + v = Variable(['x', 'y', 'z'], np.arange(4 * 3 * 2).reshape(4, 3, 2)) + + xr_args = [{'x': (2, 1)}, {'y': (0, 3)}, {'x': (3, 1), 'z': (2, 0)}] + np_args = [((2, 1), (0, 0), (0, 0)), ((0, 0), (0, 3), (0, 0)), + ((3, 1), (0, 0), (2, 0))] + for xr_arg, np_arg in zip(xr_args, np_args): + for value in [np.nan, 0]: + actual = v._pad(value=value, **xr_arg) + expected = np.pad(np.array(v.data), np_arg, mode='constant', + constant_values=value) + assert_array_equal(actual, expected) + @requires_dask class TestVariableWithDask(TestCase, VariableSubclassTestCases): From 52915f33145aaf9c441f3a9f577911a8b8a6c1ec Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Wed, 17 Jan 2018 20:31:04 +0900 Subject: [PATCH 03/73] Added rolling_window to DataArray and Dataset --- xarray/core/dataarray.py | 38 +++++++++++++++++++++++++++++++++++ xarray/core/dataset.py | 29 ++++++++++++++++++++++++++ xarray/core/indexing.py | 6 +++--- xarray/core/variable.py | 36 +++++++++++++++++++++++++-------- xarray/tests/test_dataset.py | 19 ++++++++++++++++++ xarray/tests/test_variable.py | 17 ++++++++++++++++ 6 files changed, 134 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8e1ec8ab7b8..40b00cef170 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2132,6 +2132,44 @@ def rank(self, dim, pct=False, keep_attrs=False): ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs) return self._from_temp_dataset(ds) + def rolling_window(self, dim, window, 
window_dim, center=True): + """ + Make a rolling_window along dim and add a new_dim to the last place. + + Parameters + ---------- + dim: str + Dimension over which to compute rolling_window + window: int + Window size of the rolling + window_dim: str + New name of the window dimension. + + Returns + ------- + DataArray that is a view of the original array with a added dimension + of size w + + Examples + -------- + >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) + + >>> da.rolling_window(x, 'b', 4, 'window_dim') + + array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], + [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) + Dimensions without coordinates: a, b, window_dim + + >>> da.rolling_window(x, 'b', 4, 'window_dim', center=True) + + array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], + [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) + Dimensions without coordinates: a, b, window_dim + """ + ds = self._to_temp_dataset().rolling_window(dim, window, window_dim, + center) + return self._from_temp_dataset(ds) + # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 62ad2b9b653..5cb75966045 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3399,6 +3399,35 @@ def rank(self, dim, pct=False, keep_attrs=False): attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) + def rolling_window(self, dim, window, window_dim, center=True): + """ + Make a rolling_window along dim of data_vars. + + Parameters + ---------- + dim: str + Dimension over which to compute rolling_window + window: int + Window size of the rolling + window_dim: str + New name of the window dimension. + + Returns + ------- + DataArray that is a view of the original array with a added dimension + of size w + + See also + -------- + DataArray.rolling_window + """ + variables = self._variables.copy() + for k, v in self._variables.items(): + if dim in v.dims: + variables[k] = v.rolling_window(dim, window, window_dim, + center) + return self._replace_vars_and_dims(variables) + @property def real(self): return self._unary_op(lambda x: x.real, keep_attrs=True)(self) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 54021f1cb9f..cf38ff3fc66 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -826,9 +826,9 @@ def rolling_window(self, axis, window): The rolling dimension will be placed at the first dimension. """ axis = nputils._validate_axis(self.array, axis) - return np.swap_dims( - nputils.rolling_window(np.swap_dims(self.array, axis, -1)), - -1, axis) + return np.swapaxes( + nputils.rolling_window(np.swapaxes(self.array, axis, -1), window), + -2, axis) class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 87d303ad7f6..08d5aa267d4 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1487,9 +1487,9 @@ def rank(self, dim, pct=False): ranked /= count return Variable(self.dims, ranked) - def rolling_window(self, dim, window, window_dim): + def rolling_window(self, dim, window, window_dim, center=False): """ - Make a rolling_window along dim and add a new_dim to the first place. + Make a rolling_window along dim and add a new_dim to the last place. 
Parameters ---------- @@ -1498,22 +1498,42 @@ def rolling_window(self, dim, window, window_dim): window: int Window size of the rolling window_dim: str - New name of the rolling dimension. + New name of the window dimension. + center: boolean. default False. + If True, pad np.nan for both ends. Otherwise, pad in the head of + the axis. Returns ------- Variable that is a view of the original array with a added dimension of - size w + size w. + The return dim: self.dims + (window_dim, ) + The return shape: self.shape + (window, ) Examples -------- - >>> v=Variable(('a', 'b'), np.arange(10).reshape((2,5))) + >>> v=Variable(('a', 'b'), np.arange(8).reshape((2,4))) >>> v.rolling_window(x, 'b', 3, 'window_dim') - - array([[[0, 1, 2], [1, 2, 3], [2, 3, 4]], - [[5, 6, 7], [6, 7, 8], [7, 8, 9]]]) + + array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], + [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) + + >>> v.rolling_window(x, 'b', 3, 'window_dim', center=True) + + array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], + [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) """ + new_dims = self.dims + (window_dim, ) + if center: + start = -int(-window / 2) + end = window - 1 - start + pads = (start, end) + else: + pads = (window - 1, 0) + array = self._pad(value=np.nan, **{dim: pads}) + return Variable(new_dims, as_indexable(array).rolling_window( + self.get_axis_num(dim), window=window)) @property def real(self): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 09d67613007..fa0e5ccf8f6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4141,6 +4141,25 @@ def test_rolling_pandas_compat(center, window, min_periods): ds_rolling['index']) +@pytest.mark.parametrize('center', (True, False)) +@pytest.mark.parametrize('window', (1, 2, 3, 4)) +def test_rolling_window_pandas_compat(center, window): + df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), + 'time': np.linspace(0, 1, 20)}) + + ds = Dataset.from_dataframe(df) + df_rolling = df.rolling(window, center=center, min_periods=1).mean() + ds_rolling = ds.rolling_window(dim='index', window=window, + window_dim='window', + center=center).mean('window') + # pandas does some fancy stuff in the last position, + # we're not going to do that yet! + np.testing.assert_allclose(df_rolling['x'].values[:-1], + ds_rolling['x'].values[:-1]) + np.testing.assert_allclose(df_rolling.index, + ds_rolling['index']) + + @pytest.mark.slow @pytest.mark.parametrize('ds', (1, 2), indirect=True) @pytest.mark.parametrize('center', (True, False)) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 2e2bbe8611e..c7e38171d2d 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1598,6 +1598,19 @@ def test_pad(self): constant_values=value) assert_array_equal(actual, expected) + def test_rolling_window(self): + # Just a working test. 
See test_nputils fot the algorithm validation + v = Variable(['x', 'y', 'z'], np.arange(40 * 30 * 2).reshape(40, 30, + 2)) + for (d, w) in [('x', 3), ('y', 5)]: + v_rolling = v.rolling_window(d, w, d + '_window') + assert v_rolling.dims == ('x', 'y', 'z', d + '_window') + assert v_rolling.shape == v.shape + (w, ) + + v_rolling = v.rolling_window(d, w, d + '_window', center=True) + assert v_rolling.dims == ('x', 'y', 'z', d + '_window') + assert v_rolling.shape == v.shape + (w, ) + @requires_dask class TestVariableWithDask(TestCase, VariableSubclassTestCases): @@ -1639,6 +1652,10 @@ def test_getitem_with_mask_nd_indexer(self): assert_identical(v._getitem_with_mask(indexer, fill_value=-1), self.cls(('x', 'y'), [[0, -1], [-1, 2]])) + @pytest.mark.xfail + def test_rolling_window(self): + super(TestVariableWithDask, self).test_rolling_window() + class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From b622007d052b2e2a2dd54173d171095d1a8e1e5a Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 18 Jan 2018 10:48:46 +0900 Subject: [PATCH 04/73] remove pad_value option. Support dask.rolling_window --- xarray/core/dataset.py | 2 +- xarray/core/indexing.py | 26 +++++++++++-- xarray/core/variable.py | 36 ++++++++++------- xarray/tests/test_dataset.py | 5 ++- xarray/tests/test_variable.py | 73 +++++++++++++++++++++-------------- 5 files changed, 92 insertions(+), 50 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5cb75966045..6563aff643e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3414,7 +3414,7 @@ def rolling_window(self, dim, window, window_dim, center=True): Returns ------- - DataArray that is a view of the original array with a added dimension + Dataset that is a view of the original array with a added dimension of size w See also diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index cf38ff3fc66..c8abbf08072 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -826,9 +826,10 @@ def rolling_window(self, axis, window): The rolling dimension will be placed at the first dimension. """ axis = nputils._validate_axis(self.array, axis) - return np.swapaxes( - nputils.rolling_window(np.swapaxes(self.array, axis, -1), window), - -2, axis) + rolling = nputils.rolling_window(np.swapaxes(self.array, axis, -1), + window) + rolling.setflags(write=False) + return np.swapaxes(rolling, -2, axis) class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): @@ -865,6 +866,25 @@ def __setitem__(self, key, value): 'into memory explicitly using the .load() ' 'method or accessing its .values attribute.') + def rolling_window(self, axis, window): + """ + Make an ndarray with a rolling window of axis-th dimension. + The rolling dimension will be placed at the first dimension. + """ + import dask.array as da + + if window < 1: + raise ValueError( + "`window` must be at least 1. Given : {}".format(window)) + if window > self.array.shape[axis]: + raise ValueError("`window` is too long. 
Given : {}".format(window)) + + axis = nputils._validate_axis(self.array, axis) + size = self.array.shape[axis] - window + 1 + arrays = [self.array[(slice(None), ) * axis + (slice(w, size + w), )] + for w in range(window)] + return da.stack(arrays, axis=-1) + class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 08d5aa267d4..16cd9a24410 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,7 +936,7 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result - def _pad(self, value=np.nan, **pad_widths): + def _pad(self, **pad_widths): """ Return a new Variable with paddings. @@ -944,28 +944,34 @@ def _pad(self, value=np.nan, **pad_widths): ---------- **pad_width: keyword arguments of the form {dim: (before, after)} Number of values padded to the edges of each dimension. - - value: - Values to set the padded value. """ + dtype, fill_value = dtypes.maybe_promote(self.dtype) + if isinstance(self.data, dask_array_type): array = self.data + for d, pad in pad_widths.items(): axis = self.get_axis_num(d) - before_shape = tuple(self.shape) + before_shape = list(array.shape) before_shape[axis] = pad[0] - after_shape = tuple(self.shape) + before_chunks = list(array.chunks) + before_chunks[axis] = (pad[0], ) + after_shape = list(array.shape) after_shape[axis] = pad[1] - array = da.concatenate([da.full(before_shape, value), - array, - da.full(after_shape, value)], - axis) - return array + after_chunks = list(array.chunks) + after_chunks[axis] = (pad[1], ) + array = duck_array_ops.concatenate([ + da.full(before_shape, fill_value, dtype=dtype, + chunks=before_chunks), + array, + da.full(after_shape, fill_value, dtype=dtype, + chunks=after_chunks)], axis=axis) else: pads = [(0, 0) if d not in pad_widths else pad_widths[d] for d in self.dims] - return np.pad(self.data, pads, mode='constant', - constant_values=value) + array = np.pad(self.data.astype(dtype), pads, mode='constant', + constant_values=fill_value) + return type(self)(self.dims, array) def _roll_one_dim(self, dim, count): axis = self.get_axis_num(dim) @@ -1531,8 +1537,8 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - array = self._pad(value=np.nan, **{dim: pads}) - return Variable(new_dims, as_indexable(array).rolling_window( + array = self._pad(**{dim: pads}) + return Variable(new_dims, as_indexable(array.data).rolling_window( self.get_axis_num(dim), window=window)) @property diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index fa0e5ccf8f6..c92a5fc0537 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4143,11 +4143,14 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -def test_rolling_window_pandas_compat(center, window): +@pytest.mark.parametrize('chunk', (False, True)) +def test_rolling_window_pandas_compat(center, window, chunk): df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), 'time': np.linspace(0, 1, 20)}) ds = Dataset.from_dataframe(df) + if chunk: + ds = ds.chunk({'index': 4}) df_rolling = df.rolling(window, center=center, min_periods=1).mean() ds_rolling = ds.rolling_window(dim='index', window=window, window_dim='window', diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 
c7e38171d2d..f75b0b9d986 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -727,6 +727,41 @@ def test_getitem_error(self): with raises_regex(IndexError, 'Dimensions of indexers mis'): v[:, ind] + def test_pad(self): + data = np.arange(4 * 3 * 2).reshape(4, 3, 2) + v = self.cls(['x', 'y', 'z'], data) + + xr_args = [{'x': (2, 1)}, {'y': (0, 3)}, {'x': (3, 1), 'z': (2, 0)}] + np_args = [((2, 1), (0, 0), (0, 0)), ((0, 0), (0, 3), (0, 0)), + ((3, 1), (0, 0), (2, 0))] + for xr_arg, np_arg in zip(xr_args, np_args): + actual = v._pad(**xr_arg) + expected = np.pad(np.array(v.data.astype(float)), np_arg, + mode='constant', constant_values=np.nan) + assert_array_equal(actual, expected) + + def test_rolling_window(self): + # Just a working test. See test_nputils fot the algorithm validation + v = self.cls(['x', 'y', 'z'], np.arange(40*30*2).reshape(40, 30, 2)) + for (d, w) in [('x', 3), ('y', 5)]: + v_rolling = v.rolling_window(d, w, d + '_window') + assert v_rolling.dims == ('x', 'y', 'z', d + '_window') + assert v_rolling.shape == v.shape + (w, ) + + v_rolling = v.rolling_window(d, w, d + '_window', center=True) + assert v_rolling.dims == ('x', 'y', 'z', d + '_window') + assert v_rolling.shape == v.shape + (w, ) + + # dask and numpy result should be the same + v_loaded = v.load().rolling_window(d, w, d + '_window', + center=True) + assert_array_equal(v_rolling, v_loaded) + + # numpy backend should not be over-written + if isinstance(v._data, np.ndarray): + with pytest.raises(ValueError): + v_loaded[0] = 1.0 + class TestVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(Variable) @@ -1585,32 +1620,6 @@ def assert_assigned_2d(array, key_x, key_y, values): expected = Variable(['x', 'y'], [[2, 3], [3, 4], [4, 5]]) assert_identical(v, expected) - def test_pad(self): - v = Variable(['x', 'y', 'z'], np.arange(4 * 3 * 2).reshape(4, 3, 2)) - - xr_args = [{'x': (2, 1)}, {'y': (0, 3)}, {'x': (3, 1), 'z': (2, 0)}] - np_args = [((2, 1), (0, 0), (0, 0)), ((0, 0), (0, 3), (0, 0)), - ((3, 1), (0, 0), (2, 0))] - for xr_arg, np_arg in zip(xr_args, np_args): - for value in [np.nan, 0]: - actual = v._pad(value=value, **xr_arg) - expected = np.pad(np.array(v.data), np_arg, mode='constant', - constant_values=value) - assert_array_equal(actual, expected) - - def test_rolling_window(self): - # Just a working test. 
See test_nputils fot the algorithm validation - v = Variable(['x', 'y', 'z'], np.arange(40 * 30 * 2).reshape(40, 30, - 2)) - for (d, w) in [('x', 3), ('y', 5)]: - v_rolling = v.rolling_window(d, w, d + '_window') - assert v_rolling.dims == ('x', 'y', 'z', d + '_window') - assert v_rolling.shape == v.shape + (w, ) - - v_rolling = v.rolling_window(d, w, d + '_window', center=True) - assert v_rolling.dims == ('x', 'y', 'z', d + '_window') - assert v_rolling.shape == v.shape + (w, ) - @requires_dask class TestVariableWithDask(TestCase, VariableSubclassTestCases): @@ -1652,10 +1661,6 @@ def test_getitem_with_mask_nd_indexer(self): assert_identical(v._getitem_with_mask(indexer, fill_value=-1), self.cls(('x', 'y'), [[0, -1], [-1, 2]])) - @pytest.mark.xfail - def test_rolling_window(self): - super(TestVariableWithDask, self).test_rolling_window() - class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) @@ -1754,6 +1759,14 @@ def test_getitem_fancy(self): def test_getitem_uint(self): super(TestIndexVariable, self).test_getitem_fancy() + @pytest.mark.xfail + def test_pad(self): + super(TestIndexVariable, self).test_rolling_window() + + @pytest.mark.xfail + def test_rolling_window(self): + super(TestIndexVariable, self).test_rolling_window() + class TestAsCompatibleData(TestCase): def test_unchanged_types(self): From 36a1fe9cb0dafae4abb78bf3e573920d109e1ba8 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 18 Jan 2018 17:46:45 +0900 Subject: [PATCH 05/73] Refactor rolling.reduce --- xarray/core/rolling.py | 19 ++++--------------- xarray/tests/test_nputils.py | 1 - 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 8209e70e5a8..9267a77c6d7 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -203,21 +203,10 @@ def reduce(self, func, **kwargs): Array with summarized data. """ - windows = [window.reduce(func, dim=self.dim, **kwargs) - for _, window in self] - - # Find valid windows based on count - if self.dim in self.obj.coords: - concat_dim = self.window_labels - else: - concat_dim = self.dim - counts = concat([window.count(dim=self.dim) for _, window in self], - dim=concat_dim) - result = concat(windows, dim=concat_dim) - # restore dim order - result = result.transpose(*self.obj.dims) - - result = result.where(counts >= self._min_periods) + windows = self.obj.rolling_window(self.dim, self.window, + '_rolling_window_dim', center=False) + windows = windows.reduce(func, dim='_rolling_window_dim', **kwargs) + result = windows.where(self._valid_windows) if self.center: result = self._center_result(result) diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 24dcbf0bb0a..3eb7c58157c 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -1,6 +1,5 @@ import numpy as np from numpy.testing import assert_array_equal -import pytest from xarray.core.nputils import (_is_contiguous, NumpyVIndexAdapter, rolling_window) From 71fed0fe2d8a3768fc3ccf50d5965c52c7fdd737 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 18 Jan 2018 22:54:44 +0900 Subject: [PATCH 06/73] add as_strided to npcompat. 
Tests added for reduce(np.nanmean) --- xarray/core/dataarray.py | 11 ++++++++--- xarray/core/dataset.py | 15 ++++++++++----- xarray/core/indexing.py | 5 ++--- xarray/core/npcompat.py | 13 +++++++++++++ xarray/core/nputils.py | 4 +++- xarray/core/rolling.py | 3 ++- xarray/tests/test_dataset.py | 12 ++++++++---- 7 files changed, 46 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 40b00cef170..c7e9e023d68 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2134,7 +2134,7 @@ def rank(self, dim, pct=False, keep_attrs=False): def rolling_window(self, dim, window, window_dim, center=True): """ - Make a rolling_window along dim and add a new_dim to the last place. + Make a sliding window along `dim` and stack along `new_dim`. Parameters ---------- @@ -2144,11 +2144,16 @@ def rolling_window(self, dim, window, window_dim, center=True): Window size of the rolling window_dim: str New name of the window dimension. + center : boolean, default False + Set the labels at the center of the window. Returns ------- - DataArray that is a view of the original array with a added dimension - of size w + DataArray that is a view of the original array. + + Note + ---- + The return array is not writeable. Examples -------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6563aff643e..b7174d76383 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3399,9 +3399,10 @@ def rank(self, dim, pct=False, keep_attrs=False): attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) - def rolling_window(self, dim, window, window_dim, center=True): + def rolling_window(self, dim, window, window_dim, center=False): """ - Make a rolling_window along dim of data_vars. + Make a sliding window along `dim` and stack along `new_dim`. + This only applies to data variables, not coordinate variables. Parameters ---------- @@ -3411,19 +3412,23 @@ def rolling_window(self, dim, window, window_dim, center=True): Window size of the rolling window_dim: str New name of the window dimension. + center : boolean, default False + Set the labels at the center of the window. Returns ------- - Dataset that is a view of the original array with a added dimension - of size w + Variables that is a view of the original data variables with a sliding + window applied. See also -------- DataArray.rolling_window + Dataset.rolling + DataArray.rolling """ variables = self._variables.copy() for k, v in self._variables.items(): - if dim in v.dims: + if dim in v.dims and k not in self._coord_names: variables[k] = v.rolling_window(dim, window, window_dim, center) return self._replace_vars_and_dims(variables) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c8abbf08072..74f268f9728 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -823,12 +823,11 @@ def __setitem__(self, key, value): def rolling_window(self, axis, window): """ Make an ndarray with a rolling window of axis-th dimension. - The rolling dimension will be placed at the first dimension. + The rolling dimension will be placed at the last dimension. """ axis = nputils._validate_axis(self.array, axis) rolling = nputils.rolling_window(np.swapaxes(self.array, axis, -1), window) - rolling.setflags(write=False) return np.swapaxes(rolling, -2, axis) @@ -869,7 +868,7 @@ def __setitem__(self, key, value): def rolling_window(self, axis, window): """ Make an ndarray with a rolling window of axis-th dimension. 
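Note: the `as_strided` backport added to npcompat in this patch exists so the window view can be returned read-only (`writeable=False` needs NumPy >= 1.12). That matters because the windows alias the parent array's memory; a minimal illustration, not taken from the patch:

    import numpy as np

    x = np.arange(5.0)
    # Three overlapping length-3 windows, all sharing x's buffer.
    windows = np.lib.stride_tricks.as_strided(
        x, shape=(3, 3), strides=(x.strides[0], x.strides[0]))

    x[1] = 99.0
    print(windows[0, 1], windows[1, 0])  # 99.0 99.0 -- both alias x[1]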
- The rolling dimension will be placed at the first dimension. + The rolling dimension will be placed at the last dimension. """ import dask.array as da diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index bbe7b745621..3ae58f9aeae 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -2,6 +2,19 @@ from __future__ import division from __future__ import print_function import numpy as np +from distutils.version import LooseVersion +import functools + + +if LooseVersion(np.__version__) < LooseVersion('1.12'): + def as_strided(x, shape=None, strides=None, subok=False, writeable=True): + array = np.lib.stride_tricks.as_strided(array, shape, strides, subok) + array.setflags(write=writeable) + return array + +else: + as_strided = np.lib.stride_tricks.as_strided + try: from numpy import nancumsum, nancumprod, flip diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 543390609d9..a6301baf254 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import warnings +from . import npcompat def _validate_axis(data, axis): @@ -172,4 +173,5 @@ def rolling_window(a, window): raise ValueError("`window` is too long. Given : {}".format(window)) shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + return npcompat.as_strided(a, shape=shape, strides=strides, + writeable=False) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 9267a77c6d7..3683b0a8851 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -204,7 +204,8 @@ def reduce(self, func, **kwargs): """ windows = self.obj.rolling_window(self.dim, self.window, - '_rolling_window_dim', center=False) + '_rolling_window_dim', + center=False) windows = windows.reduce(func, dim='_rolling_window_dim', **kwargs) result = windows.where(self._valid_windows) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c92a5fc0537..32b3cb2523e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4139,18 +4139,22 @@ def test_rolling_pandas_compat(center, window, min_periods): ds_rolling['x'].values[:-1]) np.testing.assert_allclose(df_rolling.index, ds_rolling['index']) + # does not use bottleneck + ds_rolling_np = ds.rolling(index=window, center=center, + min_periods=min_periods).reduce(np.nanmean) + np.testing.assert_allclose(df_rolling['x'].values[:-1], + ds_rolling_np['x'].values[:-1]) + np.testing.assert_allclose(df_rolling.index, + ds_rolling_np['index']) @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -@pytest.mark.parametrize('chunk', (False, True)) -def test_rolling_window_pandas_compat(center, window, chunk): +def test_rolling_window_pandas_compat(center, window): df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), 'time': np.linspace(0, 1, 20)}) ds = Dataset.from_dataframe(df) - if chunk: - ds = ds.chunk({'index': 4}) df_rolling = df.rolling(window, center=center, min_periods=1).mean() ds_rolling = ds.rolling_window(dim='index', window=window, window_dim='window', From 3960134e77aff352f16eaa664f7c398618c6fdb8 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Thu, 18 Jan 2018 23:55:02 +0900 Subject: [PATCH 07/73] Support boolean in maybe_promote --- xarray/core/dataarray.py | 1 + xarray/core/dataset.py | 1 + xarray/core/dtypes.py | 2 ++ xarray/core/indexing.py | 1 + 4 files changed, 5 
insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c7e9e023d68..9934b445874 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2171,6 +2171,7 @@ def rolling_window(self, dim, window, window_dim, center=True): [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) Dimensions without coordinates: a, b, window_dim """ + ds = self._to_temp_dataset().rolling_window(dim, window, window_dim, center) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b7174d76383..b2042859baa 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3426,6 +3426,7 @@ def rolling_window(self, dim, window, window_dim, center=False): Dataset.rolling DataArray.rolling """ + variables = self._variables.copy() for k, v in self._variables.items(): if dim in v.dims and k not in self._coord_names: diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index ccbe48edc32..e9dc8929442 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -34,6 +34,8 @@ def maybe_promote(dtype): fill_value = np.datetime64('NaT') elif np.issubdtype(dtype, np.timedelta64): fill_value = np.timedelta64('NaT') + elif dtype.kind == 'b': + fill_value = False else: dtype = object fill_value = np.nan diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 74f268f9728..3c78eb60789 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -825,6 +825,7 @@ def rolling_window(self, axis, window): Make an ndarray with a rolling window of axis-th dimension. The rolling dimension will be placed at the last dimension. """ + axis = nputils._validate_axis(self.array, axis) rolling = nputils.rolling_window(np.swapaxes(self.array, axis, -1), window) From 4bd38f33b3e3bb9bf94b91c22c1d5bcff8294f32 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Fri, 19 Jan 2018 15:10:25 +0900 Subject: [PATCH 08/73] move rolling_window into duck_array_op. Make DataArray.rolling_window public. --- xarray/core/dataarray.py | 8 +++---- xarray/core/dataset.py | 4 ++-- xarray/core/duck_array_ops.py | 22 ++++++++++++++++++++ xarray/core/indexing.py | 39 ----------------------------------- xarray/core/nputils.py | 20 ++++++++++++------ xarray/core/rolling.py | 34 ++++++++++++++++++++++++++---- xarray/core/variable.py | 8 +++---- xarray/tests/test_nputils.py | 4 ++-- xarray/tests/test_variable.py | 2 +- 9 files changed, 79 insertions(+), 62 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9934b445874..dcb2c0b6be9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2132,7 +2132,7 @@ def rank(self, dim, pct=False, keep_attrs=False): ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs) return self._from_temp_dataset(ds) - def rolling_window(self, dim, window, window_dim, center=True): + def _rolling_window(self, dim, window, window_dim, center=True): """ Make a sliding window along `dim` and stack along `new_dim`. 
@@ -2171,9 +2171,9 @@ def rolling_window(self, dim, window, window_dim, center=True): [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) Dimensions without coordinates: a, b, window_dim """ - - ds = self._to_temp_dataset().rolling_window(dim, window, window_dim, - center) + + ds = self._to_temp_dataset()._rolling_window(dim, window, window_dim, + center) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b2042859baa..e6400bd0af3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3399,7 +3399,7 @@ def rank(self, dim, pct=False, keep_attrs=False): attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) - def rolling_window(self, dim, window, window_dim, center=False): + def _rolling_window(self, dim, window, window_dim, center=False): """ Make a sliding window along `dim` and stack along `new_dim`. This only applies to data variables, not coordinate variables. @@ -3426,7 +3426,7 @@ def rolling_window(self, dim, window, window_dim, center=False): Dataset.rolling DataArray.rolling """ - + variables = self._variables.copy() for k, v in self._variables.items(): if dim in v.dims and k not in self._coord_names: diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 2f97de4a1ba..4e3703f274e 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -16,6 +16,7 @@ import pandas as pd from . import npcompat +from . import nputils from . import dtypes from .pycompat import dask_array_type from .nputils import nanfirst, nanlast @@ -263,3 +264,24 @@ def last(values, axis, skipna=None): _fail_on_dask_array_input_skipna(values) return nanlast(values, axis) return take(values, -1, axis=axis) + + +def rolling_window(array, axis, window): + """ + Make an ndarray with a rolling window of axis-th dimension. + The rolling dimension will be placed at the last dimension. + """ + if isinstance(array, dask_array_type): + if window < 1: + raise ValueError( + "`window` must be at least 1. Given : {}".format(window)) + if window > array.shape[axis]: + raise ValueError("`window` is too long. Given : {}".format(window)) + + axis = nputils._validate_axis(array, axis) + size = array.shape[axis] - window + 1 + arrays = [array[(slice(None), ) * axis + (slice(w, size + w), )] + for w in range(window)] + return da.stack(arrays, axis=-1) + else: # np.ndarray + return nputils.rolling_window(array, axis, window) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 3c78eb60789..e06b045ad88 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -426,11 +426,6 @@ def __array__(self, dtype=None): key = BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) - def rolling_window(self, axis, window): - raise NotImplementedError('rolling_windows for {} is not implemented.' - 'Load your data first with ' - '.load() or .compute()'.format(type(self))) - def unwrap_explicit_indexer(key, target, allow): """Unwrap an explicit key into a tuple.""" @@ -820,17 +815,6 @@ def __setitem__(self, key, value): array, key = self._indexing_array_and_key(key) array[key] = value - def rolling_window(self, axis, window): - """ - Make an ndarray with a rolling window of axis-th dimension. - The rolling dimension will be placed at the last dimension. 
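Note: dask arrays cannot be re-strided in place, so the dask branch of `duck_array_ops.rolling_window` above builds the window axis by stacking `window` shifted slices instead. Roughly the same idea rendered in plain NumPy (function name illustrative):

    import numpy as np

    def rolling_by_stacking(array, axis, window):
        # Stack `window` shifted slices along a new trailing axis; the result
        # matches the strided view, at the cost of copying the data.
        axis = axis % array.ndim
        size = array.shape[axis] - window + 1
        slices = [array[(slice(None),) * axis + (slice(w, size + w),)]
                  for w in range(window)]
        return np.stack(slices, axis=-1)

    x = np.arange(10.0).reshape(2, 5)
    print(rolling_by_stacking(x, axis=1, window=3).shape)  # (2, 3, 3)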
- """ - - axis = nputils._validate_axis(self.array, axis) - rolling = nputils.rolling_window(np.swapaxes(self.array, axis, -1), - window) - return np.swapaxes(rolling, -2, axis) - class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a dask array to support explicit indexing.""" @@ -866,25 +850,6 @@ def __setitem__(self, key, value): 'into memory explicitly using the .load() ' 'method or accessing its .values attribute.') - def rolling_window(self, axis, window): - """ - Make an ndarray with a rolling window of axis-th dimension. - The rolling dimension will be placed at the last dimension. - """ - import dask.array as da - - if window < 1: - raise ValueError( - "`window` must be at least 1. Given : {}".format(window)) - if window > self.array.shape[axis]: - raise ValueError("`window` is too long. Given : {}".format(window)) - - axis = nputils._validate_axis(self.array, axis) - size = self.array.shape[axis] - window + 1 - arrays = [self.array[(slice(None), ) * axis + (slice(w, size + w), )] - for w in range(window)] - return da.stack(arrays, axis=-1) - class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" @@ -958,7 +923,3 @@ def __getitem__(self, indexer): def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) - - def rolling_window(self, axis, window): - return NumpyIndexingAdapter(self.array.values).rolling_window( - axis, window) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index a6301baf254..25a118ce523 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -136,14 +136,16 @@ def __setitem__(self, key, value): mixed_positions) -def rolling_window(a, window): +def rolling_window(a, axis, window): """ - Make an ndarray with a rolling window of the last dimension + Make an ndarray with a rolling window along axis. Parameters ---------- a : array_like Array to add rolling window to + axis: int + axis position along which rolling window will be applied. window : int Size of rolling window @@ -155,23 +157,29 @@ def rolling_window(a, window): Examples -------- >>> x=np.arange(10).reshape((2,5)) - >>> np.rolling_window(x, 3) + >>> np.rolling_window(x, 3, axis=-1) array([[[0, 1, 2], [1, 2, 3], [2, 3, 4]], [[5, 6, 7], [6, 7, 8], [7, 8, 9]]]) Calculate rolling mean of last dimension: - >>> np.mean(np.rolling_window(x, 3), -1) + >>> np.mean(np.rolling_window(x, 3, axis=-1), -1) array([[ 1., 2., 3.], [ 6., 7., 8.]]) This function is taken from https://github.com/numpy/numpy/pull/31 + but slightly modified to accept axis option. """ + axis = _validate_axis(a, axis) + a = np.swapaxes(a, axis, -1) + if window < 1: raise ValueError( "`window` must be at least 1. Given : {}".format(window)) if window > a.shape[-1]: raise ValueError("`window` is too long. 
Given : {}".format(window)) + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) - return npcompat.as_strided(a, shape=shape, strides=strides, - writeable=False) + rolling = npcompat.as_strided(a, shape=shape, strides=strides, + writeable=False) + return np.swapaxes(rolling, -2, axis) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 3683b0a8851..a162e9efe1d 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -7,7 +7,6 @@ from .pycompat import OrderedDict, zip, dask_array_type from .common import full_like -from .combine import concat from .ops import (inject_bottleneck_rolling_methods, inject_datasetrolling_methods, has_bottleneck, bn) from .dask_array_ops import dask_rolling_wrapper @@ -95,6 +94,34 @@ def __repr__(self): def __len__(self): return self.obj.sizes[self.dim] + def construct(self, new_dim, center=None): + """ + Make an array object with rolling and stack along `new_dim`. + This only applies to only data variables, but not coordinate variables. + + Parameters + ---------- + window_dim: str + New name of the window dimension. + center: None or boolean. + If True, directly construct centered array. + If None, self.center will be used. + + Returns + ------- + Variables that is a view of the original data variables with a sliding + window applied. + + See also + -------- + DataArray.rolling_window + Dataset.rolling + DataArray.rolling + """ + center = self.center if center is None else center + return self.obj._rolling_window(self.dim, self.window, new_dim, + center) + class DataArrayRolling(Rolling): """ @@ -203,9 +230,8 @@ def reduce(self, func, **kwargs): Array with summarized data. """ - windows = self.obj.rolling_window(self.dim, self.window, - '_rolling_window_dim', - center=False) + windows = self.construct('_rolling_window_dim', + center=False) windows = windows.reduce(func, dim='_rolling_window_dim', **kwargs) result = windows.where(self._valid_windows) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 16cd9a24410..0485a0839b1 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,7 +936,7 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result - def _pad(self, **pad_widths): + def _pad_with_fill_value(self, **pad_widths): """ Return a new Variable with paddings. 
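Note: the padding step leans on `dtypes.maybe_promote` (extended for booleans in the previous patch): integer input has to be promoted to a floating dtype before NaN can be used as the fill value, while boolean input keeps its dtype and is padded with False. The integer case in plain NumPy:

    import numpy as np

    a = np.arange(4)                      # int64 cannot represent NaN
    padded = np.pad(a.astype(float), (2, 0), mode="constant",
                    constant_values=np.nan)
    print(padded)                         # [nan nan  0.  1.  2.  3.]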
@@ -1537,9 +1537,9 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - array = self._pad(**{dim: pads}) - return Variable(new_dims, as_indexable(array.data).rolling_window( - self.get_axis_num(dim), window=window)) + array = self._pad_with_fill_value(**{dim: pads}) + return Variable(new_dims, duck_array_ops.rolling_window( + array.data, self.get_axis_num(dim), window=window)) @property def real(self): diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 3eb7c58157c..9821b0c0ad3 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -34,13 +34,13 @@ def test_vindex(): def test_rolling(): x = np.array([0, 1, 2, 3, 4], dtype=float) - actual = rolling_window(x, window=3) + actual = rolling_window(x, axis=-1, window=3) expected = np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4]], dtype=float) assert_array_equal(actual, expected) x = np.stack([x, x * 1.1]) - actual = rolling_window(x, window=3) + actual = rolling_window(x, axis=-1, window=3) expected = np.stack([expected, expected * 1.1], axis=0) assert_array_equal(actual, expected) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f75b0b9d986..b0c611c872a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -735,7 +735,7 @@ def test_pad(self): np_args = [((2, 1), (0, 0), (0, 0)), ((0, 0), (0, 3), (0, 0)), ((3, 1), (0, 0), (2, 0))] for xr_arg, np_arg in zip(xr_args, np_args): - actual = v._pad(**xr_arg) + actual = v._pad_with_fill_value(**xr_arg) expected = np.pad(np.array(v.data.astype(float)), np_arg, mode='constant', constant_values=np.nan) assert_array_equal(actual, expected) From af8362ed3e59957c116b8217a43f48498ad99da0 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Fri, 19 Jan 2018 23:11:52 +0900 Subject: [PATCH 09/73] Added to_dataarray and to_dataset to rolling object. --- xarray/core/dataarray.py | 44 ----------------- xarray/core/dataset.py | 35 -------------- xarray/core/npcompat.py | 3 +- xarray/core/rolling.py | 87 ++++++++++++++++++++++++++++++---- xarray/tests/test_dataarray.py | 1 + xarray/tests/test_dataset.py | 5 +- 6 files changed, 83 insertions(+), 92 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index dcb2c0b6be9..8e1ec8ab7b8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2132,50 +2132,6 @@ def rank(self, dim, pct=False, keep_attrs=False): ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs) return self._from_temp_dataset(ds) - def _rolling_window(self, dim, window, window_dim, center=True): - """ - Make a sliding window along `dim` and stack along `new_dim`. - - Parameters - ---------- - dim: str - Dimension over which to compute rolling_window - window: int - Window size of the rolling - window_dim: str - New name of the window dimension. - center : boolean, default False - Set the labels at the center of the window. - - Returns - ------- - DataArray that is a view of the original array. - - Note - ---- - The return array is not writeable. 
- - Examples - -------- - >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) - - >>> da.rolling_window(x, 'b', 4, 'window_dim') - - array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], - [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) - Dimensions without coordinates: a, b, window_dim - - >>> da.rolling_window(x, 'b', 4, 'window_dim', center=True) - - array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], - [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) - Dimensions without coordinates: a, b, window_dim - """ - - ds = self._to_temp_dataset()._rolling_window(dim, window, window_dim, - center) - return self._from_temp_dataset(ds) - # priority most be higher than Variable to properly work with binary ufuncs ops.inject_all_ops_and_reduce_methods(DataArray, priority=60) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e6400bd0af3..62ad2b9b653 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3399,41 +3399,6 @@ def rank(self, dim, pct=False, keep_attrs=False): attrs = self.attrs if keep_attrs else None return self._replace_vars_and_dims(variables, coord_names, attrs=attrs) - def _rolling_window(self, dim, window, window_dim, center=False): - """ - Make a sliding window along `dim` and stack along `new_dim`. - This only applies to data variables, not coordinate variables. - - Parameters - ---------- - dim: str - Dimension over which to compute rolling_window - window: int - Window size of the rolling - window_dim: str - New name of the window dimension. - center : boolean, default False - Set the labels at the center of the window. - - Returns - ------- - Variables that is a view of the original data variables with a sliding - window applied. - - See also - -------- - DataArray.rolling_window - Dataset.rolling - DataArray.rolling - """ - - variables = self._variables.copy() - for k, v in self._variables.items(): - if dim in v.dims and k not in self._coord_names: - variables[k] = v.rolling_window(dim, window, window_dim, - center) - return self._replace_vars_and_dims(variables) - @property def real(self): return self._unary_op(lambda x: x.real, keep_attrs=True)(self) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 3ae58f9aeae..576241feea3 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -3,12 +3,11 @@ from __future__ import print_function import numpy as np from distutils.version import LooseVersion -import functools if LooseVersion(np.__version__) < LooseVersion('1.12'): def as_strided(x, shape=None, strides=None, subok=False, writeable=True): - array = np.lib.stride_tricks.as_strided(array, shape, strides, subok) + array = np.lib.stride_tricks.as_strided(x, shape, strides, subok) array.setflags(write=writeable) return array diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index a162e9efe1d..f2793050f64 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -118,6 +118,7 @@ def construct(self, new_dim, center=None): Dataset.rolling DataArray.rolling """ + center = self.center if center is None else center return self.obj._rolling_window(self.dim, self.window, new_dim, center) @@ -211,6 +212,48 @@ def _center_result(self, result): shift = (-self.window // 2) + 1 return result.shift(**{self.dim: shift}) + def to_dataarray(self, window_dim): + """ + Convert this rolling object to xr.DataArray, + where the window dimension is stacked as a new dimension + + Parameters + ---------- + window_dim: str + New name of the window dimension. 
+ + Returns + ------- + DataArray that is a view of the original array. + + Note + ---- + The return array is not writeable. + + Examples + -------- + >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) + + >>> da.rolling_window(x, 'b', 4, 'window_dim') + + array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], + [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) + Dimensions without coordinates: a, b, window_dim + + >>> da.rolling_window(x, 'b', 4, 'window_dim', center=True) + + array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], + [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) + Dimensions without coordinates: a, b, window_dim + """ + + from .dataarray import DataArray + + window = self.obj.variable.rolling_window(self.dim, self.window, + window_dim, self.center) + return DataArray(window, dims=self.obj.dims + (window_dim,), + coords=self.obj.coords) + def reduce(self, func, **kwargs): """Reduce the items in this group by applying `func` along some dimension(s). @@ -230,15 +273,18 @@ def reduce(self, func, **kwargs): Array with summarized data. """ - windows = self.construct('_rolling_window_dim', - center=False) - windows = windows.reduce(func, dim='_rolling_window_dim', **kwargs) - result = windows.where(self._valid_windows) + windows = self.to_dataarray('_rolling_window_dim') + result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) - if self.center: - result = self._center_result(result) - - return result + # Find valid windows based on count. + # We do not use `reduced.count()` because it constructs a larger array + # (notice that `windows` is just a view) + counts = (~self.obj.isnull()).rolling( + center=self.center, **{self.dim: self.window}).to_dataarray( + '_rolling_window_dim').sum(dim='_rolling_window_dim') + result = result.where(counts >= self._min_periods) + # restore dim order + return result.transpose(*self.obj.dims) @classmethod def _reduce_method(cls, func): @@ -389,6 +435,31 @@ def wrapped_func(self, **kwargs): return Dataset(reduced, coords=self.obj.coords) return wrapped_func + def to_dataset(self, window_dim): + """ + Convert this rolling object to xr.Dataset, + where the window dimension is stacked as a new dimension + + Parameters + ---------- + window_dim: str + New name of the window dimension. + + Returns + ------- + Dataset with variables converted from rolling object. 
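Note: with the window dimension materialised by `to_dataarray`, `reduce` becomes pad, view, aggregate, then mask windows that contain fewer than `min_periods` valid values. The same steps for a 1-D array in plain NumPy (helper name and the `min_periods` default are illustrative):

    import numpy as np

    def rolling_nanmean(x, window, min_periods=1):
        # 1. pad the head with NaN, 2. take a strided window view,
        # 3. aggregate over the window axis, 4. apply the min_periods mask.
        padded = np.concatenate([np.full(window - 1, np.nan), x.astype(float)])
        view = np.lib.stride_tricks.as_strided(
            padded, shape=(x.size, window), strides=(padded.strides[0],) * 2)
        counts = (~np.isnan(view)).sum(axis=-1)
        means = np.nanmean(view, axis=-1)
        return np.where(counts >= min_periods, means, np.nan)

    print(rolling_nanmean(np.arange(5.0), window=3))
    # [0.  0.5 1.  2.  3.]  (matches pandas rolling(3, min_periods=1).mean())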
+ """ + + from .dataset import Dataset + + dataset = OrderedDict() + for key, da in self.obj.data_vars.items(): + if self.dim in da.dims: + dataset[key] = self.rollings[key].to_dataarray(window_dim) + else: + dataset[key] = da + return Dataset(dataset, coords=self.obj.coords) + inject_bottleneck_rolling_methods(DataArrayRolling) inject_datasetrolling_methods(DatasetRolling) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5524b4c971d..59fde11b7c3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3296,6 +3296,7 @@ def test_rolling_iter(da): for i, (label, window_da) in enumerate(rolling_obj): assert label == da['time'].isel(time=i) + # TODO valid label seems different from that used in reduce def test_rolling_doc(da): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 32b3cb2523e..a190e5765d6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4156,9 +4156,8 @@ def test_rolling_window_pandas_compat(center, window): ds = Dataset.from_dataframe(df) df_rolling = df.rolling(window, center=center, min_periods=1).mean() - ds_rolling = ds.rolling_window(dim='index', window=window, - window_dim='window', - center=center).mean('window') + ds_rolling = ds.rolling(index=window, + center=center).to_dataset('window').mean('window') # pandas does some fancy stuff in the last position, # we're not going to do that yet! np.testing.assert_allclose(df_rolling['x'].values[:-1], From 76db6b598432f139946d17eac46f42cf30fc49d3 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 20 Jan 2018 10:06:42 +0900 Subject: [PATCH 10/73] Use pad in rolling to make compatible to pandas. Expose pad_with_fill_value to public. --- xarray/core/rolling.py | 15 ++++++++++----- xarray/core/variable.py | 4 ++-- xarray/tests/test_dataarray.py | 9 +++------ xarray/tests/test_dataset.py | 25 ++++++------------------- xarray/tests/test_variable.py | 2 +- 5 files changed, 22 insertions(+), 33 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index f2793050f64..fc6827a4972 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -316,19 +316,24 @@ def wrapped_func(self, **kwargs): axis = self.obj.get_axis_num(self.dim) - if isinstance(self.obj.data, dask_array_type): + padded = self.obj.variable + if self.center: + shift = (-self.window // 2) + 1 + padded = padded.pad_with_fill_value(**{self.dim: (0, -shift)}) + valid = (slice(None), ) * axis + (slice(-shift, None), ) + + if isinstance(padded.data, dask_array_type): values = dask_rolling_wrapper(func, self.obj.data, window=self.window, min_count=min_count, axis=axis) else: - values = func(self.obj.data, window=self.window, + values = func(padded.data, window=self.window, min_count=min_count, axis=axis) - result = DataArray(values, self.obj.coords) - if self.center: - result = self._center_result(result) + values = values[valid] + result = DataArray(values, self.obj.coords) return result return wrapped_func diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0485a0839b1..e34937eb025 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,7 +936,7 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result - def _pad_with_fill_value(self, **pad_widths): + def pad_with_fill_value(self, **pad_widths): """ Return a new Variable with paddings. 
@@ -1537,7 +1537,7 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - array = self._pad_with_fill_value(**{dim: pads}) + array = self.pad_with_fill_value(**{dim: pads}) return Variable(new_dims, duck_array_ops.rolling_window( array.data, self.get_axis_num(dim), window=window)) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 59fde11b7c3..c60155ba4b6 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3377,12 +3377,9 @@ def test_rolling_pandas_compat(da, center, window, min_periods): min_periods=min_periods).mean() da_rolling = da.rolling(index=window, center=center, min_periods=min_periods).mean() - # pandas does some fancy stuff in the last position, - # we're not going to do that yet! - np.testing.assert_allclose(s_rolling.values[:-1], - da_rolling.values[:-1]) - np.testing.assert_allclose(s_rolling.index, - da_rolling['index']) + + np.testing.assert_allclose(s_rolling.values, da_rolling.values) + np.testing.assert_allclose(s_rolling.index, da_rolling['index']) @pytest.mark.parametrize('da', (1, 2), indirect=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a190e5765d6..816e2166bc7 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4133,19 +4133,9 @@ def test_rolling_pandas_compat(center, window, min_periods): min_periods=min_periods).mean() ds_rolling = ds.rolling(index=window, center=center, min_periods=min_periods).mean() - # pandas does some fancy stuff in the last position, - # we're not going to do that yet! - np.testing.assert_allclose(df_rolling['x'].values[:-1], - ds_rolling['x'].values[:-1]) - np.testing.assert_allclose(df_rolling.index, - ds_rolling['index']) - # does not use bottleneck - ds_rolling_np = ds.rolling(index=window, center=center, - min_periods=min_periods).reduce(np.nanmean) - np.testing.assert_allclose(df_rolling['x'].values[:-1], - ds_rolling_np['x'].values[:-1]) - np.testing.assert_allclose(df_rolling.index, - ds_rolling_np['index']) + + np.testing.assert_allclose(df_rolling['x'].values, ds_rolling['x'].values) + np.testing.assert_allclose(df_rolling.index, ds_rolling['index']) @pytest.mark.parametrize('center', (True, False)) @@ -4158,12 +4148,9 @@ def test_rolling_window_pandas_compat(center, window): df_rolling = df.rolling(window, center=center, min_periods=1).mean() ds_rolling = ds.rolling(index=window, center=center).to_dataset('window').mean('window') - # pandas does some fancy stuff in the last position, - # we're not going to do that yet! 
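The behavioural change exercised by these tests is that a centred rolling aggregation now agrees with pandas at every position, including the tail. A quick cross-check along the same lines, with throwaway data:

    import numpy as np
    import pandas as pd
    import xarray as xr

    s = pd.Series(np.arange(10.0))
    da = xr.DataArray.from_series(s)

    pd_mean = s.rolling(3, center=True, min_periods=1).mean()
    xr_mean = da.rolling(index=3, center=True, min_periods=1).mean()
    np.testing.assert_allclose(pd_mean.values, xr_mean.values)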
- np.testing.assert_allclose(df_rolling['x'].values[:-1], - ds_rolling['x'].values[:-1]) - np.testing.assert_allclose(df_rolling.index, - ds_rolling['index']) + + np.testing.assert_allclose(df_rolling['x'].values, ds_rolling['x'].values) + np.testing.assert_allclose(df_rolling.index, ds_rolling['index']) @pytest.mark.slow diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index b0c611c872a..9de1d4a4091 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -735,7 +735,7 @@ def test_pad(self): np_args = [((2, 1), (0, 0), (0, 0)), ((0, 0), (0, 3), (0, 0)), ((3, 1), (0, 0), (2, 0))] for xr_arg, np_arg in zip(xr_args, np_args): - actual = v._pad_with_fill_value(**xr_arg) + actual = v.pad_with_fill_value(**xr_arg) expected = np.pad(np.array(v.data.astype(float)), np_arg, mode='constant', constant_values=np.nan) assert_array_equal(actual, expected) From 87f53af6fc211e992d073f06a8e04ba008d355d3 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 12:22:17 +0900 Subject: [PATCH 11/73] Refactor rolling --- xarray/core/rolling.py | 65 +++------------------------------- xarray/tests/test_dataarray.py | 13 ++++++- 2 files changed, 16 insertions(+), 62 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index fc6827a4972..3e5498b4d8c 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -94,35 +94,6 @@ def __repr__(self): def __len__(self): return self.obj.sizes[self.dim] - def construct(self, new_dim, center=None): - """ - Make an array object with rolling and stack along `new_dim`. - This only applies to only data variables, but not coordinate variables. - - Parameters - ---------- - window_dim: str - New name of the window dimension. - center: None or boolean. - If True, directly construct centered array. - If None, self.center will be used. - - Returns - ------- - Variables that is a view of the original data variables with a sliding - window applied. 
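A small usage sketch of the iteration behaviour touched by this refactor: each step yields the window label and a view of the data, and windows with fewer than ``min_periods`` valid points come back masked to NaN rather than raising. The array is invented.

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(5.0), dims='time',
                      coords={'time': np.arange(5)})
    for label, window in da.rolling(time=3, min_periods=2):
        # the first window holds a single point, so it is masked to NaN
        print(label.values, window.values)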
- - See also - -------- - DataArray.rolling_window - Dataset.rolling - DataArray.rolling - """ - - center = self.center if center is None else center - return self.obj._rolling_window(self.dim, self.window, new_dim, - center) - class DataArrayRolling(Rolling): """ @@ -155,29 +126,18 @@ class DataArrayRolling(Rolling): def __init__(self, obj, min_periods=None, center=False, **windows): super(DataArrayRolling, self).__init__(obj, min_periods=min_periods, center=center, **windows) - self._windows = None - self._valid_windows = None self.window_indices = None self.window_labels = None self._setup_windows() - @property - def windows(self): - if self._windows is None: - self._windows = OrderedDict(zip(self.window_labels, - self.window_indices)) - return self._windows - def __iter__(self): - for (label, indices, valid) in zip(self.window_labels, - self.window_indices, - self._valid_windows): - + min_periods = self.min_periods if self.min_periods else self.window + for (label, indices) in zip(self.window_labels, self.window_indices): window = self.obj.isel(**{self.dim: indices}) - if not valid: - window = full_like(window, fill_value=True, dtype=bool) + counts = window.count(dim=self.dim) + window = window.where(counts >= self._min_periods) yield (label, window) @@ -185,33 +145,16 @@ def _setup_windows(self): """ Find the indices and labels for each window """ - from .dataarray import DataArray - self.window_labels = self.obj[self.dim] - window = int(self.window) - dim_size = self.obj[self.dim].size stops = np.arange(dim_size) + 1 starts = np.maximum(stops - window, 0) - if self._min_periods > 1: - valid_windows = (stops - starts) >= self._min_periods - else: - # No invalid windows - valid_windows = np.ones(dim_size, dtype=bool) - self._valid_windows = DataArray(valid_windows, dims=(self.dim, ), - coords=self.obj[self.dim].coords) - self.window_indices = [slice(start, stop) for start, stop in zip(starts, stops)] - def _center_result(self, result): - """center result""" - shift = (-self.window // 2) + 1 - return result.shift(**{self.dim: shift}) - def to_dataarray(self, window_dim): """ Convert this rolling object to xr.DataArray, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c60155ba4b6..d76392b2004 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3287,16 +3287,27 @@ def da_dask(seed=123): return da +@pytest.mark.parametrize('da', (1, 2), indirect=True) def test_rolling_iter(da): rolling_obj = da.rolling(time=7) + rolling_obj_mean = rolling_obj.mean() assert len(rolling_obj.window_labels) == len(da['time']) assert_identical(rolling_obj.window_labels, da['time']) for i, (label, window_da) in enumerate(rolling_obj): assert label == da['time'].isel(time=i) - # TODO valid label seems different from that used in reduce + + actual = rolling_obj_mean.isel(time=i) + expected = window_da.mean('time') + + # TODO add assert_allclose_with_nan, which compares nan position + # same nan position + assert_array_equal(actual.isnull(), expected.isnull()) + if (~actual.isnull()).sum() > 0: + np.allclose(actual.values[actual.values.nonzero()], + expected.values[expected.values.nonzero()]) def test_rolling_doc(da): From c23cedbc8e5377eaa35a537a73c08944ce7fd08c Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 13:02:07 +0900 Subject: [PATCH 12/73] flake8 --- xarray/core/rolling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 3e5498b4d8c..94d53a5fa07 100644 --- 
a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -6,7 +6,6 @@ from distutils.version import LooseVersion from .pycompat import OrderedDict, zip, dask_array_type -from .common import full_like from .ops import (inject_bottleneck_rolling_methods, inject_datasetrolling_methods, has_bottleneck, bn) from .dask_array_ops import dask_rolling_wrapper @@ -132,7 +131,6 @@ def __init__(self, obj, min_periods=None, center=False, **windows): self._setup_windows() def __iter__(self): - min_periods = self.min_periods if self.min_periods else self.window for (label, indices) in zip(self.window_labels, self.window_indices): window = self.obj.isel(**{self.dim: indices}) From 9547c577293d430c2691732bb95590daadf1e115 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 16:06:21 +0900 Subject: [PATCH 13/73] Added a comment for dask's pad. --- xarray/core/variable.py | 2 ++ xarray/tests/test_dataarray.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e34937eb025..b67e900c35f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -950,6 +950,8 @@ def pad_with_fill_value(self, **pad_widths): if isinstance(self.data, dask_array_type): array = self.data + # Dask does not yet support pad. We manually implement it. + # https://github.com/dask/dask/issues/1926 for d, pad in pad_widths.items(): axis = self.get_axis_num(d) before_shape = list(array.shape) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d76392b2004..deb8878c5a9 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3303,7 +3303,7 @@ def test_rolling_iter(da): expected = window_da.mean('time') # TODO add assert_allclose_with_nan, which compares nan position - # same nan position + # as well as the closeness of the values. assert_array_equal(actual.isnull(), expected.isnull()) if (~actual.isnull()).sum() > 0: np.allclose(actual.values[actual.values.nonzero()], From 1f71cff372e01fd0b78b6e51cd03af193f007d26 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 16:51:27 +0900 Subject: [PATCH 14/73] Use fastpath in rolling.to_dataarray --- xarray/core/rolling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 94d53a5fa07..c04f802e454 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -192,8 +192,7 @@ def to_dataarray(self, window_dim): window = self.obj.variable.rolling_window(self.dim, self.window, window_dim, self.center) - return DataArray(window, dims=self.obj.dims + (window_dim,), - coords=self.obj.coords) + return DataArray(window, coords=self.obj.coords, fastpath=True) def reduce(self, func, **kwargs): """Reduce the items in this group by applying `func` along some From 73862eb65804e074f897c785e26c69cd1d69e6b5 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 17:27:28 +0900 Subject: [PATCH 15/73] Doc added. --- doc/computation.rst | 19 +++++++++++++++---- doc/whats-new.rst | 7 +++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index 420b97923d7..b7b75e69431 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -158,13 +158,11 @@ Aggregation and summary methods can be applied directly to the ``Rolling`` objec r.mean() r.reduce(np.std) -Note that rolling window aggregations are much faster (both asymptotically and -because they avoid a loop in Python) when bottleneck_ is installed. 
Otherwise, -we fall back to a slower, pure Python implementation. +Note that rolling window aggregations are faster when bottleneck_ is installed. .. _bottleneck: https://github.com/kwgoodman/bottleneck/ -Finally, we can manually iterate through ``Rolling`` objects: +We can also manually iterate through ``Rolling`` objects: .. ipython:: python @@ -172,6 +170,19 @@ Finally, we can manually iterate through ``Rolling`` objects: for label, arr_window in r: # arr_window is a view of x +Finally, the rolling object has ``to_dataarray`` method, which gives a +view of the original ``DataArray`` with the windowed dimension is attached to +the last position. +You can use this for more advanced rolling operations, such as strided rolling, +windowed rolling, convolution and short-time FFT. + +.. ipython:: python + + rolling_da = r.to_dataarray('window_dim') + rolling_da + # rolling mean for every 2 points + rolling_da.isel(y=slice(None, None, 2)).mean('window_dim') + .. _compute.broadcasting: Broadcasting by dimension name diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ab3d9733856..a9425c9bb5a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,6 +27,13 @@ Documentation Enhancements ~~~~~~~~~~~~ +- Improve :py:func:`~xarray.DataArray.rooling` logic for speed up. + :py:func:`~xarray.DataArrayRolling` object now support ``to_dataarray`` + method that returns a view of the DataArray object with the rolling-window + dimension added to the last position. This enables more flexible operation, + such as strided rolling, windowed rolling, ND-rolling, and convolution. + (:issue:`1831`, :issue:`1142`, :issue:`819`) + By `Keisuke Fujii `_. - Added nodatavals attribute to DataArray when using :py:func:`~xarray.open_rasterio`. (:issue:`1736`). By `Alan Snow `_. From 859bb5cffd9a9b75b64987dd14356cc812a0004f Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 20 Jan 2018 17:50:45 +0900 Subject: [PATCH 16/73] Revert not to use fastpath --- xarray/core/rolling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index c04f802e454..94d53a5fa07 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -192,7 +192,8 @@ def to_dataarray(self, window_dim): window = self.obj.variable.rolling_window(self.dim, self.window, window_dim, self.center) - return DataArray(window, coords=self.obj.coords, fastpath=True) + return DataArray(window, dims=self.obj.dims + (window_dim,), + coords=self.obj.coords) def reduce(self, func, **kwargs): """Reduce the items in this group by applying `func` along some From 05c72f0b1b470348bafceca3ccd5fde69393fc0d Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 11:20:10 +0900 Subject: [PATCH 17/73] Remove maybe_prompt for Boolean. Some improvements based on @shoyer's review. 
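To make the "more advanced rolling operations" promise in the documentation concrete, a short self-contained sketch follows. It relies only on the windowed view added in this series; the window length, weights and data are arbitrary.

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.random.randn(2, 100), dims=('a', 'time'),
                      coords={'time': np.arange(100)})
    windows = da.rolling(time=5).to_dataarray('window_dim')

    # strided rolling mean: evaluate the window only at every fifth point
    strided = windows.isel(time=slice(None, None, 5)).mean('window_dim')

    # a crude windowed convolution: weight positions inside each window
    weights = xr.DataArray(np.hanning(5), dims='window_dim')
    smoothed = (windows * weights).sum('window_dim')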
--- doc/computation.rst | 14 ++++++++++---- doc/whats-new.rst | 6 +++++- xarray/core/dtypes.py | 2 -- xarray/core/rolling.py | 15 +++++++-------- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index b7b75e69431..1dd278f4e8e 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -170,19 +170,25 @@ We can also manually iterate through ``Rolling`` objects: for label, arr_window in r: # arr_window is a view of x -Finally, the rolling object has ``to_dataarray`` method, which gives a -view of the original ``DataArray`` with the windowed dimension is attached to +Finally, the rolling object has ``to_dataarray`` method +(``to_dataset`` method for Rolling objects from Dataset), which gives a +view of the original ``DataArray`` with the windowed dimension attached to the last position. You can use this for more advanced rolling operations, such as strided rolling, -windowed rolling, convolution and short-time FFT. +windowed rolling, convolution, short-time FFT, etc. .. ipython:: python rolling_da = r.to_dataarray('window_dim') rolling_da - # rolling mean for every 2 points + # rolling mean with 2-point stride rolling_da.isel(y=slice(None, None, 2)).mean('window_dim') +Note that although the ``DataArray`` obtained by +``r.to_dataarray('window_dim')`` has an additional dimension, +it does not consume too much memory as it is just a view of +the original array. + .. _compute.broadcasting: Broadcasting by dimension name diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8ae8397e51d..6c568a319c4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,7 +33,7 @@ Enhancements dimension added to the last position. This enables more flexible operation, such as strided rolling, windowed rolling, ND-rolling, and convolution. (:issue:`1831`, :issue:`1142`, :issue:`819`) -- reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype`` +- reduce methods such as :py:func:`DataArray.sum()` now accept ``dtype`` arguments. (:issue:`1838`) By `Keisuke Fujii `_. - Added nodatavals attribute to DataArray when using :py:func:`~xarray.open_rasterio`. (:issue:`1736`). @@ -74,6 +74,10 @@ Enhancements Bug fixes ~~~~~~~~~ +- Rolling aggregation with ``center=True`` option now gives the same result + with pandas including the last element (:issue:`1046`). + By `Keisuke Fujii `_. + - Added warning in api.py of a netCDF4 bug that occurs when the filepath has 88 characters (:issue:`1745`). By `Liam Brannigan ` _. diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index e9dc8929442..ccbe48edc32 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -34,8 +34,6 @@ def maybe_promote(dtype): fill_value = np.datetime64('NaT') elif np.issubdtype(dtype, np.timedelta64): fill_value = np.timedelta64('NaT') - elif dtype.kind == 'b': - fill_value = False else: dtype = object fill_value = np.nan diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 94d53a5fa07..e6ea8c5d4e7 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -218,14 +218,13 @@ def reduce(self, func, **kwargs): result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) # Find valid windows based on count. 
- # We do not use `reduced.count()` because it constructs a larger array - # (notice that `windows` is just a view) - counts = (~self.obj.isnull()).rolling( - center=self.center, **{self.dim: self.window}).to_dataarray( - '_rolling_window_dim').sum(dim='_rolling_window_dim') - result = result.where(counts >= self._min_periods) - # restore dim order - return result.transpose(*self.obj.dims) + # The following workaround is equivalent to `windows.count()` + # but avoids to consume too much memory by using a view. + counts = ((~self.obj.isnull()).astype(float) + .rolling(center=self.center, **{self.dim: self.window}) + .to_dataarray('_rolling_window_dim') + .sum(dim='_rolling_window_dim')) + return result.where(counts > self._min_periods - 0.5) @classmethod def _reduce_method(cls, func): From d55e498eaeb7d3216910989822a16b55febaa9a7 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 13:53:50 +0900 Subject: [PATCH 18/73] Update test. --- xarray/tests/test_dataarray.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a6c0a55fbe6..50a1f5fd82e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3406,9 +3406,13 @@ def test_rolling_pandas_compat(da, center, window, min_periods): min_periods=min_periods).mean() da_rolling = da.rolling(index=window, center=center, min_periods=min_periods).mean() + da_rolling_np = da.rolling(index=window, center=center, + min_periods=min_periods).reduce(np.nanmean) np.testing.assert_allclose(s_rolling.values, da_rolling.values) np.testing.assert_allclose(s_rolling.index, da_rolling['index']) + np.testing.assert_allclose(s_rolling.values, da_rolling_np.values) + np.testing.assert_allclose(s_rolling.index, da_rolling_np['index']) @pytest.mark.parametrize('da', (1, 2), indirect=True) From 9393eb2ef74b807f7f3ee19a7c8aae7f0dde2b62 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 15:18:48 +0900 Subject: [PATCH 19/73] Bug fix in test_rolling_count_correct --- xarray/core/ops.py | 16 ++-------------- xarray/core/rolling.py | 11 +++++++---- xarray/tests/test_dataarray.py | 2 +- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/xarray/core/ops.py b/xarray/core/ops.py index d02b8fa3108..2edc0dba644 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -227,20 +227,8 @@ def func(self, *args, **kwargs): def rolling_count(rolling): - not_null = rolling.obj.notnull() - instance_attr_dict = {'center': rolling.center, - 'min_periods': rolling.min_periods, - rolling.dim: rolling.window} - rolling_count = not_null.rolling(**instance_attr_dict).sum() - - if rolling.min_periods is None: - return rolling_count - - # otherwise we need to filter out points where there aren't enough periods - # but not_null is False, and so the NaNs don't flow through - # array with points where there are enough values given min_periods - enough_periods = rolling_count >= rolling.min_periods - + rolling_count = rolling._counts() + enough_periods = rolling_count > rolling._min_periods - 0.5 return rolling_count.where(enough_periods) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index e6ea8c5d4e7..2dec0c43ebc 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -218,13 +218,16 @@ def reduce(self, func, **kwargs): result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) # Find valid windows based on count. 
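Written out by hand, the counting trick adjusted here looks roughly like the following; the data is made up. Rolling-summing the not-null mask over the stacked window dimension tells how many real values each window saw, and that count is then compared against ``min_periods``.

    import numpy as np
    import xarray as xr

    da = xr.DataArray([0.0, np.nan, 2.0, 3.0, np.nan, 5.0], dims='time',
                      coords={'time': np.arange(6)})
    notnull = da.notnull().astype(float)
    counts = (notnull.rolling(time=3)
                     .to_dataarray('_window')
                     .sum('_window'))

    windows = da.rolling(time=3).to_dataarray('_window')
    masked_mean = windows.mean('_window').where(counts >= 2)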
- # The following workaround is equivalent to `windows.count()` - # but avoids to consume too much memory by using a view. - counts = ((~self.obj.isnull()).astype(float) + counts = self._counts() + return result.where(counts >= self._min_periods) + + def _counts(self): + """ Number of non-nan entries in each rolling window. """ + counts = (self.obj.notnull().astype(float) .rolling(center=self.center, **{self.dim: self.window}) .to_dataarray('_rolling_window_dim') .sum(dim='_rolling_window_dim')) - return result.where(counts > self._min_periods - 0.5) + return counts @classmethod def _reduce_method(cls, func): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 50a1f5fd82e..21607ad0d98 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3448,7 +3448,7 @@ def test_rolling_count_correct(): result = da.rolling(time=11, min_periods=None).count() expected = DataArray( [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan, 8], dims='time') + np.nan, np.nan, np.nan, np.nan, np.nan], dims='time') assert_equal(result, expected) result = da.rolling(time=7, min_periods=2).count() From 9c71a507c122e5b49eb56f2f95eb99d833b8321b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 15:45:18 +0900 Subject: [PATCH 20/73] fill_value for boolean array --- xarray/core/rolling.py | 2 +- xarray/core/variable.py | 5 ++++- xarray/tests/test_variable.py | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 2dec0c43ebc..9708cf17136 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -223,7 +223,7 @@ def reduce(self, func, **kwargs): def _counts(self): """ Number of non-nan entries in each rolling window. """ - counts = (self.obj.notnull().astype(float) + counts = (self.obj.notnull() .rolling(center=self.center, **{self.dim: self.window}) .to_dataarray('_rolling_window_dim') .sum(dim='_rolling_window_dim')) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b67e900c35f..7aa5aff88e5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -945,7 +945,10 @@ def pad_with_fill_value(self, **pad_widths): **pad_width: keyword arguments of the form {dim: (before, after)} Number of values padded to the edges of each dimension. """ - dtype, fill_value = dtypes.maybe_promote(self.dtype) + if self.dtype.kind == 'b': + dtype, fill_value = self.dtype, False + else: + dtype, fill_value = dtypes.maybe_promote(self.dtype) if isinstance(self.data, dask_array_type): array = self.data diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 9de1d4a4091..4b16c2fa7aa 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -740,6 +740,15 @@ def test_pad(self): mode='constant', constant_values=np.nan) assert_array_equal(actual, expected) + # for the boolean array, we pad False + data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) + v = self.cls(['x', 'y', 'z'], data) + for xr_arg, np_arg in zip(xr_args, np_args): + actual = v.pad_with_fill_value(**xr_arg) + expected = np.pad(np.array(v.data), np_arg, + mode='constant', constant_values=False) + assert_array_equal(actual, expected) + def test_rolling_window(self): # Just a working test. 
See test_nputils fot the algorithm validation v = self.cls(['x', 'y', 'z'], np.arange(40*30*2).reshape(40, 30, 2)) From 54975b49dfc79bb2eaf6d55abeb93cf0f4b4c0f0 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 16:04:59 +0900 Subject: [PATCH 21/73] rolling_window(array, axis, window) -> rolling_window(array, window, axis) --- xarray/core/duck_array_ops.py | 4 ++-- xarray/core/nputils.py | 2 +- xarray/core/variable.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index d60392ae16e..a92190db5c8 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -268,7 +268,7 @@ def last(values, axis, skipna=None): return take(values, -1, axis=axis) -def rolling_window(array, axis, window): +def rolling_window(array, window, axis=-1): """ Make an ndarray with a rolling window of axis-th dimension. The rolling dimension will be placed at the last dimension. @@ -286,4 +286,4 @@ def rolling_window(array, axis, window): for w in range(window)] return da.stack(arrays, axis=-1) else: # np.ndarray - return nputils.rolling_window(array, axis, window) + return nputils.rolling_window(array, window, axis=axis) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 25a118ce523..3e872082d39 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -136,7 +136,7 @@ def __setitem__(self, key, value): mixed_positions) -def rolling_window(a, axis, window): +def rolling_window(a, window, axis=-1): """ Make an ndarray with a rolling window along axis. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7aa5aff88e5..f78c50b7598 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1544,7 +1544,7 @@ def rolling_window(self, dim, window, window_dim, center=False): array = self.pad_with_fill_value(**{dim: pads}) return Variable(new_dims, duck_array_ops.rolling_window( - array.data, self.get_axis_num(dim), window=window)) + array.data, axis=self.get_axis_num(dim), window=window)) @property def real(self): From e907fdf29024d0fdd3fba678f3ec4e94525b03ea Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 18:35:49 +0900 Subject: [PATCH 22/73] support stride in rolling.to_dataarray --- xarray/core/ops.py | 2 +- xarray/core/rolling.py | 22 +++++++++++++++------- xarray/core/variable.py | 11 ++++++----- xarray/tests/test_dataarray.py | 25 ++++++++++++++++++++++++- xarray/tests/test_dataset.py | 19 +++++++++++++------ xarray/tests/test_variable.py | 2 +- 6 files changed, 60 insertions(+), 21 deletions(-) diff --git a/xarray/core/ops.py b/xarray/core/ops.py index 2edc0dba644..956868760f8 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -228,7 +228,7 @@ def func(self, *args, **kwargs): def rolling_count(rolling): rolling_count = rolling._counts() - enough_periods = rolling_count > rolling._min_periods - 0.5 + enough_periods = rolling_count >= rolling._min_periods return rolling_count.where(enough_periods) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 9708cf17136..844bdef5fda 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -153,7 +153,7 @@ def _setup_windows(self): self.window_indices = [slice(start, stop) for start, stop in zip(starts, stops)] - def to_dataarray(self, window_dim): + def to_dataarray(self, window_dim, stride=None): """ Convert this rolling object to xr.DataArray, where the window dimension is stacked as a new dimension @@ -162,6 +162,8 @@ def to_dataarray(self, window_dim): 
---------- window_dim: str New name of the window dimension. + stride: integer, optional + size of stride for the rolling window. Returns ------- @@ -175,13 +177,15 @@ def to_dataarray(self, window_dim): -------- >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) - >>> da.rolling_window(x, 'b', 4, 'window_dim') + >>> rolling = da.rolling(a=3) + >>> rolling.to_datarray('window_dim') array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) Dimensions without coordinates: a, b, window_dim - >>> da.rolling_window(x, 'b', 4, 'window_dim', center=True) + >>> rolling = da.rolling(a=3, center=True) + >>> rolling.to_datarray('window_dim') array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) @@ -192,8 +196,9 @@ def to_dataarray(self, window_dim): window = self.obj.variable.rolling_window(self.dim, self.window, window_dim, self.center) - return DataArray(window, dims=self.obj.dims + (window_dim,), - coords=self.obj.coords) + result = DataArray(window, dims=self.obj.dims + (window_dim,), + coords=self.obj.coords) + return result.isel(**{self.dim: slice(None, None, stride)}) def reduce(self, func, **kwargs): """Reduce the items in this group by applying `func` along some @@ -383,7 +388,7 @@ def wrapped_func(self, **kwargs): return Dataset(reduced, coords=self.obj.coords) return wrapped_func - def to_dataset(self, window_dim): + def to_dataset(self, window_dim, stride=None): """ Convert this rolling object to xr.Dataset, where the window dimension is stacked as a new dimension @@ -392,6 +397,8 @@ def to_dataset(self, window_dim): ---------- window_dim: str New name of the window dimension. + stride: integer, optional + size of stride for the rolling window. Returns ------- @@ -406,7 +413,8 @@ def to_dataset(self, window_dim): dataset[key] = self.rollings[key].to_dataarray(window_dim) else: dataset[key] = da - return Dataset(dataset, coords=self.obj.coords) + return Dataset(dataset, coords=self.obj.coords).isel( + **{self.dim: slice(None, None, stride)}) inject_bottleneck_rolling_methods(DataArrayRolling) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f78c50b7598..a6a4a6a1b21 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,7 +936,7 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result - def pad_with_fill_value(self, **pad_widths): + def pad_with_fill_value(self, fill_value=None, **pad_widths): """ Return a new Variable with paddings. @@ -945,10 +945,10 @@ def pad_with_fill_value(self, **pad_widths): **pad_width: keyword arguments of the form {dim: (before, after)} Number of values padded to the edges of each dimension. 
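A short usage sketch for the ``fill_value`` argument documented above, mirroring the boolean case in the test suite; the variable is invented. Leaving ``fill_value`` at its default promotes the dtype and pads with NaN, while an explicit value keeps the original dtype, which the boolean rolling code relies on.

    import numpy as np
    from xarray import Variable

    v = Variable(('x',), np.array([True, False, True]))
    # an explicit False keeps the padded result boolean instead of
    # promoting it for a NaN fill
    padded = v.pad_with_fill_value(fill_value=False, x=(1, 1))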
""" - if self.dtype.kind == 'b': - dtype, fill_value = self.dtype, False - else: + if fill_value is None: dtype, fill_value = dtypes.maybe_promote(self.dtype) + else: + dtype = self.dtype if isinstance(self.data, dask_array_type): array = self.data @@ -1542,7 +1542,8 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - array = self.pad_with_fill_value(**{dim: pads}) + fill_value = False if self.dtype.kind == 'b' else None + array = self.pad_with_fill_value(fill_value=fill_value, **{dim: pads}) return Variable(new_dims, duck_array_ops.rolling_window( array.data, axis=self.get_axis_num(dim), window=window)) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 21607ad0d98..e6b0eccff22 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3395,7 +3395,7 @@ def test_rolling_wrapped_bottleneck_dask(da_dask, name, center, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -def test_rolling_pandas_compat(da, center, window, min_periods): +def test_rolling_pandas_compat(center, window, min_periods): s = pd.Series(range(10)) da = DataArray.from_series(s) @@ -3415,6 +3415,29 @@ def test_rolling_pandas_compat(da, center, window, min_periods): np.testing.assert_allclose(s_rolling.index, da_rolling_np['index']) +@pytest.mark.parametrize('center', (True, False)) +@pytest.mark.parametrize('window', (1, 2, 3, 4)) +def test_rolling_to_dataarray(center, window): + df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), + 'time': np.linspace(0, 1, 20)}) + + s = pd.Series(range(10)) + da = DataArray.from_series(s) + + s_rolling = s.rolling(window, center=center, min_periods=1).mean() + da_rolling = da.rolling(index=window, center=center, min_periods=1) + + da_rolling_mean = da_rolling.to_dataarray('window').mean('window') + np.testing.assert_allclose(s_rolling.values, da_rolling_mean.values) + np.testing.assert_allclose(s_rolling.index, da_rolling_mean['index']) + + # with stride + da_rolling_mean = da_rolling.to_dataarray('window', + stride=2).mean('window') + np.testing.assert_allclose(s_rolling.values[::2], da_rolling_mean.values) + np.testing.assert_allclose(s_rolling.index[::2], da_rolling_mean['index']) + + @pytest.mark.parametrize('da', (1, 2), indirect=True) @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 816e2166bc7..24537dcaa0e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4140,17 +4140,24 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -def test_rolling_window_pandas_compat(center, window): +def test_rolling_to_dataset(center, window): df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), 'time': np.linspace(0, 1, 20)}) ds = Dataset.from_dataframe(df) df_rolling = df.rolling(window, center=center, min_periods=1).mean() - ds_rolling = ds.rolling(index=window, - center=center).to_dataset('window').mean('window') - - np.testing.assert_allclose(df_rolling['x'].values, ds_rolling['x'].values) - np.testing.assert_allclose(df_rolling.index, ds_rolling['index']) + ds_rolling = ds.rolling(index=window, center=center) + + ds_rolling_mean = 
ds_rolling.to_dataset('window').mean('window') + np.testing.assert_allclose(df_rolling['x'].values, + ds_rolling_mean['x'].values) + np.testing.assert_allclose(df_rolling.index, ds_rolling_mean['index']) + + # with stride + ds_rolling_mean = ds_rolling.to_dataset('window', stride=2).mean('window') + np.testing.assert_allclose(df_rolling['x'][::2].values, + ds_rolling_mean['x'].values) + np.testing.assert_allclose(df_rolling.index[::2], ds_rolling_mean['index']) @pytest.mark.slow diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 4b16c2fa7aa..0c7c6ad034b 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -744,7 +744,7 @@ def test_pad(self): data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) v = self.cls(['x', 'y', 'z'], data) for xr_arg, np_arg in zip(xr_args, np_args): - actual = v.pad_with_fill_value(**xr_arg) + actual = v.pad_with_fill_value(fill_value=False, **xr_arg) expected = np.pad(np.array(v.data), np_arg, mode='constant', constant_values=False) assert_array_equal(actual, expected) From 6482536baa69d2bfaf6dd6f3c929cef50f61e62c Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 19:36:32 +0900 Subject: [PATCH 23/73] flake8 --- xarray/tests/test_dataarray.py | 3 --- xarray/tests/test_dataset.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e6b0eccff22..fb6d1fcfe93 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3418,9 +3418,6 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) def test_rolling_to_dataarray(center, window): - df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), - 'time': np.linspace(0, 1, 20)}) - s = pd.Series(range(10)) da = DataArray.from_series(s) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 24537dcaa0e..afd4e0a29c4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4147,7 +4147,7 @@ def test_rolling_to_dataset(center, window): ds = Dataset.from_dataframe(df) df_rolling = df.rolling(window, center=center, min_periods=1).mean() ds_rolling = ds.rolling(index=window, center=center) - + ds_rolling_mean = ds_rolling.to_dataset('window').mean('window') np.testing.assert_allclose(df_rolling['x'].values, ds_rolling_mean['x'].values) From b8def4fb9e71389d8edbca5db7df856583e0e9d9 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 22:56:42 +0900 Subject: [PATCH 24/73] Improve doc. Add DataArrayRolling to api.rst --- doc/api.rst | 29 +++++++++++++++++++++ xarray/core/common.py | 8 +++--- xarray/core/rolling.py | 58 ++++++++++++++++++++++-------------------- 3 files changed, 65 insertions(+), 30 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 10386fe3a9b..8cf76a1a704 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -467,6 +467,35 @@ DataArray methods DataArray.load DataArray.chunk +Rolling objects +=============== + +.. 
autosummary:: + :toctree: generated/ + + core.rolling.DataArrayRolling + core.rolling.DataArrayRolling.to_dataarray + core.rolling.DataArrayRolling.reduce + core.rolling.DatasetRolling + core.rolling.DatasetRolling.to_dataset + core.rolling.DatasetRolling.reduce + +**Aggregation**: +:py:attr:`core.rolling.DataArrayRolling.argmax` +:py:attr:`core.rolling.DataArrayRolling.argmin` +:py:attr:`core.rolling.DataArrayRolling.max` +:py:attr:`core.rolling.DataArrayRolling.min` +:py:attr:`core.rolling.DataArrayRolling.mean` +:py:attr:`core.rolling.DataArrayRolling.prod` +:py:attr:`core.rolling.DataArrayRolling.sum` +:py:attr:`core.rolling.DataArrayRolling.std` +:py:attr:`core.rolling.DataArrayRolling.var` +:py:attr:`core.rolling.DataArrayRolling.median` +:py:attr:`core.rolling.DataArrayRolling.cumsum` +:py:attr:`core.rolling.DataArrayRolling.cumprod` +:py:attr:`core.rolling.DataArrayRolling.count` + + Plotting ======== diff --git a/xarray/core/common.py b/xarray/core/common.py index 1366d0ff03d..a48e08100ae 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -471,9 +471,6 @@ def rolling(self, min_periods=None, center=False, **windows): """ Rolling window object. - Rolling window aggregations are much faster when bottleneck is - installed. - Parameters ---------- min_periods : int, default None @@ -519,6 +516,11 @@ def rolling(self, min_periods=None, center=False, **windows): array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]) Coordinates: * time (time) datetime64[ns] 2000-02-15 2000-03-15 2000-04-15 ... + + See Also + -------- + core.rolling.DataArrayRolling + core.rolling.DatasetRolling """ return self._rolling_cls(self, min_periods=min_periods, diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 844bdef5fda..bf478cee144 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -95,34 +95,38 @@ def __len__(self): class DataArrayRolling(Rolling): - """ - This class adds the following class methods; - + _reduce_method(cls, func) - + _bottleneck_reduce(cls, func) - - These class methods will be used to inject numpy or bottleneck function - by doing - - >>> func = cls._reduce_method(f) - >>> func.__name__ = name - >>> setattr(cls, name, func) - - in ops.inject_bottleneck_rolling_methods. - - After the injection, the Rolling object will have `name` (such as `mean` or - `median`) methods, - e.g. it enables the following call, - >>> data.rolling().mean() + def __init__(self, obj, min_periods=None, center=False, **windows): + """ + Moving window object for DataArray. - If bottleneck is installed, some bottleneck methods will be used instdad of - the numpy method. + Parameters + ---------- + obj : DataArray + Object to window. + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). The default, None, is equivalent to + setting min_periods equal to the size of the window. + center : boolean, default False + Set the labels at the center of the window. + **windows : dim=window + dim : str + Name of the dimension to create the rolling iterator + along (e.g., `time`). + window : int + Size of the moving window. 
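Because the docstring now points users at ``DataArray.rolling`` instead of the class constructor, here is a minimal end-to-end sketch with invented data:

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(10.0), dims='time',
                      coords={'time': np.arange(10)})
    r = da.rolling(time=3, min_periods=2)    # a DataArrayRolling instance
    smoothed = r.mean()
    windows = r.to_dataarray('window_dim')   # the stacked-window view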
- see also - + rolling.DataArrayRolling - + ops.inject_bottleneck_rolling_methods - """ + Returns + ------- + rolling : type of input argument - def __init__(self, obj, min_periods=None, center=False, **windows): + See Also + -------- + DataArray.rolling + DataArray.groupby + Dataset.groupby + Dataset.rolling + """ super(DataArrayRolling, self).__init__(obj, min_periods=min_periods, center=center, **windows) self.window_indices = None @@ -176,14 +180,14 @@ def to_dataarray(self, window_dim, stride=None): Examples -------- >>> da = DataArray(np.arange(8).reshape(2, 4), dims=('a', 'b')) - + >>> >>> rolling = da.rolling(a=3) >>> rolling.to_datarray('window_dim') array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) Dimensions without coordinates: a, b, window_dim - + >>> >>> rolling = da.rolling(a=3, center=True) >>> rolling.to_datarray('window_dim') From ff31589b222be718ff6b9e45c839b4e225264c33 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 22:59:30 +0900 Subject: [PATCH 25/73] Improve docs in common.rolling. --- xarray/core/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index a48e08100ae..ad7981baebc 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -488,7 +488,8 @@ def rolling(self, min_periods=None, center=False, **windows): Returns ------- - rolling : type of input argument + Rolling object (core.rolling.DataArrayRolling for DataArray, + core.rolling.DatasetRolling for Dataset.) Examples -------- From 6c011cb7317475963937935fcae602b80e44f302 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 21 Jan 2018 23:22:11 +0900 Subject: [PATCH 26/73] Expose groupby docs to public --- doc/api.rst | 25 +++++++++++-------------- xarray/core/common.py | 5 +++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 8cf76a1a704..7a64d4186f5 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -480,21 +480,18 @@ Rolling objects core.rolling.DatasetRolling.to_dataset core.rolling.DatasetRolling.reduce -**Aggregation**: -:py:attr:`core.rolling.DataArrayRolling.argmax` -:py:attr:`core.rolling.DataArrayRolling.argmin` -:py:attr:`core.rolling.DataArrayRolling.max` -:py:attr:`core.rolling.DataArrayRolling.min` -:py:attr:`core.rolling.DataArrayRolling.mean` -:py:attr:`core.rolling.DataArrayRolling.prod` -:py:attr:`core.rolling.DataArrayRolling.sum` -:py:attr:`core.rolling.DataArrayRolling.std` -:py:attr:`core.rolling.DataArrayRolling.var` -:py:attr:`core.rolling.DataArrayRolling.median` -:py:attr:`core.rolling.DataArrayRolling.cumsum` -:py:attr:`core.rolling.DataArrayRolling.cumprod` -:py:attr:`core.rolling.DataArrayRolling.count` +GroupByObjects +============== + +.. autosummary:: + :toctree: generated/ + core.groupby.DataArrayGroupBy + core.groupby.DataArrayGroupBy.apply + core.groupby.DataArrayGroupBy.reduce + core.groupby.DatasetGroupBy + core.groupby.DatasetGroupBy.apply + core.groupby.DatasetGroupBy.reduce Plotting ======== diff --git a/xarray/core/common.py b/xarray/core/common.py index ad7981baebc..4b42f573bf5 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -412,6 +412,11 @@ def groupby(self, group, squeeze=True): grouped : GroupBy A `GroupBy` object patterned after `pandas.GroupBy` that can be iterated over in the form of `(unique_value, grouped_array)` pairs. 
+ + See Also + -------- + core.groupby.DataArrayGroupBy + core.groupby.DatasetGroupBy """ return self._groupby_cls(self, group, squeeze=squeeze) From 684145abd62b5c5dd36a98d21f10b3016292567e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 22 Jan 2018 08:18:10 +0900 Subject: [PATCH 27/73] Default fill_value=dtypes.NA, stride=1. Add comment for DataArrayRollig. --- xarray/core/rolling.py | 10 ++++++---- xarray/core/variable.py | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index bf478cee144..dc2ed0f9c6f 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -98,6 +98,8 @@ class DataArrayRolling(Rolling): def __init__(self, obj, min_periods=None, center=False, **windows): """ Moving window object for DataArray. + You should use the DataArray.rolling() method to construct this object + instead of the class constructor. Parameters ---------- @@ -124,8 +126,8 @@ def __init__(self, obj, min_periods=None, center=False, **windows): -------- DataArray.rolling DataArray.groupby - Dataset.groupby Dataset.rolling + Dataset.groupby """ super(DataArrayRolling, self).__init__(obj, min_periods=min_periods, center=center, **windows) @@ -157,7 +159,7 @@ def _setup_windows(self): self.window_indices = [slice(start, stop) for start, stop in zip(starts, stops)] - def to_dataarray(self, window_dim, stride=None): + def to_dataarray(self, window_dim, stride=1): """ Convert this rolling object to xr.DataArray, where the window dimension is stacked as a new dimension @@ -167,7 +169,7 @@ def to_dataarray(self, window_dim, stride=None): window_dim: str New name of the window dimension. stride: integer, optional - size of stride for the rolling window. + Size of stride for the rolling window. Returns ------- @@ -392,7 +394,7 @@ def wrapped_func(self, **kwargs): return Dataset(reduced, coords=self.obj.coords) return wrapped_func - def to_dataset(self, window_dim, stride=None): + def to_dataset(self, window_dim, stride=1): """ Convert this rolling object to xr.Dataset, where the window dimension is stacked as a new dimension diff --git a/xarray/core/variable.py b/xarray/core/variable.py index a6a4a6a1b21..9dded61d03f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -936,7 +936,7 @@ def shift(self, **shifts): result = result._shift_one_dim(dim, count) return result - def pad_with_fill_value(self, fill_value=None, **pad_widths): + def pad_with_fill_value(self, fill_value=dtypes.NA, **pad_widths): """ Return a new Variable with paddings. @@ -945,7 +945,7 @@ def pad_with_fill_value(self, fill_value=None, **pad_widths): **pad_width: keyword arguments of the form {dim: (before, after)} Number of values padded to the edges of each dimension. """ - if fill_value is None: + if fill_value is dtypes.NA: # np.nan is passed dtype, fill_value = dtypes.maybe_promote(self.dtype) else: dtype = self.dtype @@ -1542,7 +1542,7 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - fill_value = False if self.dtype.kind == 'b' else None + fill_value = False if self.dtype.kind == 'b' else dtypes.NA array = self.pad_with_fill_value(fill_value=fill_value, **{dim: pads}) return Variable(new_dims, duck_array_ops.rolling_window( array.data, axis=self.get_axis_num(dim), window=window)) From 3a7526e46ce07cbb4f3d159d6b617284bfd57676 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 22 Jan 2018 08:20:53 +0900 Subject: [PATCH 28/73] Default fill_value=dtypes.NA, stride=1. 
Add comment for DataArrayRollig. --- xarray/core/rolling.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index dc2ed0f9c6f..c27c69a8f15 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -98,7 +98,7 @@ class DataArrayRolling(Rolling): def __init__(self, obj, min_periods=None, center=False, **windows): """ Moving window object for DataArray. - You should use the DataArray.rolling() method to construct this object + You should use DataArray.rolling() method to construct this object instead of the class constructor. Parameters @@ -294,26 +294,11 @@ def wrapped_func(self, **kwargs): class DatasetRolling(Rolling): - """An object that implements the moving window pattern for Dataset. - - This class has an OrderedDict named self.rollings, that is a collection of - DataArrayRollings for all the DataArrays in the Dataset, except for those - not depending on rolling dimension. - - reduce() method returns a new Dataset generated from a set of - self.rollings[key].reduce(). - - See Also - -------- - Dataset.groupby - DataArray.groupby - Dataset.rolling - DataArray.rolling - """ - def __init__(self, obj, min_periods=None, center=False, **windows): """ Moving window object for Dataset. + You should use Dataset.rolling() method to construct this object + instead of the class constructor. Parameters ---------- @@ -335,6 +320,13 @@ def __init__(self, obj, min_periods=None, center=False, **windows): Returns ------- rolling : type of input argument + + See Also + -------- + Dataset.rolling + DataArray.rolling + Dataset.groupby + DataArray.groupby """ super(DatasetRolling, self).__init__(obj, min_periods, center, **windows) From a0968d648f636da8aa2f849513db6216eef63f81 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 22 Jan 2018 10:55:39 +0900 Subject: [PATCH 29/73] Add fill_value option to rolling.to_dataarray --- xarray/core/rolling.py | 17 ++++++++++++----- xarray/core/variable.py | 5 +++-- xarray/tests/test_dataarray.py | 6 ++++++ xarray/tests/test_dataset.py | 5 +++++ 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index c27c69a8f15..6011adb0429 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -9,6 +9,7 @@ from .ops import (inject_bottleneck_rolling_methods, inject_datasetrolling_methods, has_bottleneck, bn) from .dask_array_ops import dask_rolling_wrapper +from . import dtypes class Rolling(object): @@ -159,7 +160,7 @@ def _setup_windows(self): self.window_indices = [slice(start, stop) for start, stop in zip(starts, stops)] - def to_dataarray(self, window_dim, stride=1): + def to_dataarray(self, window_dim, stride=1, fill_value=dtypes.NA): """ Convert this rolling object to xr.DataArray, where the window dimension is stacked as a new dimension @@ -170,6 +171,8 @@ def to_dataarray(self, window_dim, stride=1): New name of the window dimension. stride: integer, optional Size of stride for the rolling window. + fill_value: optional. Default dtypes.NA + Filling value to match the dimension size. 
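A usage sketch for the new ``fill_value`` argument together with ``stride``; the data is invented. The padded positions at the head of the dimension take the given value instead of NaN, and only every second window is kept.

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(8.0), dims='time',
                      coords={'time': np.arange(8)})
    r = da.rolling(time=4)
    windows = r.to_dataarray('window_dim', stride=2, fill_value=0.0)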
Returns ------- @@ -201,7 +204,8 @@ def to_dataarray(self, window_dim, stride=1): from .dataarray import DataArray window = self.obj.variable.rolling_window(self.dim, self.window, - window_dim, self.center) + window_dim, self.center, + fill_value=fill_value) result = DataArray(window, dims=self.obj.dims + (window_dim,), coords=self.obj.coords) return result.isel(**{self.dim: slice(None, None, stride)}) @@ -236,7 +240,7 @@ def _counts(self): """ Number of non-nan entries in each rolling window. """ counts = (self.obj.notnull() .rolling(center=self.center, **{self.dim: self.window}) - .to_dataarray('_rolling_window_dim') + .to_dataarray('_rolling_window_dim', fill_value=False) .sum(dim='_rolling_window_dim')) return counts @@ -386,7 +390,7 @@ def wrapped_func(self, **kwargs): return Dataset(reduced, coords=self.obj.coords) return wrapped_func - def to_dataset(self, window_dim, stride=1): + def to_dataset(self, window_dim, stride=1, fill_value=dtypes.NA): """ Convert this rolling object to xr.Dataset, where the window dimension is stacked as a new dimension @@ -397,6 +401,8 @@ def to_dataset(self, window_dim, stride=1): New name of the window dimension. stride: integer, optional size of stride for the rolling window. + fill_value: optional. Default dtypes.NA + Filling value to match the dimension size. Returns ------- @@ -408,7 +414,8 @@ def to_dataset(self, window_dim, stride=1): dataset = OrderedDict() for key, da in self.obj.data_vars.items(): if self.dim in da.dims: - dataset[key] = self.rollings[key].to_dataarray(window_dim) + dataset[key] = self.rollings[key].to_dataarray( + window_dim, fill_value=fill_value) else: dataset[key] = da return Dataset(dataset, coords=self.obj.coords).isel( diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9dded61d03f..38d3b00158d 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1498,7 +1498,8 @@ def rank(self, dim, pct=False): ranked /= count return Variable(self.dims, ranked) - def rolling_window(self, dim, window, window_dim, center=False): + def rolling_window(self, dim, window, window_dim, center=False, + fill_value=dtypes.NA): """ Make a rolling_window along dim and add a new_dim to the last place. @@ -1514,6 +1515,7 @@ def rolling_window(self, dim, window, window_dim, center=False): If True, pad np.nan for both ends. Otherwise, pad in the head of the axis. 
+ Returns ------- Variable that is a view of the original array with a added dimension of @@ -1542,7 +1544,6 @@ def rolling_window(self, dim, window, window_dim, center=False): else: pads = (window - 1, 0) - fill_value = False if self.dtype.kind == 'b' else dtypes.NA array = self.pad_with_fill_value(fill_value=fill_value, **{dim: pads}) return Variable(new_dims, duck_array_ops.rolling_window( array.data, axis=self.get_axis_num(dim), window=window)) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index fb6d1fcfe93..6fdf182db61 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3434,6 +3434,12 @@ def test_rolling_to_dataarray(center, window): np.testing.assert_allclose(s_rolling.values[::2], da_rolling_mean.values) np.testing.assert_allclose(s_rolling.index[::2], da_rolling_mean['index']) + # with fill_value + da_rolling_mean = da_rolling.to_dataarray( + 'window', stride=2, fill_value=0.0).mean('window') + assert da_rolling_mean.isnull().sum() == 0 + assert (da_rolling_mean == 0.0).sum() >= 0 + @pytest.mark.parametrize('da', (1, 2), indirect=True) @pytest.mark.parametrize('center', (True, False)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index afd4e0a29c4..0c07ceb7d25 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4158,6 +4158,11 @@ def test_rolling_to_dataset(center, window): np.testing.assert_allclose(df_rolling['x'][::2].values, ds_rolling_mean['x'].values) np.testing.assert_allclose(df_rolling.index[::2], ds_rolling_mean['index']) + # with fill_value + ds_rolling_mean = ds_rolling.to_dataset( + 'window', stride=2, fill_value=0.0).mean('window') + assert ds_rolling_mean.isnull().sum() == 0 + assert (ds_rolling_mean['x'] == 0.0).sum() >= 0 @pytest.mark.slow From ac4f00e18527357f0546778e21612eb89f77cd25 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 22 Jan 2018 13:27:11 +0900 Subject: [PATCH 30/73] Convert non-numeric array in reduce. --- xarray/core/rolling.py | 7 +++++++ xarray/tests/test_dataarray.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6011adb0429..6d3eb7f6a87 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -228,6 +228,13 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ + # Reduce functions usually assumes numeric type. 
+ # For non-number array such as bool, We cast them to float + if self.obj.dtype.kind not in 'iufcm': + return DataArrayRolling( + self.obj.astype(float), center=self.center, + min_periods=self.min_periods, + **{self.dim: self.window}).reduce(func, **kwargs) windows = self.to_dataarray('_rolling_window_dim') result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 6fdf182db61..5e355a379df 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3293,6 +3293,11 @@ def da(request): [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims='time') + if request.param == 3: # boolean array + return DataArray( + [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], + dims='time').isnull() + @pytest.fixture def da_dask(seed=123): @@ -3441,7 +3446,7 @@ def test_rolling_to_dataarray(center, window): assert (da_rolling_mean == 0.0).sum() >= 0 -@pytest.mark.parametrize('da', (1, 2), indirect=True) +@pytest.mark.parametrize('da', (1, 2, 3), indirect=True) @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) From fbfc2627ce9cb83f6d69689cb673ca8fd24dd15c Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 22 Jan 2018 13:45:10 +0900 Subject: [PATCH 31/73] Fill_value = False for boolean array in rolling.reduce --- xarray/core/rolling.py | 12 +++++------- xarray/tests/test_dataarray.py | 22 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6d3eb7f6a87..6f9a822aafd 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -228,15 +228,13 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ + fill_value = dtypes.NA # Reduce functions usually assumes numeric type. # For non-number array such as bool, We cast them to float - if self.obj.dtype.kind not in 'iufcm': - return DataArrayRolling( - self.obj.astype(float), center=self.center, - min_periods=self.min_periods, - **{self.dim: self.window}).reduce(func, **kwargs) - - windows = self.to_dataarray('_rolling_window_dim') + if self.obj.dtype.kind == 'b': + fill_value = False + windows = self.to_dataarray('_rolling_window_dim', + fill_value=fill_value) result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) # Find valid windows based on count. 
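The boolean handling introduced over the last few patches can be exercised with a tiny example; the data is made up. Because boolean input is padded with ``False``, a windowed sum simply counts ``True`` values without the edges being inflated by the padding.

    import numpy as np
    import xarray as xr

    mask = xr.DataArray(np.array([True, False, True, True, False]),
                        dims='time', coords={'time': np.arange(5)})
    true_counts = mask.rolling(time=3, min_periods=1).reduce(np.sum)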
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5e355a379df..468b9e2eebe 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3446,7 +3446,7 @@ def test_rolling_to_dataarray(center, window): assert (da_rolling_mean == 0.0).sum() >= 0 -@pytest.mark.parametrize('da', (1, 2, 3), indirect=True) +@pytest.mark.parametrize('da', (1, 2), indirect=True) @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) @@ -3466,6 +3466,26 @@ def test_rolling_reduce(da, center, min_periods, window, name): assert actual.dims == expected.dims +@pytest.mark.parametrize('da', (3, ), indirect=True) +@pytest.mark.parametrize('center', (True, False)) +@pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) +@pytest.mark.parametrize('window', (1, 2, 3, 4)) +@pytest.mark.parametrize('name', ('sum', 'max')) +def test_rolling_reduce_nonnumeric(da, center, min_periods, window, name): + + if min_periods is not None and window < min_periods: + min_periods = window + + rolling_obj = da.rolling(time=window, center=center, + min_periods=min_periods) + + # add nan prefix to numpy methods to get similar # behavior as bottleneck + actual = rolling_obj.reduce(getattr(np, 'nan%s' % name)) + expected = getattr(rolling_obj, name)() + assert_allclose(actual, expected) + assert actual.dims == expected.dims + + def test_rolling_count_correct(): da = DataArray( From c75798623938e21b539a7c7969817733bf290a1e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 22 Jan 2018 23:23:47 +0900 Subject: [PATCH 32/73] Support old numpy plus bottleneck combination. Suppress warning for all-nan slice reduce. --- xarray/core/dtypes.py | 10 ++++++++++ xarray/core/rolling.py | 12 +++++++----- xarray/tests/test_dataarray.py | 14 +++++++------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index ccbe48edc32..2a7f9ccf0ce 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -60,3 +60,13 @@ def is_datetime_like(dtype): """ return (np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)) + + +def reduceable_fill_value(dtype): + """ Fill value that can be calculated with dtype. """ + if dtype.kind == 'b': + return False + promoted_dtype, _ = maybe_promote(dtype) + if promoted_dtype == object: + raise TypeError('dtype {} is not reduceable.'.format(dtype)) + return NA diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6f9a822aafd..8464109672d 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -228,11 +228,7 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ - fill_value = dtypes.NA - # Reduce functions usually assumes numeric type. - # For non-number array such as bool, We cast them to float - if self.obj.dtype.kind == 'b': - fill_value = False + fill_value = dtypes.reduceable_fill_value(self.obj.dtype) windows = self.to_dataarray('_rolling_window_dim', fill_value=fill_value) result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) @@ -282,6 +278,12 @@ def wrapped_func(self, **kwargs): padded = self.obj.variable if self.center: shift = (-self.window // 2) + 1 + + if (LooseVersion(np.__version__) < LooseVersion('1.13') and + self.obj.dtype.kind == 'b'): + # with numpy < 1.13 bottleneck cannot handle np.nan-Boolean + # mixed array correctly. We cast boolean array to float. 
+ padded = padded.astype(float) padded = padded.pad_with_fill_value(**{self.dim: (0, -shift)}) valid = (slice(None), ) * axis + (slice(-shift, None), ) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 468b9e2eebe..63884fd342d 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3293,11 +3293,6 @@ def da(request): [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims='time') - if request.param == 3: # boolean array - return DataArray( - [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], - dims='time').isnull() - @pytest.fixture def da_dask(seed=123): @@ -3456,6 +3451,10 @@ def test_rolling_reduce(da, center, min_periods, window, name): if min_periods is not None and window < min_periods: min_periods = window + if da.isnull().sum() > 1 and window == 1: + # this causes all nan slices + window = 2 + rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) @@ -3466,12 +3465,13 @@ def test_rolling_reduce(da, center, min_periods, window, name): assert actual.dims == expected.dims -@pytest.mark.parametrize('da', (3, ), indirect=True) @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) @pytest.mark.parametrize('name', ('sum', 'max')) -def test_rolling_reduce_nonnumeric(da, center, min_periods, window, name): +def test_rolling_reduce_nonnumeric(center, min_periods, window, name): + da = DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], + dims='time').isnull() if min_periods is not None and window < min_periods: min_periods = window From 8fd5fa320ddcc027d07432667bbe39c61831bc99 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Mon, 22 Jan 2018 23:36:28 +0900 Subject: [PATCH 33/73] flake8 --- xarray/tests/test_dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 63884fd342d..28dac45e923 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3471,7 +3471,7 @@ def test_rolling_reduce(da, center, min_periods, window, name): @pytest.mark.parametrize('name', ('sum', 'max')) def test_rolling_reduce_nonnumeric(center, min_periods, window, name): da = DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], - dims='time').isnull() + dims='time').isnull() if min_periods is not None and window < min_periods: min_periods = window From ade5ba2ab73c2e15fc76478c4366da5610a5bde6 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 23 Jan 2018 08:44:36 +0900 Subject: [PATCH 34/73] Add benchmark --- asv_bench/benchmarks/rolling.py | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 asv_bench/benchmarks/rolling.py diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 00000000000..db25e9c06e4 --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,46 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +import xarray as xr + +from . 
import randn + + +nx = 3000 +ny = 2000 +nt = 1000 +ds = xr.Dataset({'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), + 'var2': (('x', 't'), randn((nx, nt))), + 'var3': (('t', ), randn(nt))}, + coords={'x': np.arange(nx), + 'y': np.linspace(0, 1, ny), + 't': pd.date_range('1970-01-01', periods=nt, freq='D'), + 'x_coords': ('x', np.linspace(1.1, 2.1, nx))}) + + +def time_rolling(func, center): + getattr(ds.rolling(x=20, center=center), func)() + + +time_rolling.param_names = ['func', 'center'] +time_rolling.params = (['mean', 'count'], [True, False]) + + +def time_rolling_np(func, center): + ds.rolling(x=20, center=center).reduce(getattr(np, 'nan{}'.format(func))) + + +time_rolling_np.param_names = ['func', 'center'] +time_rolling_np.params = (['mean', ], [True, False]) + + +def time_rolling_to_dataset(center, stride): + ds.rolling(x=20, center=center).to_dataset( + 'window_dim', stride=stride).mean(dim='window_dim') + + +time_rolling_to_dataset.param_names = ['center', 'stride'] +time_rolling_to_dataset.params = ([True, False], [1, 200]) From 2d6897fb66cc0ce766a148595fd36bca6b527a0e Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 23 Jan 2018 09:02:15 +0900 Subject: [PATCH 35/73] Dataset.count. Benchmark --- asv_bench/benchmarks/rolling.py | 14 ++++++++------ xarray/core/rolling.py | 10 ++++++++++ xarray/tests/test_dataarray.py | 32 +++++++++++++++++--------------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index db25e9c06e4..ae8211a6028 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -12,6 +12,7 @@ nx = 3000 ny = 2000 nt = 1000 +window = 20 ds = xr.Dataset({'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), 'var2': (('x', 't'), randn((nx, nt))), 'var3': (('t', ), randn(nt))}, @@ -22,23 +23,24 @@ def time_rolling(func, center): - getattr(ds.rolling(x=20, center=center), func)() + getattr(ds.rolling(x=window, center=center), func)() time_rolling.param_names = ['func', 'center'] time_rolling.params = (['mean', 'count'], [True, False]) -def time_rolling_np(func, center): - ds.rolling(x=20, center=center).reduce(getattr(np, 'nan{}'.format(func))) +def time_rolling_np(window_, min_periods): + ds.rolling(x=window_, center=False, min_periods=min_periods).reduce( + getattr(np, 'nanmean')) -time_rolling_np.param_names = ['func', 'center'] -time_rolling_np.params = (['mean', ], [True, False]) +time_rolling_np.param_names = ['window_', 'min_periods'] +time_rolling_np.params = ([20, 40], [5, None]) def time_rolling_to_dataset(center, stride): - ds.rolling(x=20, center=center).to_dataset( + ds.rolling(x=window, center=center).to_dataset( 'window_dim', stride=stride).mean(dim='window_dim') diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 8464109672d..9f24b78106a 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -378,6 +378,16 @@ def reduce(self, func, **kwargs): reduced[key] = self.obj[key] return Dataset(reduced, coords=self.obj.coords) + def _counts(self): + from .dataset import Dataset + reduced = OrderedDict() + for key, da in self.obj.data_vars.items(): + if self.dim in da.dims: + reduced[key] = self.rollings[key]._counts() + else: + reduced[key] = self.obj[key] + return Dataset(reduced, coords=self.obj.coords) + @classmethod def _reduce_method(cls, func): """ diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 28dac45e923..b34662bd2ca 100644 --- a/xarray/tests/test_dataarray.py +++ 
b/xarray/tests/test_dataarray.py @@ -3491,21 +3491,23 @@ def test_rolling_count_correct(): da = DataArray( [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims='time') - result = da.rolling(time=11, min_periods=1).count() - expected = DataArray( - [1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8], dims='time') - assert_equal(result, expected) - - result = da.rolling(time=11, min_periods=None).count() - expected = DataArray( - [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan, np.nan], dims='time') - assert_equal(result, expected) - - result = da.rolling(time=7, min_periods=2).count() - expected = DataArray( - [np.nan, np.nan, 2, 3, 3, 4, 5, 5, 5, 5, 5], dims='time') - assert_equal(result, expected) + kwargs = [{'time': 11, 'min_periods': 1}, + {'time': 11, 'min_periods': None}, + {'time': 7, 'min_periods': 2}] + expecteds = [DataArray( + [1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8], dims='time'), + DataArray( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], dims='time'), + DataArray( + [np.nan, np.nan, 2, 3, 3, 4, 5, 5, 5, 5, 5], dims='time')] + + for kwarg, expected in zip(kwargs, expecteds): + result = da.rolling(**kwarg).count() + assert_equal(result, expected) + + result = da.to_dataset(name='var1').rolling(**kwarg).count()['var1'] + assert_equal(result, expected) def test_raise_no_warning_for_nan_in_binary_ops(): From 6461f84ca1740f6991b6dd38a2693dbae4ec4639 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 24 Jan 2018 07:50:25 +0900 Subject: [PATCH 36/73] Classize benchmark --- asv_bench/benchmarks/rolling.py | 55 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index ae8211a6028..0093fc4318c 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -6,43 +6,48 @@ import pandas as pd import xarray as xr -from . import randn - +from . 
import randn, requires_dask nx = 3000 ny = 2000 nt = 1000 window = 20 -ds = xr.Dataset({'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), - 'var2': (('x', 't'), randn((nx, nt))), - 'var3': (('t', ), randn(nt))}, - coords={'x': np.arange(nx), - 'y': np.linspace(0, 1, ny), - 't': pd.date_range('1970-01-01', periods=nt, freq='D'), - 'x_coords': ('x', np.linspace(1.1, 2.1, nx))}) - - -def time_rolling(func, center): - getattr(ds.rolling(x=window, center=center), func)() -time_rolling.param_names = ['func', 'center'] -time_rolling.params = (['mean', 'count'], [True, False]) +class Rolling(object): + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), + 'var2': (('x', 't'), randn((nx, nt))), + 'var3': (('t', ), randn(nt))}, + coords={'x': np.arange(nx), + 'y': np.linspace(0, 1, ny), + 't': pd.date_range('1970-01-01', periods=nt, freq='D'), + 'x_coords': ('x', np.linspace(1.1, 2.1, nx))}) + def time_rolling(self, func, center): + getattr(self.ds.rolling(x=window, center=center), func)() -def time_rolling_np(window_, min_periods): - ds.rolling(x=window_, center=False, min_periods=min_periods).reduce( - getattr(np, 'nanmean')) + time_rolling.param_names = ['func', 'center'] + time_rolling.params = (['mean', 'count'], [True, False]) + def time_rolling_np(self, window_, min_periods): + self.ds.rolling(x=window_, center=False, + min_periods=min_periods).reduce(getattr(np, 'nanmean')) -time_rolling_np.param_names = ['window_', 'min_periods'] -time_rolling_np.params = ([20, 40], [5, None]) + time_rolling_np.param_names = ['window_', 'min_periods'] + time_rolling_np.params = ([20, 40], [5, None]) + def time_rolling_to_dataset(self, center, stride): + self.ds.rolling(x=window, center=center).to_dataset( + 'window_dim', stride=stride).mean(dim='window_dim') -def time_rolling_to_dataset(center, stride): - ds.rolling(x=window, center=center).to_dataset( - 'window_dim', stride=stride).mean(dim='window_dim') + time_rolling_to_dataset.param_names = ['center', 'stride'] + time_rolling_to_dataset.params = ([True, False], [1, 200]) -time_rolling_to_dataset.param_names = ['center', 'stride'] -time_rolling_to_dataset.params = ([True, False], [1, 200]) +class RollingDask(Rolling): + def setup(self, *args, **kwargs): + requires_dask() + super(RollingDask, self).setup(**kwargs) + self.ds = self.ds.chunk({'x': 100, 'y': 50, 't': 50}) From aece1c4b48d5e3fb6a71bde009dface919cf5722 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 24 Jan 2018 12:22:19 +0900 Subject: [PATCH 37/73] Decoratorize for asv benchmark --- asv_bench/benchmarks/__init__.py | 8 ++++++++ asv_bench/benchmarks/rolling.py | 17 +++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index 21ee86e28e3..c9611e4c581 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -10,6 +10,14 @@ _counter = itertools.count() +def parameterized(names, params): + def decorator(func): + func.param_names = names + func.params = params + return func + return decorator + + def requires_dask(): try: import dask diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 0093fc4318c..c28e5a54f2a 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -6,7 +6,7 @@ import pandas as pd import xarray as xr -from . import randn, requires_dask +from . 
import parameterized, randn, requires_dask nx = 3000 ny = 2000 @@ -25,26 +25,23 @@ def setup(self, *args, **kwargs): 't': pd.date_range('1970-01-01', periods=nt, freq='D'), 'x_coords': ('x', np.linspace(1.1, 2.1, nx))}) + @parameterized(['func', 'center'], + (['mean', 'count'], [True, False])) def time_rolling(self, func, center): getattr(self.ds.rolling(x=window, center=center), func)() - time_rolling.param_names = ['func', 'center'] - time_rolling.params = (['mean', 'count'], [True, False]) - + @parameterized(['window_', 'min_periods'], + ([20, 40], [5, None])) def time_rolling_np(self, window_, min_periods): self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce(getattr(np, 'nanmean')) - time_rolling_np.param_names = ['window_', 'min_periods'] - time_rolling_np.params = ([20, 40], [5, None]) - + @parameterized(['center', 'stride'], + ([True, False], [1, 200])) def time_rolling_to_dataset(self, center, stride): self.ds.rolling(x=window, center=center).to_dataset( 'window_dim', stride=stride).mean(dim='window_dim') - time_rolling_to_dataset.param_names = ['center', 'stride'] - time_rolling_to_dataset.params = ([True, False], [1, 200]) - class RollingDask(Rolling): def setup(self, *args, **kwargs): From 4189d71d9998ff83deb9a5d7035a2edaf628ae25 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Wed, 24 Jan 2018 18:55:44 +0900 Subject: [PATCH 38/73] Classize benchmarks/indexing.py --- asv_bench/benchmarks/__init__.py | 1 - asv_bench/benchmarks/indexing.py | 28 ++++++++-------------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index d84eda387be..dd00ffbb037 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -2,7 +2,6 @@ from __future__ import division from __future__ import print_function import itertools -import random import numpy as np diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index e9a85115a49..0887fb52f57 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -6,7 +6,7 @@ import pandas as pd import xarray as xr -from . import randn, randint, requires_dask +from . 
import parameterized, randn, randint, requires_dask nx = 3000 @@ -61,7 +61,7 @@ class Base(object): - def setup(self, key): + def setup(self, *args, **kwargs): self.ds = xr.Dataset( {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), 'var2': (('x', 't'), randn((nx, nt))), @@ -73,53 +73,41 @@ def setup(self, key): class Indexing(Base): + @parameterized(['key'], (list(basic_indexes.keys()))) def time_indexing_basic(self, key): self.ds.isel(**basic_indexes[key]).load() - time_indexing_basic.param_names = ['key'] - time_indexing_basic.params = [list(basic_indexes.keys())] - + @parameterized(['key'], (list(outer_indexes.keys()))) def time_indexing_outer(self, key): self.ds.isel(**outer_indexes[key]).load() - time_indexing_outer.param_names = ['key'] - time_indexing_outer.params = [list(outer_indexes.keys())] - + @parameterized(['key'], (list(vectorized_indexes.keys()))) def time_indexing_vectorized(self, key): self.ds.isel(**vectorized_indexes[key]).load() - time_indexing_vectorized.param_names = ['key'] - time_indexing_vectorized.params = [list(vectorized_indexes.keys())] - class Assignment(Base): + @parameterized(['key'], (list(basic_indexes.keys()))) def time_assignment_basic(self, key): ind = basic_indexes[key] val = basic_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val - time_assignment_basic.param_names = ['key'] - time_assignment_basic.params = [list(basic_indexes.keys())] - + @parameterized(['key'], (list(outer_indexes.keys()))) def time_assignment_outer(self, key): ind = outer_indexes[key] val = outer_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val - time_assignment_outer.param_names = ['key'] - time_assignment_outer.params = [list(outer_indexes.keys())] - + @parameterized(['key'], (list(vectorized_indexes.keys()))) def time_assignment_vectorized(self, key): ind = vectorized_indexes[key] val = vectorized_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val - time_assignment_vectorized.param_names = ['key'] - time_assignment_vectorized.params = [list(vectorized_indexes.keys())] - class IndexingDask(Indexing): def setup(self, key): From 081c928bf50a284698b4a5e75b3e04fa9166aa45 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 27 Jan 2018 11:18:07 +0900 Subject: [PATCH 39/73] Working with nanreduce --- xarray/core/duck_array_ops.py | 35 ++++++++++----- xarray/core/rolling.py | 4 +- xarray/testing.py | 61 +++++++++++++++++++++++++ xarray/tests/test_duck_array_op.py | 71 ++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 15 deletions(-) create mode 100644 xarray/tests/test_duck_array_op.py diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a92190db5c8..a4f6bbf708f 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -174,7 +174,7 @@ def _ignore_warnings_if(condition): def _create_nan_agg_method(name, numeric_only=False, np_compat=False, no_bottleneck=False, coerce_strings=False, - keep_dims=False): + keep_dims=False, support_object_type=False): def f(values, axis=None, skipna=None, **kwargs): if kwargs.pop('out', None) is not None: raise TypeError('`out` is not valid for {}'.format(name)) @@ -187,7 +187,8 @@ def f(values, axis=None, skipna=None, **kwargs): values = values.astype(object) if skipna or (skipna is None and values.dtype.kind in 'cf'): - if values.dtype.kind not in ['u', 'i', 'f', 'c']: + if (not support_object_type and + values.dtype.kind not in ['u', 'i', 
'f', 'c']): raise NotImplementedError( 'skipna=True not yet implemented for %s with dtype %s' % (name, values.dtype)) @@ -228,16 +229,26 @@ def f(values, axis=None, skipna=None, **kwargs): return f -argmax = _create_nan_agg_method('argmax', coerce_strings=True) -argmin = _create_nan_agg_method('argmin', coerce_strings=True) -max = _create_nan_agg_method('max', coerce_strings=True) -min = _create_nan_agg_method('min', coerce_strings=True) -sum = _create_nan_agg_method('sum', numeric_only=True) -mean = _create_nan_agg_method('mean', numeric_only=True) -std = _create_nan_agg_method('std', numeric_only=True) -var = _create_nan_agg_method('var', numeric_only=True) -median = _create_nan_agg_method('median', numeric_only=True) -prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True) +argmax = _create_nan_agg_method('argmax', coerce_strings=True, + support_object_type=True) +argmin = _create_nan_agg_method('argmin', coerce_strings=True, + support_object_type=True) +max = _create_nan_agg_method('max', coerce_strings=True, + support_object_type=True) +min = _create_nan_agg_method('min', coerce_strings=True, + support_object_type=True) +sum = _create_nan_agg_method('sum', numeric_only=True, + support_object_type=True) +mean = _create_nan_agg_method('mean', numeric_only=True, + support_object_type=True) +std = _create_nan_agg_method('std', numeric_only=True, + support_object_type=True) +var = _create_nan_agg_method('var', numeric_only=True, + support_object_type=True) +median = _create_nan_agg_method('median', numeric_only=True, + support_object_type=True) +prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True, + support_object_type=True) cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True, no_bottleneck=True, keep_dims=True) cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True, diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 9f24b78106a..31de7b01a6b 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -228,9 +228,7 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ - fill_value = dtypes.reduceable_fill_value(self.obj.dtype) - windows = self.to_dataarray('_rolling_window_dim', - fill_value=fill_value) + windows = self.to_dataarray('_rolling_window_dim') result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) # Find valid windows based on count. diff --git a/xarray/testing.py b/xarray/testing.py index f51e474405f..f15c8eb3714 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -27,6 +27,13 @@ def _data_allclose_or_equiv(arr1, arr2, rtol=1e-05, atol=1e-08, arr1, arr2, rtol=rtol, atol=atol) +def _data_allclose_or_equiv_nan(arr1, arr2, rtol=1e-05, atol=1e-08, + decode_bytes=True): + index = (~arr1.isnull()).nonzero() + assert index == (~arr2.isnull()).nonzero() + _data_allclose_or_equiv(arr1[index], arr2[index], rtol, atol, decode_bytes) + + def assert_equal(a, b): """Like :py:func:`numpy.testing.assert_array_equal`, but for xarray objects. @@ -140,3 +147,57 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): else: raise TypeError('{} not supported by assertion comparison' .format(type(a))) + + +def assert_allclose_with_nan(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): + """Like assert_allclose, but except for nan. + + Raises an AssertionError if two objects are not equal up to desired + tolerance. 
+ + Parameters + ---------- + a : xarray.Dataset, xarray.DataArray or xarray.Variable + The first object to compare. + b : xarray.Dataset, xarray.DataArray or xarray.Variable + The second object to compare. + rtol : float, optional + Relative tolerance. + atol : float, optional + Absolute tolerance. + decode_bytes : bool, optional + Whether byte dtypes should be decoded to strings as UTF-8 or not. + This is useful for testing serialization methods on Python 3 that + return saved strings as bytes. + + See also + -------- + assert_identical, assert_equal, numpy.testing.assert_allclose + """ + import xarray as xr + # __tracebackhide__ = True # noqa: F841 + assert type(a) == type(b) + kwargs = dict(rtol=rtol, atol=atol, decode_bytes=decode_bytes) + if isinstance(a, xr.Variable): + assert a.dims == b.dims + allclose = _data_allclose_or_equiv_nan(a.values, b.values, **kwargs) + assert allclose, '{}\n{}'.format(a.values, b.values) + elif isinstance(a, xr.DataArray): + assert_allclose(a.variable, b.variable, **kwargs) + assert set(a.coords) == set(b.coords) + for v in a.coords.variables: + # can't recurse with this function as coord is sometimes a + # DataArray, so call into _data_allclose_or_equiv directly + allclose = _data_allclose_or_equiv_nan( + a.coords[v].values, b.coords[v].values, **kwargs) + assert allclose, '{}\n{}'.format(a.coords[v].values, + b.coords[v].values) + elif isinstance(a, xr.Dataset): + assert set(a.data_vars) == set(b.data_vars) + assert set(a.coords) == set(b.coords) + for k in list(a.variables) + list(a.coords): + assert_allclose_with_nan(a[k], b[k], **kwargs) + + else: + raise TypeError('{} not supported by assertion comparison' + .format(type(a))) diff --git a/xarray/tests/test_duck_array_op.py b/xarray/tests/test_duck_array_op.py new file mode 100644 index 00000000000..9082de0c4f0 --- /dev/null +++ b/xarray/tests/test_duck_array_op.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import pandas as pd +import pickle +import pytest +from copy import deepcopy +from textwrap import dedent +from distutils.version import LooseVersion + +import xarray as xr + +from xarray import (align, broadcast, Dataset, DataArray, + IndexVariable, Variable) +from xarray.coding.times import CFDatetimeCoder +from xarray.core.pycompat import iteritems, OrderedDict +from xarray.core.common import full_like +from xarray.tests import ( + TestCase, ReturnItem, source_ndarray, unittest, requires_dask, + assert_identical, assert_equal, assert_allclose, assert_array_equal, + raises_regex, requires_scipy, requires_bottleneck) +from xarray.core.ops import NAN_REDUCE_METHODS + + +def construct_dataarray(dtype, contains_nan, dask): + da = DataArray(np.random.randn(15, 30), dims=('x', 'y'), + coords={'x': np.arange(15)}).astype(dtype) + + if contains_nan: + da = da.reindex(x=np.arange(20)) + if dask: + da = da.chunk({'x': 5, 'y': 10}) + + return da + + +def assert_allclose_with_nan(a, b, **kwargs): + """ Extension of np.allclose with nan-including array """ + index = ~np.isnan(a) + print(a) + print(b) + assert index == ~np.isnan(b) + assert np.allclose(a[index], b[index], **kwargs) + + +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('contains_nan', [False, True]) +@pytest.mark.parametrize('dask', [False, ]) +@pytest.mark.parametrize('func', NAN_REDUCE_METHODS) +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('dim', [None, 'x', 
'y']) +def test_reduce(dtype, contains_nan, dask, func, skipna, dim): + if dask: # TODO some reduce methods are not available for dask + if func in ['sum']: + return + + da = construct_dataarray(dtype, contains_nan, dask) + + if skipna: + try: # TODO currently, we only support methods that numpy supports + expected = getattr(np, 'nan{}'.format(func))(da.values) + except TypeError: + with pytest.raises(NotImplementedError): + actual = getattr(da, func)(skipna=skipna) + return + else: + expected = getattr(np, func)(da.values) + + actual = getattr(da, func)(skipna=skipna) + assert_allclose_with_nan(actual.values, np.array(expected)) From 75c1d7d7fec6f5abaaf21037f4362cb7921552b2 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 30 Jan 2018 11:21:50 +0900 Subject: [PATCH 40/73] Support .sum for object dtype. --- xarray/core/dtypes.py | 10 ----- xarray/core/duck_array_ops.py | 49 ++++++++++++++----------- xarray/tests/test_duck_array_ops.py | 57 +++++++++++++++++++++++++++-- xarray/tests/test_variable.py | 2 +- 4 files changed, 83 insertions(+), 35 deletions(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 2a7f9ccf0ce..ccbe48edc32 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -60,13 +60,3 @@ def is_datetime_like(dtype): """ return (np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)) - - -def reduceable_fill_value(dtype): - """ Fill value that can be calculated with dtype. """ - if dtype.kind == 'b': - return False - promoted_dtype, _ = maybe_promote(dtype) - if promoted_dtype == object: - raise TypeError('dtype {} is not reduceable.'.format(dtype)) - return NA diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a4f6bbf708f..89a3517f3e7 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -183,17 +183,30 @@ def f(values, axis=None, skipna=None, **kwargs): dtype = kwargs.get('dtype', None) values = asarray(values) + # dask requires dtype argument for object dtype + if (values.dtype == 'object' and name in ['sum', 'mean']): + kwargs['dtype'] = values.dtype if dtype is None else dtype + + # dask can't compute std for object dtype with skipna==False + if values.dtype == 'object' and name in ['std', 'var'] and not skipna: + raise NotImplementedError( + '%s for %s-dtype is not yet implemented on dask arrays' + % (name, values.dtype)) + if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) if skipna or (skipna is None and values.dtype.kind in 'cf'): - if (not support_object_type and - values.dtype.kind not in ['u', 'i', 'f', 'c']): - raise NotImplementedError( - 'skipna=True not yet implemented for %s with dtype %s' - % (name, values.dtype)) nanname = 'nan' + name - if (isinstance(axis, tuple) or not values.dtype.isnative or + if values.dtype.kind not in ['u', 'i', 'f', 'c']: + + if not support_object_type: + raise NotImplementedError( + 'skipna=True not yet implemented for %s with dtype %s' + % (name, values.dtype)) + eager_module = np + + elif (isinstance(axis, tuple) or not values.dtype.isnative or no_bottleneck or (dtype is not None and np.dtype(dtype) != values.dtype)): # bottleneck can't handle multiple axis arguments or non-native @@ -214,7 +227,8 @@ def f(values, axis=None, skipna=None, **kwargs): with _ignore_warnings_if(using_numpy_nan_func): try: return func(values, axis=axis, **kwargs) - except AttributeError: + except AttributeError as e: + print(e) if isinstance(values, dask_array_type): msg = '%s is not yet implemented on dask arrays' % 
name else: @@ -229,26 +243,19 @@ def f(values, axis=None, skipna=None, **kwargs): return f -argmax = _create_nan_agg_method('argmax', coerce_strings=True, - support_object_type=True) -argmin = _create_nan_agg_method('argmin', coerce_strings=True, - support_object_type=True) +argmax = _create_nan_agg_method('argmax', coerce_strings=True) +argmin = _create_nan_agg_method('argmin', coerce_strings=True) max = _create_nan_agg_method('max', coerce_strings=True, support_object_type=True) min = _create_nan_agg_method('min', coerce_strings=True, support_object_type=True) sum = _create_nan_agg_method('sum', numeric_only=True, support_object_type=True) -mean = _create_nan_agg_method('mean', numeric_only=True, - support_object_type=True) -std = _create_nan_agg_method('std', numeric_only=True, - support_object_type=True) -var = _create_nan_agg_method('var', numeric_only=True, - support_object_type=True) -median = _create_nan_agg_method('median', numeric_only=True, - support_object_type=True) -prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True, - support_object_type=True) +mean = _create_nan_agg_method('mean', numeric_only=True) +std = _create_nan_agg_method('std', numeric_only=True) +var = _create_nan_agg_method('var', numeric_only=True) +median = _create_nan_agg_method('median', numeric_only=True) +prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True) cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True, no_bottleneck=True, keep_dims=True) cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True, diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 9fb1b1aad40..c51c67dd13a 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -1,15 +1,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from pytest import mark +import pytest import numpy as np from numpy import array, nan from . import assert_array_equal from xarray.core.duck_array_ops import ( first, last, count, mean, array_notnull_equiv, ) +from xarray.core.ops import NAN_REDUCE_METHODS +from xarray import DataArray from . 
import TestCase, raises_regex +from xarray.testing import assert_allclose class TestOps(TestCase): @@ -81,7 +84,7 @@ def test_all_nan_arrays(self): class TestArrayNotNullEquiv(): - @mark.parametrize("arr1, arr2", [ + @pytest.mark.parametrize("arr1, arr2", [ (np.array([1, 2, 3]), np.array([1, 2, 3])), (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])), (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])), @@ -99,7 +102,7 @@ def test_wrong_shape(self): b = np.array([[1, 2], [np.nan, 4]]) assert not array_notnull_equiv(a, b) - @mark.parametrize("val1, val2, val3, null", [ + @pytest.mark.parametrize("val1, val2, val3, null", [ (1, 2, 3, None), (1., 2., 3., np.nan), (1., 2., 3., None), @@ -109,3 +112,51 @@ def test_types(self, val1, val2, val3, null): arr1 = np.array([val1, null, val3, null]) arr2 = np.array([val1, val2, null, null]) assert array_notnull_equiv(arr1, arr2) + + +def construct_dataarray(dtype, contains_nan, dask): + da = DataArray(np.random.randn(15, 30), dims=('x', 'y'), + coords={'x': np.arange(15)}).astype(dtype) + + if contains_nan: + da = da.reindex(x=np.arange(20)) + if dask: + da = da.chunk({'x': 5, 'y': 10}) + + return da + + +def assert_allclose_with_nan(a, b, **kwargs): + """ Extension of np.allclose with nan-including array """ + index = ~np.isnan(a) + assert index == ~np.isnan(b) + assert np.allclose(a[index], b[index], **kwargs) + + +@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) +@pytest.mark.parametrize('dask', [False, True]) +@pytest.mark.parametrize('func', ['sum', 'min', 'max']) # TODO support more +@pytest.mark.parametrize('skipna', [False, True]) +@pytest.mark.parametrize('dim', [None, 'x', ('x', 'y')]) +def test_reduce(dtype, dask, func, skipna, dim): + + da = construct_dataarray(dtype, contains_nan=True, dask=dask) + + if skipna: + try: # TODO currently, we only support methods that numpy supports + expected = getattr(np, 'nan{}'.format(func))(da.values) + except (TypeError, AttributeError): + with pytest.raises(NotImplementedError): + actual = getattr(da, func)(skipna=skipna) + return + else: + expected = getattr(np, func)(da.values) + + actual = getattr(da, func)(skipna=skipna) + assert_allclose_with_nan(actual.values, np.array(expected)) + + # without nan + da = construct_dataarray(dtype, contains_nan=False, dask=dask) + expected = getattr(np, 'nan{}'.format(func))(da.values) + actual = getattr(da, func)(skipna=skipna) + assert np.allclose(actual.values, np.array(expected), atol=1.0e-10) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 0c7c6ad034b..8507d434de4 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1496,7 +1496,7 @@ def test_reduce_funcs(self): v = Variable('t', pd.date_range('2000-01-01', periods=3)) with pytest.raises(NotImplementedError): - v.max(skipna=True) + v.argmax(skipna=True) assert_identical( v.max(), Variable([], pd.Timestamp('2000-01-03'))) From 452b219c7544d55243343e40d72fa7e27508df1d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Tue, 30 Jan 2018 11:26:30 +0900 Subject: [PATCH 41/73] Remove unused if-statements. 
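The element-wise comparison used in the tests above treats a position as equal when both arrays hold NaN there and falls back to a tolerance check elsewhere. A minimal standalone sketch of that idea follows; the helper name is illustrative and not part of the patch.

    import numpy as np

    def allclose_with_nan(a, b, **kwargs):
        # Positions where both arrays are NaN compare equal; everything
        # else goes through np.isclose with the usual rtol/atol keywords.
        a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
        if a.shape != b.shape:
            return False
        both_nan = np.isnan(a) & np.isnan(b)
        return bool(np.all(both_nan | np.isclose(a, b, **kwargs)))
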
--- xarray/core/duck_array_ops.py | 8 +------- xarray/tests/test_duck_array_ops.py | 3 ++- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 89a3517f3e7..4b719e2216e 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -184,15 +184,9 @@ def f(values, axis=None, skipna=None, **kwargs): values = asarray(values) # dask requires dtype argument for object dtype - if (values.dtype == 'object' and name in ['sum', 'mean']): + if (values.dtype == 'object' and name in ['sum',]): kwargs['dtype'] = values.dtype if dtype is None else dtype - # dask can't compute std for object dtype with skipna==False - if values.dtype == 'object' and name in ['std', 'var'] and not skipna: - raise NotImplementedError( - '%s for %s-dtype is not yet implemented on dask arrays' - % (name, values.dtype)) - if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index c51c67dd13a..b551ada0ea4 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -115,7 +115,8 @@ def test_types(self, val1, val2, val3, null): def construct_dataarray(dtype, contains_nan, dask): - da = DataArray(np.random.randn(15, 30), dims=('x', 'y'), + rng = np.random.RandomState(0) + da = DataArray(rng.randn(15, 30), dims=('x', 'y'), coords={'x': np.arange(15)}).astype(dtype) if contains_nan: From c5490c4dd9ec6f8a87056c3be8980dd14034afa6 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 30 Jan 2018 13:49:32 +0900 Subject: [PATCH 42/73] Default skipna for rolling.reduce --- xarray/core/duck_array_ops.py | 1 - xarray/core/rolling.py | 3 ++- xarray/tests/test_dataarray.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 4b719e2216e..96008aa0df0 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -222,7 +222,6 @@ def f(values, axis=None, skipna=None, **kwargs): try: return func(values, axis=axis, **kwargs) except AttributeError as e: - print(e) if isinstance(values, dask_array_type): msg = '%s is not yet implemented on dask arrays' % name else: diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 31de7b01a6b..4b1b106af23 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -251,7 +251,8 @@ def _reduce_method(cls, func): """ def wrapped_func(self, **kwargs): - return self.reduce(func, **kwargs) + skipna = kwargs.pop('skipna', True) + return self.reduce(func, skipna=skipna, **kwargs) return wrapped_func @classmethod diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b34662bd2ca..54f4918563b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3479,7 +3479,7 @@ def test_rolling_reduce_nonnumeric(center, min_periods, window, name): rolling_obj = da.rolling(time=window, center=center, min_periods=min_periods) - # add nan prefix to numpy methods to get similar # behavior as bottleneck + # add nan prefix to numpy methods to get similar behavior as bottleneck actual = rolling_obj.reduce(getattr(np, 'nan%s' % name)) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) From ab913945a342f33e2cc4edeb9093b60fa8984ac9 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 30 Jan 2018 17:07:36 +0900 Subject: [PATCH 43/73] Pass tests. Test added to make sure the consistency to pandas' behavior. 
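For context, the skipna convention being checked against pandas can be summarised with a small example. This is a sketch assuming an xarray build with these patches applied; the values are illustrative.

    import numpy as np
    import pandas as pd
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, 3.0], dims='x')

    # skipna defaults to True for float data, so the NaN is ignored ...
    assert float(da.sum()) == 4.0
    # ... while skipna=False propagates it, matching plain np.sum.
    assert np.isnan(float(da.sum(skipna=False)))

    # pandas follows the same convention, which is what the new test compares against.
    assert pd.Series([1.0, np.nan, 3.0]).sum() == 4.0
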
--- xarray/core/duck_array_ops.py | 6 ++++-- xarray/core/rolling.py | 3 +-- xarray/tests/test_duck_array_ops.py | 26 ++++++++++++++++---------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 96008aa0df0..8e3aa985047 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -184,13 +184,15 @@ def f(values, axis=None, skipna=None, **kwargs): values = asarray(values) # dask requires dtype argument for object dtype - if (values.dtype == 'object' and name in ['sum',]): + if (values.dtype == 'object' and name in ['sum', ]): kwargs['dtype'] = values.dtype if dtype is None else dtype if coerce_strings and values.dtype.kind in 'SU': values = values.astype(object) - if skipna or (skipna is None and values.dtype.kind in 'cf'): + if (skipna or (skipna is None and values.dtype.kind in 'cf') or + (skipna is None and values.dtype == object + and support_object_type)): nanname = 'nan' + name if values.dtype.kind not in ['u', 'i', 'f', 'c']: diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 4b1b106af23..31de7b01a6b 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -251,8 +251,7 @@ def _reduce_method(cls, func): """ def wrapped_func(self, **kwargs): - skipna = kwargs.pop('skipna', True) - return self.reduce(func, skipna=skipna, **kwargs) + return self.reduce(func, **kwargs) return wrapped_func @classmethod diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index b551ada0ea4..cea985fbaf8 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -8,11 +8,9 @@ from xarray.core.duck_array_ops import ( first, last, count, mean, array_notnull_equiv, ) -from xarray.core.ops import NAN_REDUCE_METHODS from xarray import DataArray from . 
import TestCase, raises_regex -from xarray.testing import assert_allclose class TestOps(TestCase): @@ -117,7 +115,7 @@ def test_types(self, val1, val2, val3, null): def construct_dataarray(dtype, contains_nan, dask): rng = np.random.RandomState(0) da = DataArray(rng.randn(15, 30), dims=('x', 'y'), - coords={'x': np.arange(15)}).astype(dtype) + coords={'x': np.arange(15)}, name='da').astype(dtype) if contains_nan: da = da.reindex(x=np.arange(20)) @@ -129,31 +127,39 @@ def construct_dataarray(dtype, contains_nan, dask): def assert_allclose_with_nan(a, b, **kwargs): """ Extension of np.allclose with nan-including array """ - index = ~np.isnan(a) - assert index == ~np.isnan(b) - assert np.allclose(a[index], b[index], **kwargs) + for a1, b1 in zip(a.ravel(), b.ravel()): + assert (np.isnan(a1) and np.isnan(b1)) or np.allclose(a1, b1, + **kwargs) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) @pytest.mark.parametrize('func', ['sum', 'min', 'max']) # TODO support more @pytest.mark.parametrize('skipna', [False, True]) -@pytest.mark.parametrize('dim', [None, 'x', ('x', 'y')]) +@pytest.mark.parametrize('dim', [None, 'x', 'y']) def test_reduce(dtype, dask, func, skipna, dim): da = construct_dataarray(dtype, contains_nan=True, dask=dask) + axis = None if dim is None else da.get_axis_num(dim) if skipna: try: # TODO currently, we only support methods that numpy supports - expected = getattr(np, 'nan{}'.format(func))(da.values) + expected = getattr(np, 'nan{}'.format(func))(da.values, + axis=axis) except (TypeError, AttributeError): with pytest.raises(NotImplementedError): - actual = getattr(da, func)(skipna=skipna) + actual = getattr(da, func)(skipna=skipna, dim=dim) return else: - expected = getattr(np, func)(da.values) + expected = getattr(np, func)(da.values, axis=axis) + actual = getattr(da, func)(skipna=skipna, dim=dim) + assert_allclose_with_nan(actual.values, np.array(expected)) + + # compatible with pandas + se = da.to_dataframe() actual = getattr(da, func)(skipna=skipna) + expected = getattr(se, func)(skipna=skipna) assert_allclose_with_nan(actual.values, np.array(expected)) # without nan From 9fa0812efb6b6bedfc4e9b15698b402534f0adec Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 30 Jan 2018 17:33:29 +0900 Subject: [PATCH 44/73] Delete duplicate file. 
flake8 --- xarray/core/duck_array_ops.py | 4 +- xarray/tests/test_duck_array_op.py | 71 ------------------------------ 2 files changed, 2 insertions(+), 73 deletions(-) delete mode 100644 xarray/tests/test_duck_array_op.py diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 8e3aa985047..73e684c91ae 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -191,8 +191,8 @@ def f(values, axis=None, skipna=None, **kwargs): values = values.astype(object) if (skipna or (skipna is None and values.dtype.kind in 'cf') or - (skipna is None and values.dtype == object - and support_object_type)): + (skipna is None and values.dtype == object and + support_object_type)): nanname = 'nan' + name if values.dtype.kind not in ['u', 'i', 'f', 'c']: diff --git a/xarray/tests/test_duck_array_op.py b/xarray/tests/test_duck_array_op.py deleted file mode 100644 index 9082de0c4f0..00000000000 --- a/xarray/tests/test_duck_array_op.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import numpy as np -import pandas as pd -import pickle -import pytest -from copy import deepcopy -from textwrap import dedent -from distutils.version import LooseVersion - -import xarray as xr - -from xarray import (align, broadcast, Dataset, DataArray, - IndexVariable, Variable) -from xarray.coding.times import CFDatetimeCoder -from xarray.core.pycompat import iteritems, OrderedDict -from xarray.core.common import full_like -from xarray.tests import ( - TestCase, ReturnItem, source_ndarray, unittest, requires_dask, - assert_identical, assert_equal, assert_allclose, assert_array_equal, - raises_regex, requires_scipy, requires_bottleneck) -from xarray.core.ops import NAN_REDUCE_METHODS - - -def construct_dataarray(dtype, contains_nan, dask): - da = DataArray(np.random.randn(15, 30), dims=('x', 'y'), - coords={'x': np.arange(15)}).astype(dtype) - - if contains_nan: - da = da.reindex(x=np.arange(20)) - if dask: - da = da.chunk({'x': 5, 'y': 10}) - - return da - - -def assert_allclose_with_nan(a, b, **kwargs): - """ Extension of np.allclose with nan-including array """ - index = ~np.isnan(a) - print(a) - print(b) - assert index == ~np.isnan(b) - assert np.allclose(a[index], b[index], **kwargs) - - -@pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) -@pytest.mark.parametrize('contains_nan', [False, True]) -@pytest.mark.parametrize('dask', [False, ]) -@pytest.mark.parametrize('func', NAN_REDUCE_METHODS) -@pytest.mark.parametrize('skipna', [False, True]) -@pytest.mark.parametrize('dim', [None, 'x', 'y']) -def test_reduce(dtype, contains_nan, dask, func, skipna, dim): - if dask: # TODO some reduce methods are not available for dask - if func in ['sum']: - return - - da = construct_dataarray(dtype, contains_nan, dask) - - if skipna: - try: # TODO currently, we only support methods that numpy supports - expected = getattr(np, 'nan{}'.format(func))(da.values) - except TypeError: - with pytest.raises(NotImplementedError): - actual = getattr(da, func)(skipna=skipna) - return - else: - expected = getattr(np, func)(da.values) - - actual = getattr(da, func)(skipna=skipna) - assert_allclose_with_nan(actual.values, np.array(expected)) From 0c1d49a3d1fcb113208500522e1357efb1f643ff Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 30 Jan 2018 17:53:39 +0900 Subject: [PATCH 45/73] flake8 again --- xarray/core/duck_array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 73e684c91ae..a07ed62badd 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -223,7 +223,7 @@ def f(values, axis=None, skipna=None, **kwargs): with _ignore_warnings_if(using_numpy_nan_func): try: return func(values, axis=axis, **kwargs) - except AttributeError as e: + except AttributeError: if isinstance(values, dask_array_type): msg = '%s is not yet implemented on dask arrays' % name else: From 9463937a6fa92aeaacde4d635dab30d03a0bf00b Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Tue, 30 Jan 2018 20:17:07 +0900 Subject: [PATCH 46/73] Working with numpy<1.13 --- xarray/core/duck_array_ops.py | 8 +- xarray/core/npcompat.py | 316 +++++++++++++++++++++++++++- xarray/tests/test_dataarray.py | 3 +- xarray/tests/test_duck_array_ops.py | 16 +- 4 files changed, 331 insertions(+), 12 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a07ed62badd..dddcfa5d340 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -200,7 +200,7 @@ def f(values, axis=None, skipna=None, **kwargs): raise NotImplementedError( 'skipna=True not yet implemented for %s with dtype %s' % (name, values.dtype)) - eager_module = np + eager_module = npcompat if np_compat else np elif (isinstance(axis, tuple) or not values.dtype.isnative or no_bottleneck or @@ -240,11 +240,11 @@ def f(values, axis=None, skipna=None, **kwargs): argmax = _create_nan_agg_method('argmax', coerce_strings=True) argmin = _create_nan_agg_method('argmin', coerce_strings=True) -max = _create_nan_agg_method('max', coerce_strings=True, +max = _create_nan_agg_method('max', coerce_strings=True, np_compat=True, support_object_type=True) -min = _create_nan_agg_method('min', coerce_strings=True, +min = _create_nan_agg_method('min', coerce_strings=True, np_compat=True, support_object_type=True) -sum = _create_nan_agg_method('sum', numeric_only=True, +sum = _create_nan_agg_method('sum', numeric_only=True, np_compat=True, support_object_type=True) mean = _create_nan_agg_method('mean', numeric_only=True) std = _create_nan_agg_method('std', numeric_only=True) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 576241feea3..bb35aea86eb 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -5,14 +5,326 @@ from distutils.version import LooseVersion -if LooseVersion(np.__version__) < LooseVersion('1.12'): +if LooseVersion(np.__version__) >= LooseVersion('1.12'): + as_strided = np.lib.stride_tricks.as_strided +else: def as_strided(x, shape=None, strides=None, subok=False, writeable=True): array = np.lib.stride_tricks.as_strided(x, shape, strides, subok) array.setflags(write=writeable) return array + +if LooseVersion(np.__version__) >= LooseVersion('1.13'): + nanmin = np.nanmin + nanmax = np.nanmax + nansum = np.nansum + else: - as_strided = np.lib.stride_tricks.as_strided + def _replace_nan(a, val): + """ + If `a` is of inexact type, make a copy of `a`, replace NaNs with + the `val` value, and return the copy together with a boolean mask + marking the locations where NaNs were present. If `a` is not of + inexact type, do nothing and return `a` together with a mask of None. + Note that scalars will end up as array scalars, which is important + for using the result as the value of the out argument in some + operations. + Parameters + ---------- + a : array-like + Input array. + val : float + NaN values are set to val before doing the operation. 
+ Returns + ------- + y : ndarray + If `a` is of inexact type, return a copy of `a` with the NaNs + replaced by the fill value, otherwise return `a`. + mask: {bool, None} + If `a` is of inexact type, return a boolean mask marking locations of + NaNs, otherwise return None. + """ + a = np.array(a, subok=True, copy=True) + + if a.dtype == np.object_: + # object arrays do not support `isnan` (gh-9009), so make a guess + mask = a != a + elif issubclass(a.dtype.type, np.inexact): + mask = np.isnan(a) + else: + mask = None + + if mask is not None: + np.copyto(a, val, where=mask) + + return a, mask + + + def nanmin(a, axis=None, out=None, keepdims=np._NoValue): + """ + Return minimum of an array or minimum along an axis, ignoring any NaNs. + When all-NaN slices are encountered a ``RuntimeWarning`` is raised and + Nan is returned for that slice. + Parameters + ---------- + a : array_like + Array containing numbers whose minimum is desired. If `a` is not an + array, a conversion is attempted. + axis : int, optional + Axis along which the minimum is computed. The default is to compute + the minimum of the flattened array. + out : ndarray, optional + Alternate output array in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + .. versionadded:: 1.8.0 + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + If the value is anything but the default, then + `keepdims` will be passed through to the `min` method + of sub-classes of `ndarray`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + .. versionadded:: 1.8.0 + Returns + ------- + nanmin : ndarray + An array with the same shape as `a`, with the specified axis + removed. If `a` is a 0-d array, or if axis is None, an ndarray + scalar is returned. The same dtype as `a` is returned. + See Also + -------- + nanmax : + The maximum value of an array along a given axis, ignoring any NaNs. + amin : + The minimum value of an array along a given axis, propagating any NaNs. + fmin : + Element-wise minimum of two arrays, ignoring any NaNs. + minimum : + Element-wise minimum of two arrays, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + amax, fmax, maximum + Notes + ----- + NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + If the input has a integer type the function is equivalent to np.min. 
+ Examples + -------- + >>> a = np.array([[1, 2], [3, np.nan]]) + >>> np.nanmin(a) + 1.0 + >>> np.nanmin(a, axis=0) + array([ 1., 2.]) + >>> np.nanmin(a, axis=1) + array([ 1., 3.]) + When positive infinity and negative infinity are present: + >>> np.nanmin([1, 2, np.nan, np.inf]) + 1.0 + >>> np.nanmin([1, 2, np.nan, np.NINF]) + -inf + """ + kwargs = {} + if keepdims is not np._NoValue: + kwargs['keepdims'] = keepdims + if type(a) is np.ndarray and a.dtype != np.object_: + # Fast, but not safe for subclasses of ndarray, or object arrays, + # which do not implement isnan (gh-9009), or fmin correctly (gh-8975) + res = np.fmin.reduce(a, axis=axis, out=out, **kwargs) + if np.isnan(res).any(): + warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=2) + else: + # Slow, but safe for subclasses of ndarray + a, mask = _replace_nan(a, +np.inf) + res = np.amin(a, axis=axis, out=out, **kwargs) + if mask is None: + return res + + # Check for all-NaN axis + mask = np.all(mask, axis=axis, **kwargs) + if np.any(mask): + res = _copyto(res, np.nan, mask) + warnings.warn("All-NaN axis encountered", RuntimeWarning, stacklevel=2) + return res + + + def nanmax(a, axis=None, out=None, keepdims=np._NoValue): + """ + Return the maximum of an array or maximum along an axis, ignoring any + NaNs. When all-NaN slices are encountered a ``RuntimeWarning`` is + raised and NaN is returned for that slice. + Parameters + ---------- + a : array_like + Array containing numbers whose maximum is desired. If `a` is not an + array, a conversion is attempted. + axis : int, optional + Axis along which the maximum is computed. The default is to compute + the maximum of the flattened array. + out : ndarray, optional + Alternate output array in which to place the result. The default + is ``None``; if provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. + .. versionadded:: 1.8.0 + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + If the value is anything but the default, then + `keepdims` will be passed through to the `max` method + of sub-classes of `ndarray`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + .. versionadded:: 1.8.0 + Returns + ------- + nanmax : ndarray + An array with the same shape as `a`, with the specified axis removed. + If `a` is a 0-d array, or if axis is None, an ndarray scalar is + returned. The same dtype as `a` is returned. + See Also + -------- + nanmin : + The minimum value of an array along a given axis, ignoring any NaNs. + amax : + The maximum value of an array along a given axis, propagating any NaNs. + fmax : + Element-wise maximum of two arrays, ignoring any NaNs. + maximum : + Element-wise maximum of two arrays, propagating any NaNs. + isnan : + Shows which elements are Not a Number (NaN). + isfinite: + Shows which elements are neither NaN nor infinity. + amin, fmin, minimum + Notes + ----- + NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic + (IEEE 754). This means that Not a Number is not equivalent to infinity. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. + If the input has a integer type the function is equivalent to np.max. 
+ Examples + -------- + >>> a = np.array([[1, 2], [3, np.nan]]) + >>> np.nanmax(a) + 3.0 + >>> np.nanmax(a, axis=0) + array([ 3., 2.]) + >>> np.nanmax(a, axis=1) + array([ 2., 3.]) + When positive infinity and negative infinity are present: + >>> np.nanmax([1, 2, np.nan, np.NINF]) + 2.0 + >>> np.nanmax([1, 2, np.nan, np.inf]) + inf + """ + kwargs = {} + if keepdims is not np._NoValue: + kwargs['keepdims'] = keepdims + if type(a) is np.ndarray and a.dtype != np.object_: + # Fast, but not safe for subclasses of ndarray, or object arrays, + # which do not implement isnan (gh-9009), or fmax correctly (gh-8975) + res = np.fmax.reduce(a, axis=axis, out=out, **kwargs) + if np.isnan(res).any(): + warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=2) + else: + # Slow, but safe for subclasses of ndarray + a, mask = _replace_nan(a, -np.inf) + res = np.amax(a, axis=axis, out=out, **kwargs) + if mask is None: + return res + + # Check for all-NaN axis + mask = np.all(mask, axis=axis, **kwargs) + if np.any(mask): + res = _copyto(res, np.nan, mask) + warnings.warn("All-NaN axis encountered", RuntimeWarning, stacklevel=2) + return res + + def nansum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): + """ + Return the sum of array elements over a given axis treating Not a + Numbers (NaNs) as zero. + In NumPy versions <= 1.8.0 Nan is returned for slices that are all-NaN or + empty. In later versions zero is returned. + Parameters + ---------- + a : array_like + Array containing numbers whose sum is desired. If `a` is not an + array, a conversion is attempted. + axis : int, optional + Axis along which the sum is computed. The default is to compute the + sum of the flattened array. + dtype : data-type, optional + The type of the returned array and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. + .. versionadded:: 1.8.0 + out : ndarray, optional + Alternate output array in which to place the result. The default + is ``None``. If provided, it must have the same shape as the + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. + .. versionadded:: 1.8.0 + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the original `a`. + If the value is anything but the default, then + `keepdims` will be passed through to the `mean` or `sum` methods + of sub-classes of `ndarray`. If the sub-classes methods + does not implement `keepdims` any exceptions will be raised. + .. versionadded:: 1.8.0 + Returns + ------- + nansum : ndarray. + A new array holding the result is returned unless `out` is + specified, in which it is returned. The result has the same + size as `a`, and the same shape as `a` if `axis` is not None + or `a` is a 1-d array. + See Also + -------- + numpy.sum : Sum across array propagating NaNs. + isnan : Show which elements are NaN. + isfinite: Show which elements are not NaN or +/-inf. + Notes + ----- + If both positive and negative infinity are present, the sum will be Not + A Number (NaN). 
+ Examples + -------- + >>> np.nansum(1) + 1 + >>> np.nansum([1]) + 1 + >>> np.nansum([1, np.nan]) + 1.0 + >>> a = np.array([[1, 1], [1, np.nan]]) + >>> np.nansum(a) + 3.0 + >>> np.nansum(a, axis=0) + array([ 2., 1.]) + >>> np.nansum([1, np.nan, np.inf]) + inf + >>> np.nansum([1, np.nan, np.NINF]) + -inf + >>> np.nansum([1, np.nan, np.inf, -np.inf]) # both +/- infinity present + nan + """ + a, mask = _replace_nan(a, 0) + return np.sum(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims) try: diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 54f4918563b..94736fb0def 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -16,6 +16,7 @@ from xarray.coding.times import CFDatetimeCoder from xarray.core.pycompat import iteritems, OrderedDict from xarray.core.common import full_like +from xarray.core import npcompat from xarray.tests import ( TestCase, ReturnItem, source_ndarray, unittest, requires_dask, assert_identical, assert_equal, assert_allclose, assert_array_equal, @@ -3480,7 +3481,7 @@ def test_rolling_reduce_nonnumeric(center, min_periods, window, name): min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(np, 'nan%s' % name)) + actual = rolling_obj.reduce(getattr(npcompat, 'nan%s' % name)) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.dims == expected.dims diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index cea985fbaf8..a5de44cc403 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -9,8 +9,9 @@ first, last, count, mean, array_notnull_equiv, ) from xarray import DataArray +from xarray.core import npcompat -from . import TestCase, raises_regex +from . 
import TestCase, raises_regex, has_dask class TestOps(TestCase): @@ -119,7 +120,7 @@ def construct_dataarray(dtype, contains_nan, dask): if contains_nan: da = da.reindex(x=np.arange(20)) - if dask: + if dask and has_dask: da = da.chunk({'x': 5, 'y': 10}) return da @@ -142,10 +143,13 @@ def test_reduce(dtype, dask, func, skipna, dim): da = construct_dataarray(dtype, contains_nan=True, dask=dask) axis = None if dim is None else da.get_axis_num(dim) + if dask and not has_dask: + return + if skipna: try: # TODO currently, we only support methods that numpy supports - expected = getattr(np, 'nan{}'.format(func))(da.values, - axis=axis) + expected = getattr(npcompat, 'nan{}'.format(func))(da.values, + axis=axis) except (TypeError, AttributeError): with pytest.raises(NotImplementedError): actual = getattr(da, func)(skipna=skipna, dim=dim) @@ -158,6 +162,8 @@ def test_reduce(dtype, dask, func, skipna, dim): # compatible with pandas se = da.to_dataframe() + print(da) + print(da.reduce(npcompat.nansum)) actual = getattr(da, func)(skipna=skipna) expected = getattr(se, func)(skipna=skipna) assert_allclose_with_nan(actual.values, np.array(expected)) @@ -166,4 +172,4 @@ def test_reduce(dtype, dask, func, skipna, dim): da = construct_dataarray(dtype, contains_nan=False, dask=dask) expected = getattr(np, 'nan{}'.format(func))(da.values) actual = getattr(da, func)(skipna=skipna) - assert np.allclose(actual.values, np.array(expected), atol=1.0e-10) + assert np.allclose(actual.values, np.array(expected)) From dce4e37295a6fdb7e0c3eba541c630070118606e Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 10 Feb 2018 18:37:59 +0900 Subject: [PATCH 47/73] Revert "Classize benchmarks/indexing.py" This reverts commit 4189d71d9998ff83deb9a5d7035a2edaf628ae25. --- asv_bench/benchmarks/__init__.py | 1 + asv_bench/benchmarks/indexing.py | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index dd00ffbb037..d84eda387be 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function import itertools +import random import numpy as np diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 0887fb52f57..e9a85115a49 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -6,7 +6,7 @@ import pandas as pd import xarray as xr -from . import parameterized, randn, randint, requires_dask +from . 
import randn, randint, requires_dask nx = 3000 @@ -61,7 +61,7 @@ class Base(object): - def setup(self, *args, **kwargs): + def setup(self, key): self.ds = xr.Dataset( {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), 'var2': (('x', 't'), randn((nx, nt))), @@ -73,41 +73,53 @@ def setup(self, *args, **kwargs): class Indexing(Base): - @parameterized(['key'], (list(basic_indexes.keys()))) def time_indexing_basic(self, key): self.ds.isel(**basic_indexes[key]).load() - @parameterized(['key'], (list(outer_indexes.keys()))) + time_indexing_basic.param_names = ['key'] + time_indexing_basic.params = [list(basic_indexes.keys())] + def time_indexing_outer(self, key): self.ds.isel(**outer_indexes[key]).load() - @parameterized(['key'], (list(vectorized_indexes.keys()))) + time_indexing_outer.param_names = ['key'] + time_indexing_outer.params = [list(outer_indexes.keys())] + def time_indexing_vectorized(self, key): self.ds.isel(**vectorized_indexes[key]).load() + time_indexing_vectorized.param_names = ['key'] + time_indexing_vectorized.params = [list(vectorized_indexes.keys())] + class Assignment(Base): - @parameterized(['key'], (list(basic_indexes.keys()))) def time_assignment_basic(self, key): ind = basic_indexes[key] val = basic_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val - @parameterized(['key'], (list(outer_indexes.keys()))) + time_assignment_basic.param_names = ['key'] + time_assignment_basic.params = [list(basic_indexes.keys())] + def time_assignment_outer(self, key): ind = outer_indexes[key] val = outer_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val - @parameterized(['key'], (list(vectorized_indexes.keys()))) + time_assignment_outer.param_names = ['key'] + time_assignment_outer.params = [list(outer_indexes.keys())] + def time_assignment_vectorized(self, key): ind = vectorized_indexes[key] val = vectorized_assignment_values[key] self.ds['var1'][ind.get('x', slice(None)), ind.get('y', slice(None))] = val + time_assignment_vectorized.param_names = ['key'] + time_assignment_vectorized.params = [list(vectorized_indexes.keys())] + class IndexingDask(Indexing): def setup(self, key): From b3050cb98919584c3a89cd17a4e062eb91a0c6a2 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sat, 10 Feb 2018 23:53:10 +0900 Subject: [PATCH 48/73] rolling_window with dask.ghost --- xarray/core/dask_array_ops.py | 41 +++++++++++++++++++++++++++++ xarray/core/duck_array_ops.py | 13 ++------- xarray/core/variable.py | 17 +++++++----- xarray/tests/test_duck_array_ops.py | 24 ++++++++++++++--- xarray/tests/test_nputils.py | 1 + xarray/tests/test_variable.py | 1 + 6 files changed, 77 insertions(+), 20 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 3aefd114517..c8ea36f2818 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,6 +1,7 @@ """Define core operations for xarray objects. """ import numpy as np +from . 
import nputils try: import dask.array as da @@ -24,3 +25,43 @@ def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): # trim array result = da.ghost.trim_internal(out, depth) return result + + +def rolling_window(a, window, axis=-1): + """ Dask's equivalence to np.utils.rolling_window """ + # inputs for ghost + if axis < 0: + axis = a.ndim + axis + depth = {d: 0 for d in range(a.ndim)} + if window % 2 == 0: + depth[axis] = int((window - 1) / 2 + 1) + offset = 1 + else: + depth[axis] = int((window - 1) / 2) + offset = 0 + + if depth[axis] > min(a.chunks[axis]): + raise ValueError( + "The window size %d is larger than your\n" + "smallest chunk size %d + 1. Rechunk your array\n" + "with a larger chunk size or a chunk size that\n" + "more evenly divides the shape of your array." % + (window, min(a.chunks[axis]))) + + boundary = {d: np.nan for d in range(a.ndim)} + # create ghosted arrays + ag = da.ghost.ghost(a, depth=depth, boundary=boundary) + # apply rolling func + def func(x, window, axis=-1): + x = np.asarray(x) + rolling = nputils.rolling_window(x, window, axis) + return rolling[(slice(None), ) * axis + (slice(offset, None), )] + + chunks = list(a.chunks) + chunks.append(window) + out = ag.map_blocks(func, dtype=a.dtype, new_axis=a.ndim, chunks=chunks, + window=window, axis=axis) + # crop the edge points + index = (slice(None),) * axis + (slice(depth[axis] - offset, + - depth[axis]),) + return out[index] diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index dddcfa5d340..3efb923971e 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -18,6 +18,7 @@ from . import npcompat from . import nputils from . import dtypes +from . import dask_array_ops from .pycompat import dask_array_type from .nputils import nanfirst, nanlast @@ -287,16 +288,6 @@ def rolling_window(array, window, axis=-1): The rolling dimension will be placed at the last dimension. """ if isinstance(array, dask_array_type): - if window < 1: - raise ValueError( - "`window` must be at least 1. Given : {}".format(window)) - if window > array.shape[axis]: - raise ValueError("`window` is too long. 
Given : {}".format(window)) - - axis = nputils._validate_axis(array, axis) - size = array.shape[axis] - window + 1 - arrays = [array[(slice(None), ) * axis + (slice(w, size + w), )] - for w in range(window)] - return da.stack(arrays, axis=-1) + return dask_array_ops.rolling_window(array, window, axis=axis) else: # np.ndarray return nputils.rolling_window(array, window, axis=axis) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 38d3b00158d..0af6c418b8b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -965,12 +965,17 @@ def pad_with_fill_value(self, fill_value=dtypes.NA, **pad_widths): after_shape[axis] = pad[1] after_chunks = list(array.chunks) after_chunks[axis] = (pad[1], ) - array = duck_array_ops.concatenate([ - da.full(before_shape, fill_value, dtype=dtype, - chunks=before_chunks), - array, - da.full(after_shape, fill_value, dtype=dtype, - chunks=after_chunks)], axis=axis) + + arrays = [] + if pad[0] > 0: + arrays.append(da.full(before_shape, fill_value, + dtype=dtype, chunks=before_chunks)) + arrays.append(array) + if pad[1] > 0: + arrays.append(da.full(after_shape, fill_value, + dtype=dtype, chunks=after_chunks)) + if len(arrays) > 1: + array = da.concatenate(arrays, axis=axis) else: pads = [(0, 0) if d not in pad_widths else pad_widths[d] for d in self.dims] diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index a5de44cc403..af49dc28172 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -6,13 +6,19 @@ from numpy import array, nan from . import assert_array_equal from xarray.core.duck_array_ops import ( - first, last, count, mean, array_notnull_equiv, + first, last, count, mean, array_notnull_equiv, rolling_window ) from xarray import DataArray from xarray.core import npcompat +from . import requires_dask from . 
import TestCase, raises_regex, has_dask +try: + import dask.array as da +except ImportError: + pass + class TestOps(TestCase): def setUp(self): @@ -162,8 +168,6 @@ def test_reduce(dtype, dask, func, skipna, dim): # compatible with pandas se = da.to_dataframe() - print(da) - print(da.reduce(npcompat.nansum)) actual = getattr(da, func)(skipna=skipna) expected = getattr(se, func)(skipna=skipna) assert_allclose_with_nan(actual.values, np.array(expected)) @@ -173,3 +177,17 @@ def test_reduce(dtype, dask, func, skipna, dim): expected = getattr(np, 'nan{}'.format(func))(da.values) actual = getattr(da, func)(skipna=skipna) assert np.allclose(actual.values, np.array(expected)) + + +@requires_dask +@pytest.mark.parametrize('axis', [0, -1]) +@pytest.mark.parametrize('window', [3, 8, 11]) +def test_dask_rolling(axis, window): + x = np.array(np.random.randn(100, 40), dtype=float) + dx = da.from_array(x, chunks=[(6, 30, 30, 20, 14), 8]) + + expected = rolling_window(x, axis=axis, window=window) + actual = rolling_window(dx, axis=axis, window=window) + assert isinstance(actual, da.Array) + assert_array_equal(actual, expected) + assert actual.shape == expected.shape diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 9821b0c0ad3..87c2ecf1655 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -3,6 +3,7 @@ from xarray.core.nputils import (_is_contiguous, NumpyVIndexAdapter, rolling_window) +from xarray.core import dask_array_ops def test_is_contiguous(): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 8507d434de4..3485124c3ac 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -739,6 +739,7 @@ def test_pad(self): expected = np.pad(np.array(v.data.astype(float)), np_arg, mode='constant', constant_values=np.nan) assert_array_equal(actual, expected) + assert type(actual._data) == type(v._data) # for the boolean array, we pad False data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) From d3b1e2bbdf26016eabe6c7ca357488e2eb970abd Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 08:38:50 +0900 Subject: [PATCH 49/73] Optimize rolling.count. --- xarray/core/dask_array_ops.py | 1 + xarray/core/rolling.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index c8ea36f2818..c0793739c4b 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -51,6 +51,7 @@ def rolling_window(a, window, axis=-1): boundary = {d: np.nan for d in range(a.ndim)} # create ghosted arrays ag = da.ghost.ghost(a, depth=depth, boundary=boundary) + # apply rolling func def func(x, window, axis=-1): x = np.asarray(x) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 31de7b01a6b..d1d7ec2160f 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -12,6 +12,24 @@ from . import dtypes +def _get_new_dimname(dims, new_dim): + """ Get an new dimension name based on new_dim, that is not used in dims. + If the same name exists, we add an underscore(s) in the head. + + Example1: + dims: ['a', 'b', 'c'] + new_dim: ['_rolling'] + -> ['_rolling'] + Example2: + dims: ['a', 'b', 'c', '_rolling'] + new_dim: ['_rolling'] + -> ['__rolling'] + """ + while new_dim in dims: + new_dim = '_' + new_dim + return new_dim + + class Rolling(object): """A object that implements the moving window pattern. 
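# The hunks below rework ``reduce`` and ``_counts`` around a temporary,
# collision-free window dimension: reduce over the window axis, count the
# valid (non-NaN) points per window on a cheap boolean view, and mask out
# windows with too few points.  The snippet that follows is only a
# plain-numpy sketch of that idea, not code from this patch;
# ``sliding_window_view`` (a newer numpy convenience) and every variable
# name here are illustrative stand-ins for the strided helpers used above.
import numpy as np

x = np.array([1.0, np.nan, np.nan, 3.0, 4.0, 5.0])
window, min_periods = 3, 2

valid = np.isfinite(x)                       # boolean mask of the source data
# windowed views: the window is laid out along the last axis, no data copied
win_x = np.lib.stride_tricks.sliding_window_view(x, window)
win_valid = np.lib.stride_tricks.sliding_window_view(valid, window)

counts = win_valid.sum(axis=-1)              # summing booleans skips NaN handling
sums = np.nansum(win_x, axis=-1)             # the actual windowed reduction
result = np.where(counts >= min_periods, sums, np.nan)
# result: [nan, nan, 7., 12.] since the first two windows hold < 2 valid points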
@@ -228,8 +246,9 @@ def reduce(self, func, **kwargs): reduced : DataArray Array with summarized data. """ - windows = self.to_dataarray('_rolling_window_dim') - result = windows.reduce(func, dim='_rolling_window_dim', **kwargs) + rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim') + windows = self.to_dataarray(rolling_dim) + result = windows.reduce(func, dim=rolling_dim, **kwargs) # Find valid windows based on count. counts = self._counts() @@ -237,10 +256,16 @@ def reduce(self, func, **kwargs): def _counts(self): """ Number of non-nan entries in each rolling window. """ + + rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim') + # We use False as the fill_value instead of np.nan, since boolean + # array is faster to be reduced than object array. + # The use of skipna==False is also faster since it does not need to + # copy the strided array. counts = (self.obj.notnull() .rolling(center=self.center, **{self.dim: self.window}) - .to_dataarray('_rolling_window_dim', fill_value=False) - .sum(dim='_rolling_window_dim')) + .to_dataarray(rolling_dim, fill_value=False) + .sum(dim=rolling_dim, skipna=False)) return counts @classmethod From 734da93a7e01b84d5794888acbb619507ce0bdab Mon Sep 17 00:00:00 2001 From: stickler-ci Date: Thu, 15 Feb 2018 23:51:36 +0000 Subject: [PATCH 50/73] Fixing style errors. --- xarray/core/npcompat.py | 14 ++++++++------ xarray/tests/test_dataarray.py | 12 ++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index bb35aea86eb..f74ec08c87d 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -59,7 +59,6 @@ def _replace_nan(a, val): return a, mask - def nanmin(a, axis=None, out=None, keepdims=np._NoValue): """ Return minimum of an array or minimum along an axis, ignoring any NaNs. 
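# The hunk just below only re-wraps over-long ``warnings.warn`` calls for
# style, but the function it touches is the vendored ``nanmin`` whose
# object-dtype fallback cannot use ``np.isnan`` (the gh-9009 note in this
# file): ``_replace_nan`` detects NaNs via the ``x != x`` property and
# substitutes the reduction's identity before calling plain ``np.amin``.
# A small standalone sketch of that trick, with illustrative values only:
import numpy as np

a = np.array([1, np.nan, 3], dtype=object)
mask = a != a                  # elementwise; only NaN compares unequal to itself
filled = a.copy()
filled[mask] = np.inf          # +inf is the identity element for a minimum
assert list(mask) == [False, True, False]
assert np.amin(filled) == 1    # NaN no longer disturbs the ordinary reduction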
@@ -139,7 +138,8 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue): # which do not implement isnan (gh-9009), or fmin correctly (gh-8975) res = np.fmin.reduce(a, axis=axis, out=out, **kwargs) if np.isnan(res).any(): - warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=2) + warnings.warn("All-NaN slice encountered", + RuntimeWarning, stacklevel=2) else: # Slow, but safe for subclasses of ndarray a, mask = _replace_nan(a, +np.inf) @@ -151,10 +151,10 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue): mask = np.all(mask, axis=axis, **kwargs) if np.any(mask): res = _copyto(res, np.nan, mask) - warnings.warn("All-NaN axis encountered", RuntimeWarning, stacklevel=2) + warnings.warn("All-NaN axis encountered", + RuntimeWarning, stacklevel=2) return res - def nanmax(a, axis=None, out=None, keepdims=np._NoValue): """ Return the maximum of an array or maximum along an axis, ignoring any @@ -234,7 +234,8 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue): # which do not implement isnan (gh-9009), or fmax correctly (gh-8975) res = np.fmax.reduce(a, axis=axis, out=out, **kwargs) if np.isnan(res).any(): - warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=2) + warnings.warn("All-NaN slice encountered", + RuntimeWarning, stacklevel=2) else: # Slow, but safe for subclasses of ndarray a, mask = _replace_nan(a, -np.inf) @@ -246,7 +247,8 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue): mask = np.all(mask, axis=axis, **kwargs) if np.any(mask): res = _copyto(res, np.nan, mask) - warnings.warn("All-NaN axis encountered", RuntimeWarning, stacklevel=2) + warnings.warn("All-NaN axis encountered", + RuntimeWarning, stacklevel=2) return res def nansum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c8b212a9518..f33df0b20cf 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3502,12 +3502,12 @@ def test_rolling_count_correct(): {'time': 11, 'min_periods': None}, {'time': 7, 'min_periods': 2}] expecteds = [DataArray( - [1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8], dims='time'), - DataArray( - [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan, np.nan], dims='time'), - DataArray( - [np.nan, np.nan, 2, 3, 3, 4, 5, 5, 5, 5, 5], dims='time')] + [1, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8], dims='time'), + DataArray( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], dims='time'), + DataArray( + [np.nan, np.nan, 2, 3, 3, 4, 5, 5, 5, 5, 5], dims='time')] for kwarg, expected in zip(kwargs, expecteds): result = da.rolling(**kwarg).count() From 1a000b8da0554fda9c835b04634a96b62a88ba37 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 08:56:08 +0900 Subject: [PATCH 51/73] Remove unused npcompat.nansum etc --- xarray/core/npcompat.py | 315 ---------------------------------------- 1 file changed, 315 deletions(-) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index f74ec08c87d..5d15c011ba2 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -14,321 +14,6 @@ def as_strided(x, shape=None, strides=None, subok=False, writeable=True): return array -if LooseVersion(np.__version__) >= LooseVersion('1.13'): - nanmin = np.nanmin - nanmax = np.nanmax - nansum = np.nansum - -else: - def _replace_nan(a, val): - """ - If `a` is of inexact type, make a copy of `a`, replace NaNs with - the `val` value, and return the copy together with a 
boolean mask - marking the locations where NaNs were present. If `a` is not of - inexact type, do nothing and return `a` together with a mask of None. - Note that scalars will end up as array scalars, which is important - for using the result as the value of the out argument in some - operations. - Parameters - ---------- - a : array-like - Input array. - val : float - NaN values are set to val before doing the operation. - Returns - ------- - y : ndarray - If `a` is of inexact type, return a copy of `a` with the NaNs - replaced by the fill value, otherwise return `a`. - mask: {bool, None} - If `a` is of inexact type, return a boolean mask marking locations of - NaNs, otherwise return None. - """ - a = np.array(a, subok=True, copy=True) - - if a.dtype == np.object_: - # object arrays do not support `isnan` (gh-9009), so make a guess - mask = a != a - elif issubclass(a.dtype.type, np.inexact): - mask = np.isnan(a) - else: - mask = None - - if mask is not None: - np.copyto(a, val, where=mask) - - return a, mask - - def nanmin(a, axis=None, out=None, keepdims=np._NoValue): - """ - Return minimum of an array or minimum along an axis, ignoring any NaNs. - When all-NaN slices are encountered a ``RuntimeWarning`` is raised and - Nan is returned for that slice. - Parameters - ---------- - a : array_like - Array containing numbers whose minimum is desired. If `a` is not an - array, a conversion is attempted. - axis : int, optional - Axis along which the minimum is computed. The default is to compute - the minimum of the flattened array. - out : ndarray, optional - Alternate output array in which to place the result. The default - is ``None``; if provided, it must have the same shape as the - expected output, but the type will be cast if necessary. See - `doc.ufuncs` for details. - .. versionadded:: 1.8.0 - keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `a`. - If the value is anything but the default, then - `keepdims` will be passed through to the `min` method - of sub-classes of `ndarray`. If the sub-classes methods - does not implement `keepdims` any exceptions will be raised. - .. versionadded:: 1.8.0 - Returns - ------- - nanmin : ndarray - An array with the same shape as `a`, with the specified axis - removed. If `a` is a 0-d array, or if axis is None, an ndarray - scalar is returned. The same dtype as `a` is returned. - See Also - -------- - nanmax : - The maximum value of an array along a given axis, ignoring any NaNs. - amin : - The minimum value of an array along a given axis, propagating any NaNs. - fmin : - Element-wise minimum of two arrays, ignoring any NaNs. - minimum : - Element-wise minimum of two arrays, propagating any NaNs. - isnan : - Shows which elements are Not a Number (NaN). - isfinite: - Shows which elements are neither NaN nor infinity. - amax, fmax, maximum - Notes - ----- - NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic - (IEEE 754). This means that Not a Number is not equivalent to infinity. - Positive infinity is treated as a very large number and negative - infinity is treated as a very small (i.e. negative) number. - If the input has a integer type the function is equivalent to np.min. 
- Examples - -------- - >>> a = np.array([[1, 2], [3, np.nan]]) - >>> np.nanmin(a) - 1.0 - >>> np.nanmin(a, axis=0) - array([ 1., 2.]) - >>> np.nanmin(a, axis=1) - array([ 1., 3.]) - When positive infinity and negative infinity are present: - >>> np.nanmin([1, 2, np.nan, np.inf]) - 1.0 - >>> np.nanmin([1, 2, np.nan, np.NINF]) - -inf - """ - kwargs = {} - if keepdims is not np._NoValue: - kwargs['keepdims'] = keepdims - if type(a) is np.ndarray and a.dtype != np.object_: - # Fast, but not safe for subclasses of ndarray, or object arrays, - # which do not implement isnan (gh-9009), or fmin correctly (gh-8975) - res = np.fmin.reduce(a, axis=axis, out=out, **kwargs) - if np.isnan(res).any(): - warnings.warn("All-NaN slice encountered", - RuntimeWarning, stacklevel=2) - else: - # Slow, but safe for subclasses of ndarray - a, mask = _replace_nan(a, +np.inf) - res = np.amin(a, axis=axis, out=out, **kwargs) - if mask is None: - return res - - # Check for all-NaN axis - mask = np.all(mask, axis=axis, **kwargs) - if np.any(mask): - res = _copyto(res, np.nan, mask) - warnings.warn("All-NaN axis encountered", - RuntimeWarning, stacklevel=2) - return res - - def nanmax(a, axis=None, out=None, keepdims=np._NoValue): - """ - Return the maximum of an array or maximum along an axis, ignoring any - NaNs. When all-NaN slices are encountered a ``RuntimeWarning`` is - raised and NaN is returned for that slice. - Parameters - ---------- - a : array_like - Array containing numbers whose maximum is desired. If `a` is not an - array, a conversion is attempted. - axis : int, optional - Axis along which the maximum is computed. The default is to compute - the maximum of the flattened array. - out : ndarray, optional - Alternate output array in which to place the result. The default - is ``None``; if provided, it must have the same shape as the - expected output, but the type will be cast if necessary. See - `doc.ufuncs` for details. - .. versionadded:: 1.8.0 - keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `a`. - If the value is anything but the default, then - `keepdims` will be passed through to the `max` method - of sub-classes of `ndarray`. If the sub-classes methods - does not implement `keepdims` any exceptions will be raised. - .. versionadded:: 1.8.0 - Returns - ------- - nanmax : ndarray - An array with the same shape as `a`, with the specified axis removed. - If `a` is a 0-d array, or if axis is None, an ndarray scalar is - returned. The same dtype as `a` is returned. - See Also - -------- - nanmin : - The minimum value of an array along a given axis, ignoring any NaNs. - amax : - The maximum value of an array along a given axis, propagating any NaNs. - fmax : - Element-wise maximum of two arrays, ignoring any NaNs. - maximum : - Element-wise maximum of two arrays, propagating any NaNs. - isnan : - Shows which elements are Not a Number (NaN). - isfinite: - Shows which elements are neither NaN nor infinity. - amin, fmin, minimum - Notes - ----- - NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic - (IEEE 754). This means that Not a Number is not equivalent to infinity. - Positive infinity is treated as a very large number and negative - infinity is treated as a very small (i.e. negative) number. - If the input has a integer type the function is equivalent to np.max. 
- Examples - -------- - >>> a = np.array([[1, 2], [3, np.nan]]) - >>> np.nanmax(a) - 3.0 - >>> np.nanmax(a, axis=0) - array([ 3., 2.]) - >>> np.nanmax(a, axis=1) - array([ 2., 3.]) - When positive infinity and negative infinity are present: - >>> np.nanmax([1, 2, np.nan, np.NINF]) - 2.0 - >>> np.nanmax([1, 2, np.nan, np.inf]) - inf - """ - kwargs = {} - if keepdims is not np._NoValue: - kwargs['keepdims'] = keepdims - if type(a) is np.ndarray and a.dtype != np.object_: - # Fast, but not safe for subclasses of ndarray, or object arrays, - # which do not implement isnan (gh-9009), or fmax correctly (gh-8975) - res = np.fmax.reduce(a, axis=axis, out=out, **kwargs) - if np.isnan(res).any(): - warnings.warn("All-NaN slice encountered", - RuntimeWarning, stacklevel=2) - else: - # Slow, but safe for subclasses of ndarray - a, mask = _replace_nan(a, -np.inf) - res = np.amax(a, axis=axis, out=out, **kwargs) - if mask is None: - return res - - # Check for all-NaN axis - mask = np.all(mask, axis=axis, **kwargs) - if np.any(mask): - res = _copyto(res, np.nan, mask) - warnings.warn("All-NaN axis encountered", - RuntimeWarning, stacklevel=2) - return res - - def nansum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): - """ - Return the sum of array elements over a given axis treating Not a - Numbers (NaNs) as zero. - In NumPy versions <= 1.8.0 Nan is returned for slices that are all-NaN or - empty. In later versions zero is returned. - Parameters - ---------- - a : array_like - Array containing numbers whose sum is desired. If `a` is not an - array, a conversion is attempted. - axis : int, optional - Axis along which the sum is computed. The default is to compute the - sum of the flattened array. - dtype : data-type, optional - The type of the returned array and of the accumulator in which the - elements are summed. By default, the dtype of `a` is used. An - exception is when `a` has an integer type with less precision than - the platform (u)intp. In that case, the default will be either - (u)int32 or (u)int64 depending on whether the platform is 32 or 64 - bits. For inexact inputs, dtype must be inexact. - .. versionadded:: 1.8.0 - out : ndarray, optional - Alternate output array in which to place the result. The default - is ``None``. If provided, it must have the same shape as the - expected output, but the type will be cast if necessary. See - `doc.ufuncs` for details. The casting of NaN to integer can yield - unexpected results. - .. versionadded:: 1.8.0 - keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `a`. - If the value is anything but the default, then - `keepdims` will be passed through to the `mean` or `sum` methods - of sub-classes of `ndarray`. If the sub-classes methods - does not implement `keepdims` any exceptions will be raised. - .. versionadded:: 1.8.0 - Returns - ------- - nansum : ndarray. - A new array holding the result is returned unless `out` is - specified, in which it is returned. The result has the same - size as `a`, and the same shape as `a` if `axis` is not None - or `a` is a 1-d array. - See Also - -------- - numpy.sum : Sum across array propagating NaNs. - isnan : Show which elements are NaN. - isfinite: Show which elements are not NaN or +/-inf. - Notes - ----- - If both positive and negative infinity are present, the sum will be Not - A Number (NaN). 
- Examples - -------- - >>> np.nansum(1) - 1 - >>> np.nansum([1]) - 1 - >>> np.nansum([1, np.nan]) - 1.0 - >>> a = np.array([[1, 1], [1, np.nan]]) - >>> np.nansum(a) - 3.0 - >>> np.nansum(a, axis=0) - array([ 2., 1.]) - >>> np.nansum([1, np.nan, np.inf]) - inf - >>> np.nansum([1, np.nan, np.NINF]) - -inf - >>> np.nansum([1, np.nan, np.inf, -np.inf]) # both +/- infinity present - nan - """ - a, mask = _replace_nan(a, 0) - return np.sum(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims) - - try: from numpy import nancumsum, nancumprod, flip except ImportError: # pragma: no cover From 27ff67c8cb0d8a4b60fd028fcc8e0058287b6952 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 12:23:44 +0900 Subject: [PATCH 52/73] flake8 --- xarray/core/dask_array_ops.py | 5 +++-- xarray/tests/test_dataarray.py | 6 +++--- xarray/tests/test_nputils.py | 1 - xarray/tests/test_variable.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index c0793739c4b..63b95caab18 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,5 +1,6 @@ -"""Define core operations for xarray objects. -""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import numpy as np from . import nputils diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index f33df0b20cf..7a1c3ea7747 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2716,7 +2716,7 @@ def test_series_categorical_index(self): if not hasattr(pd, 'CategoricalIndex'): raise unittest.SkipTest('requires pandas with CategoricalIndex') - s = pd.Series(range(5), index=pd.CategoricalIndex(list('aabbc'))) + s = pd.Series(np.arange(5), index=pd.CategoricalIndex(list('aabbc'))) arr = DataArray(s) assert "'a'" in repr(arr) # should not error @@ -3403,7 +3403,7 @@ def test_rolling_wrapped_bottleneck_dask(da_dask, name, center, min_periods): @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) def test_rolling_pandas_compat(center, window, min_periods): - s = pd.Series(range(10)) + s = pd.Series(np.arange(10)) da = DataArray.from_series(s) if min_periods is not None and window < min_periods: @@ -3425,7 +3425,7 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) def test_rolling_to_dataarray(center, window): - s = pd.Series(range(10)) + s = pd.Series(np.arange(10)) da = DataArray.from_series(s) s_rolling = s.rolling(window, center=center, min_periods=1).mean() diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 87c2ecf1655..9821b0c0ad3 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -3,7 +3,6 @@ from xarray.core.nputils import (_is_contiguous, NumpyVIndexAdapter, rolling_window) -from xarray.core import dask_array_ops def test_is_contiguous(): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 44f75cd6ab1..96e14c6c50f 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -746,7 +746,7 @@ def test_pad(self): expected = np.pad(np.array(v.data.astype(float)), np_arg, mode='constant', constant_values=np.nan) assert_array_equal(actual, expected) - assert type(actual._data) == type(v._data) + assert isinstance(actual._data, type(v._data)) # for the boolean array, we pad False 
data = np.full_like(data, False, dtype=bool).reshape(4, 3, 2) From a2c714194021840bc620c24a9bb3239e405b801c Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Fri, 16 Feb 2018 15:34:52 +0900 Subject: [PATCH 53/73] require_dask -> has_dask --- xarray/core/duck_array_ops.py | 9 +++------ xarray/tests/test_duck_array_ops.py | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index f1e8b48edf9..ec78660a178 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -342,12 +342,9 @@ def f(values, axis=None, skipna=None, **kwargs): argmax = _create_nan_agg_method('argmax', coerce_strings=True) argmin = _create_nan_agg_method('argmin', coerce_strings=True) -max = _create_nan_agg_method('max', coerce_strings=True, np_compat=True, - support_object_type=True) -min = _create_nan_agg_method('min', coerce_strings=True, np_compat=True, - support_object_type=True) -sum = _create_nan_agg_method('sum', numeric_only=True, np_compat=True, - support_object_type=True) +max = _create_nan_agg_method('max', coerce_strings=True) +min = _create_nan_agg_method('min', coerce_strings=True) +sum = _create_nan_agg_method('sum', numeric_only=True) mean = _create_nan_agg_method('mean', numeric_only=True) std = _create_nan_agg_method('std', numeric_only=True) var = _create_nan_agg_method('var', numeric_only=True) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 2e1ced880c3..b63a96e1510 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -14,7 +14,6 @@ from xarray.testing import assert_allclose from xarray import concat -from . import requires_dask from . import TestCase, raises_regex, has_dask try: @@ -310,7 +309,7 @@ def test_argmin_max_error(): da.argmin(dim='y') -@requires_dask +@pytest.mark.skipif(not has_dask, reason='This is for dask.') @pytest.mark.parametrize('axis', [0, -1]) @pytest.mark.parametrize('window', [3, 8, 11]) def test_dask_rolling(axis, window): From 35dee9dbc9dc1f4b808458d6bed86e718b80a544 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 17:01:24 +0900 Subject: [PATCH 54/73] npcompat -> np --- xarray/tests/test_dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 7a1c3ea7747..45aaff40d47 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3487,7 +3487,7 @@ def test_rolling_reduce_nonnumeric(center, min_periods, window, name): min_periods=min_periods) # add nan prefix to numpy methods to get similar behavior as bottleneck - actual = rolling_obj.reduce(getattr(npcompat, 'nan%s' % name)) + actual = rolling_obj.reduce(getattr(np, 'nan%s' % name)) expected = getattr(rolling_obj, name)() assert_allclose(actual, expected) assert actual.dims == expected.dims From 137709f62c3b34c6550618dab649e14d95042aa4 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 17:26:10 +0900 Subject: [PATCH 55/73] flake8 --- xarray/tests/test_dataarray.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 45aaff40d47..c6df1e9057c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -16,7 +16,6 @@ from xarray.coding.times import CFDatetimeCoder from xarray.core.pycompat import iteritems, OrderedDict from xarray.core.common import full_like -from xarray.core import npcompat from xarray.tests 
import ( TestCase, ReturnItem, source_ndarray, unittest, requires_dask, assert_identical, assert_equal, assert_allclose, assert_array_equal, From cc82cdc091eebbf339f3d57973f605208c091250 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Feb 2018 17:45:11 +0900 Subject: [PATCH 56/73] Skip tests for old numpy. --- xarray/tests/test_dataarray.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index c6df1e9057c..d910bc7fee2 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3471,6 +3471,9 @@ def test_rolling_reduce(da, center, min_periods, window, name): assert actual.dims == expected.dims +@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.13'), + reason='Old numpy does not support nansum / nanmax for ' + 'object typed arrays.') @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('min_periods', (None, 1, 2, 3)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) From b24641133991e824135d97c1b73a3418849b75ca Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sat, 17 Feb 2018 19:31:20 +0900 Subject: [PATCH 57/73] Improve doc. Optmize missing._get_valid_fill_mask --- doc/computation.rst | 16 +++++++++++----- doc/whats-new.rst | 5 ++--- xarray/core/duck_array_ops.py | 2 +- xarray/core/missing.py | 7 ++++++- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index 1dd278f4e8e..a27d8f82021 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -182,12 +182,18 @@ windowed rolling, convolution, short-time FFT, etc. rolling_da = r.to_dataarray('window_dim') rolling_da # rolling mean with 2-point stride - rolling_da.isel(y=slice(None, None, 2)).mean('window_dim') + rolling_da.isel(y=slice(None, None, 2)).mean('window_dim', skipna=False) + +Because the ``DataArray`` given by ``r.to_dataarray('window_dim')`` is a view +of the original array, it is memory efficient. + +.. note:: + numpy's Nan-aggregation functions such as ``nansum`` copy the original array. + In xarray, we internally use these functions in our aggregation methods + (such as ``.sum()``) if ``skipna`` argument is not specified or set to True. + This means ``rolling_da.mean('window_dim')`` is memory inefficient. + To avoid this, use ``skipna=False`` as the above example. -Note that although the ``DataArray`` obtained by -``r.to_dataarray('window_dim')`` has an additional dimension, -it does not consume too much memory as it is just a view of -the original array. .. _compute.broadcasting: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 55441bf41e2..97f1e1dff78 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,13 +43,12 @@ Documentation Enhancements ~~~~~~~~~~~~ -- Improve :py:func:`~xarray.DataArray.rooling` logic for speed up. - :py:func:`~xarray.DataArrayRolling` object now support ``to_dataarray`` +- Improve :py:func:`~xarray.DataArray.rooling` logic. + :py:func:`~xarray.DataArrayRolling` object now supports ``to_dataarray`` method that returns a view of the DataArray object with the rolling-window dimension added to the last position. This enables more flexible operation, such as strided rolling, windowed rolling, ND-rolling, and convolution. (:issue:`1831`, :issue:`1142`, :issue:`819`) -- reduce methods such as :py:func:`DataArray.sum()` now accept ``dtype`` - Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. .. 
ipython:: python diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ec78660a178..876c2d14830 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -274,7 +274,7 @@ def _nanvar_object(value, axis=None, **kwargs): def _create_nan_agg_method(name, numeric_only=False, np_compat=False, no_bottleneck=False, coerce_strings=False, - keep_dims=False, support_object_type=False): + keep_dims=False): def f(values, axis=None, skipna=None, **kwargs): if kwargs.pop('out', None) is not None: raise TypeError('`out` is not valid for {}'.format(name)) diff --git a/xarray/core/missing.py b/xarray/core/missing.py index e26e976a11b..e3208c3f53f 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -13,6 +13,7 @@ from .computation import apply_ufunc from .utils import is_scalar from .npcompat import flip +from . import rolling class BaseInterpolator(object): @@ -329,4 +330,8 @@ def _get_valid_fill_mask(arr, dim, limit): '''helper function to determine values that can be filled when limit is not None''' kw = {dim: limit + 1} - return arr.isnull().rolling(min_periods=1, **kw).sum() <= limit + # we explicitly use to_dataarray method to avoid copy. + new_dim = rolling._get_new_dimname(arr.dims, '_window') + return (arr.isnull().rolling(min_periods=1, **kw) + .to_dataarray(new_dim, fill_value=False) + .sum(new_dim, skipna=False)) <= limit From b3a2105d52fb3d1d2e78c35c88b68dfbe1118b30 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 11:23:30 +0900 Subject: [PATCH 58/73] to_dataarray -> construct --- doc/computation.rst | 7 +++---- doc/whats-new.rst | 11 ++++++----- xarray/core/missing.py | 4 ++-- xarray/core/rolling.py | 8 ++++---- xarray/tests/test_dataarray.py | 8 ++++---- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index a27d8f82021..646bdae6561 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -170,8 +170,7 @@ We can also manually iterate through ``Rolling`` objects: for label, arr_window in r: # arr_window is a view of x -Finally, the rolling object has ``to_dataarray`` method -(``to_dataset`` method for Rolling objects from Dataset), which gives a +Finally, the rolling object has ``construct`` method, which gives a view of the original ``DataArray`` with the windowed dimension attached to the last position. You can use this for more advanced rolling operations, such as strided rolling, @@ -179,12 +178,12 @@ windowed rolling, convolution, short-time FFT, etc. .. ipython:: python - rolling_da = r.to_dataarray('window_dim') + rolling_da = r.construct('window_dim') rolling_da # rolling mean with 2-point stride rolling_da.isel(y=slice(None, None, 2)).mean('window_dim', skipna=False) -Because the ``DataArray`` given by ``r.to_dataarray('window_dim')`` is a view +Because the ``DataArray`` given by ``r.construct('window_dim')`` is a view of the original array, it is memory efficient. .. note:: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 97f1e1dff78..c7422df11cc 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -44,11 +44,12 @@ Documentation Enhancements ~~~~~~~~~~~~ - Improve :py:func:`~xarray.DataArray.rooling` logic. - :py:func:`~xarray.DataArrayRolling` object now supports ``to_dataarray`` - method that returns a view of the DataArray object with the rolling-window - dimension added to the last position. This enables more flexible operation, - such as strided rolling, windowed rolling, ND-rolling, and convolution. 
- (:issue:`1831`, :issue:`1142`, :issue:`819`) + :py:func:`~xarray.DataArrayRolling` object now supports ``construct`` + method that returns a view of the DataArray / Dataset object with the + rolling-window dimension added to the last position. This enables more + flexible operation, such as strided rolling, windowed rolling, ND-rolling, + and convolution. (:issue:`1831`, :issue:`1142`, :issue:`819`) + By `Keisuke Fujii `_. - Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. .. ipython:: python diff --git a/xarray/core/missing.py b/xarray/core/missing.py index e3208c3f53f..957938d837d 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -330,8 +330,8 @@ def _get_valid_fill_mask(arr, dim, limit): '''helper function to determine values that can be filled when limit is not None''' kw = {dim: limit + 1} - # we explicitly use to_dataarray method to avoid copy. + # we explicitly use construct method to avoid copy. new_dim = rolling._get_new_dimname(arr.dims, '_window') return (arr.isnull().rolling(min_periods=1, **kw) - .to_dataarray(new_dim, fill_value=False) + .construct(new_dim, fill_value=False) .sum(new_dim, skipna=False)) <= limit diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index d1d7ec2160f..866f5638446 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -178,7 +178,7 @@ def _setup_windows(self): self.window_indices = [slice(start, stop) for start, stop in zip(starts, stops)] - def to_dataarray(self, window_dim, stride=1, fill_value=dtypes.NA): + def construct(self, window_dim, stride=1, fill_value=dtypes.NA): """ Convert this rolling object to xr.DataArray, where the window dimension is stacked as a new dimension @@ -247,7 +247,7 @@ def reduce(self, func, **kwargs): Array with summarized data. """ rolling_dim = _get_new_dimname(self.obj.dims, '_rolling_dim') - windows = self.to_dataarray(rolling_dim) + windows = self.construct(rolling_dim) result = windows.reduce(func, dim=rolling_dim, **kwargs) # Find valid windows based on count. @@ -264,7 +264,7 @@ def _counts(self): # copy the strided array. 
counts = (self.obj.notnull() .rolling(center=self.center, **{self.dim: self.window}) - .to_dataarray(rolling_dim, fill_value=False) + .construct(rolling_dim, fill_value=False) .sum(dim=rolling_dim, skipna=False)) return counts @@ -454,7 +454,7 @@ def to_dataset(self, window_dim, stride=1, fill_value=dtypes.NA): dataset = OrderedDict() for key, da in self.obj.data_vars.items(): if self.dim in da.dims: - dataset[key] = self.rollings[key].to_dataarray( + dataset[key] = self.rollings[key].construct( window_dim, fill_value=fill_value) else: dataset[key] = da diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d910bc7fee2..8948bbbd9c4 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3423,25 +3423,25 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -def test_rolling_to_dataarray(center, window): +def test_rolling_construct(center, window): s = pd.Series(np.arange(10)) da = DataArray.from_series(s) s_rolling = s.rolling(window, center=center, min_periods=1).mean() da_rolling = da.rolling(index=window, center=center, min_periods=1) - da_rolling_mean = da_rolling.to_dataarray('window').mean('window') + da_rolling_mean = da_rolling.construct('window').mean('window') np.testing.assert_allclose(s_rolling.values, da_rolling_mean.values) np.testing.assert_allclose(s_rolling.index, da_rolling_mean['index']) # with stride - da_rolling_mean = da_rolling.to_dataarray('window', + da_rolling_mean = da_rolling.construct('window', stride=2).mean('window') np.testing.assert_allclose(s_rolling.values[::2], da_rolling_mean.values) np.testing.assert_allclose(s_rolling.index[::2], da_rolling_mean['index']) # with fill_value - da_rolling_mean = da_rolling.to_dataarray( + da_rolling_mean = da_rolling.construct( 'window', stride=2, fill_value=0.0).mean('window') assert da_rolling_mean.isnull().sum() == 0 assert (da_rolling_mean == 0.0).sum() >= 0 From b80fbfdf7236ab422b62cfa1cebde6113ab1b865 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 11:30:03 +0900 Subject: [PATCH 59/73] remove assert_allclose_with_nan --- xarray/testing.py | 61 ----------------------------------------------- 1 file changed, 61 deletions(-) diff --git a/xarray/testing.py b/xarray/testing.py index f15c8eb3714..f51e474405f 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -27,13 +27,6 @@ def _data_allclose_or_equiv(arr1, arr2, rtol=1e-05, atol=1e-08, arr1, arr2, rtol=rtol, atol=atol) -def _data_allclose_or_equiv_nan(arr1, arr2, rtol=1e-05, atol=1e-08, - decode_bytes=True): - index = (~arr1.isnull()).nonzero() - assert index == (~arr2.isnull()).nonzero() - _data_allclose_or_equiv(arr1[index], arr2[index], rtol, atol, decode_bytes) - - def assert_equal(a, b): """Like :py:func:`numpy.testing.assert_array_equal`, but for xarray objects. @@ -147,57 +140,3 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): else: raise TypeError('{} not supported by assertion comparison' .format(type(a))) - - -def assert_allclose_with_nan(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): - """Like assert_allclose, but except for nan. - - Raises an AssertionError if two objects are not equal up to desired - tolerance. - - Parameters - ---------- - a : xarray.Dataset, xarray.DataArray or xarray.Variable - The first object to compare. - b : xarray.Dataset, xarray.DataArray or xarray.Variable - The second object to compare. 
- rtol : float, optional - Relative tolerance. - atol : float, optional - Absolute tolerance. - decode_bytes : bool, optional - Whether byte dtypes should be decoded to strings as UTF-8 or not. - This is useful for testing serialization methods on Python 3 that - return saved strings as bytes. - - See also - -------- - assert_identical, assert_equal, numpy.testing.assert_allclose - """ - import xarray as xr - # __tracebackhide__ = True # noqa: F841 - assert type(a) == type(b) - kwargs = dict(rtol=rtol, atol=atol, decode_bytes=decode_bytes) - if isinstance(a, xr.Variable): - assert a.dims == b.dims - allclose = _data_allclose_or_equiv_nan(a.values, b.values, **kwargs) - assert allclose, '{}\n{}'.format(a.values, b.values) - elif isinstance(a, xr.DataArray): - assert_allclose(a.variable, b.variable, **kwargs) - assert set(a.coords) == set(b.coords) - for v in a.coords.variables: - # can't recurse with this function as coord is sometimes a - # DataArray, so call into _data_allclose_or_equiv directly - allclose = _data_allclose_or_equiv_nan( - a.coords[v].values, b.coords[v].values, **kwargs) - assert allclose, '{}\n{}'.format(a.coords[v].values, - b.coords[v].values) - elif isinstance(a, xr.Dataset): - assert set(a.data_vars) == set(b.data_vars) - assert set(a.coords) == set(b.coords) - for k in list(a.variables) + list(a.coords): - assert_allclose_with_nan(a[k], b[k], **kwargs) - - else: - raise TypeError('{} not supported by assertion comparison' - .format(type(a))) From 3c010ae59d6aa98f0a639831c3b55e836b4e9037 Mon Sep 17 00:00:00 2001 From: stickler-ci Date: Sun, 18 Feb 2018 02:30:39 +0000 Subject: [PATCH 60/73] Fixing style errors. --- xarray/tests/test_dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8948bbbd9c4..3d0141a91fb 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3436,7 +3436,7 @@ def test_rolling_construct(center, window): # with stride da_rolling_mean = da_rolling.construct('window', - stride=2).mean('window') + stride=2).mean('window') np.testing.assert_allclose(s_rolling.values[::2], da_rolling_mean.values) np.testing.assert_allclose(s_rolling.index[::2], da_rolling_mean['index']) From ab82f756fb7135bb332233a89118533a1902eba2 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 11:43:56 +0900 Subject: [PATCH 61/73] typo --- xarray/tests/test_variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 96e14c6c50f..4947f96a28f 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -758,7 +758,7 @@ def test_pad(self): assert_array_equal(actual, expected) def test_rolling_window(self): - # Just a working test. See test_nputils fot the algorithm validation + # Just a working test. 
See test_nputils for the algorithm validation v = self.cls(['x', 'y', 'z'], np.arange(40*30*2).reshape(40, 30, 2)) for (d, w) in [('x', 3), ('y', 5)]: v_rolling = v.rolling_window(d, w, d + '_window') From b9f10cd63483ce2d5eb6421211c62814d83bd2df Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 11:55:38 +0900 Subject: [PATCH 62/73] `to_dataset` -> `construct` --- xarray/core/rolling.py | 2 +- xarray/tests/test_dataset.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 866f5638446..496c88e318b 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -430,7 +430,7 @@ def wrapped_func(self, **kwargs): return Dataset(reduced, coords=self.obj.coords) return wrapped_func - def to_dataset(self, window_dim, stride=1, fill_value=dtypes.NA): + def construct(self, window_dim, stride=1, fill_value=dtypes.NA): """ Convert this rolling object to xr.Dataset, where the window dimension is stacked as a new dimension diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 0c07ceb7d25..d4872442c38 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4140,7 +4140,7 @@ def test_rolling_pandas_compat(center, window, min_periods): @pytest.mark.parametrize('center', (True, False)) @pytest.mark.parametrize('window', (1, 2, 3, 4)) -def test_rolling_to_dataset(center, window): +def test_rolling_construct(center, window): df = pd.DataFrame({'x': np.random.randn(20), 'y': np.random.randn(20), 'time': np.linspace(0, 1, 20)}) @@ -4148,18 +4148,18 @@ def test_rolling_to_dataset(center, window): df_rolling = df.rolling(window, center=center, min_periods=1).mean() ds_rolling = ds.rolling(index=window, center=center) - ds_rolling_mean = ds_rolling.to_dataset('window').mean('window') + ds_rolling_mean = ds_rolling.construct('window').mean('window') np.testing.assert_allclose(df_rolling['x'].values, ds_rolling_mean['x'].values) np.testing.assert_allclose(df_rolling.index, ds_rolling_mean['index']) # with stride - ds_rolling_mean = ds_rolling.to_dataset('window', stride=2).mean('window') + ds_rolling_mean = ds_rolling.construct('window', stride=2).mean('window') np.testing.assert_allclose(df_rolling['x'][::2].values, ds_rolling_mean['x'].values) np.testing.assert_allclose(df_rolling.index[::2], ds_rolling_mean['index']) # with fill_value - ds_rolling_mean = ds_rolling.to_dataset( + ds_rolling_mean = ds_rolling.construct( 'window', stride=2, fill_value=0.0).mean('window') assert ds_rolling_mean.isnull().sum() == 0 assert (ds_rolling_mean['x'] == 0.0).sum() >= 0 From cc9c3d681a2863945d12d467d873a27916c53a89 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 18 Feb 2018 16:34:33 +0900 Subject: [PATCH 63/73] Update doc --- doc/api.rst | 4 ++-- doc/computation.rst | 6 +++--- doc/whats-new.rst | 13 +++++++------ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 7a64d4186f5..4a26298b268 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -474,10 +474,10 @@ Rolling objects :toctree: generated/ core.rolling.DataArrayRolling - core.rolling.DataArrayRolling.to_dataarray + core.rolling.DataArrayRolling.construct core.rolling.DataArrayRolling.reduce core.rolling.DatasetRolling - core.rolling.DatasetRolling.to_dataset + core.rolling.DatasetRolling.construct core.rolling.DatasetRolling.reduce GroupByObjects diff --git a/doc/computation.rst b/doc/computation.rst index 646bdae6561..78c645ff8c3 100644 --- a/doc/computation.rst +++ 
b/doc/computation.rst @@ -178,10 +178,10 @@ windowed rolling, convolution, short-time FFT, etc. .. ipython:: python - rolling_da = r.construct('window_dim') + # rolling with 2-point stride + rolling_da = r.construct('window_dim', stride=2) rolling_da - # rolling mean with 2-point stride - rolling_da.isel(y=slice(None, None, 2)).mean('window_dim', skipna=False) + rolling_da.mean('window_dim', skipna=False) Because the ``DataArray`` given by ``r.construct('window_dim')`` is a view of the original array, it is memory efficient. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c7422df11cc..0dac61def9b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,12 +43,13 @@ Documentation Enhancements ~~~~~~~~~~~~ -- Improve :py:func:`~xarray.DataArray.rooling` logic. - :py:func:`~xarray.DataArrayRolling` object now supports ``construct`` - method that returns a view of the DataArray / Dataset object with the - rolling-window dimension added to the last position. This enables more - flexible operation, such as strided rolling, windowed rolling, ND-rolling, - and convolution. (:issue:`1831`, :issue:`1142`, :issue:`819`) +- Improve :py:func:`~xarray.DataArray.rolling` logic. + :py:func:`~xarray.DataArrayRolling` object now supports + :py:func:`~xarray.DataArrayRolling.construct` method that returns a view + of the DataArray / Dataset object with the rolling-window dimension added + to the last position. This enables more flexible operation, such as strided + rolling, windowed rolling, ND-rolling, and convolution. + (:issue:`1831`, :issue:`1142`, :issue:`819`) By `Keisuke Fujii `_. - Reduce methods such as :py:func:`DataArray.sum()` now handles object-type array. From 2954cdf72759e5b327f489c8b1795a3229900d38 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 19 Feb 2018 08:51:55 +0900 Subject: [PATCH 64/73] Change boundary and add comments for dask_rolling_window. --- xarray/core/dask_array_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 63b95caab18..f56c1d3cb36 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -49,7 +49,9 @@ def rolling_window(a, window, axis=-1): "more evenly divides the shape of your array." % (window, min(a.chunks[axis]))) - boundary = {d: np.nan for d in range(a.ndim)} + # We temporary use `reflect` boundary here, but the edge portion is + # truncated later. 
+ boundary = {d: 'reflect' for d in range(a.ndim)} # create ghosted arrays ag = da.ghost.ghost(a, depth=depth, boundary=boundary) From f19e5310d06d18ea7466a075c99c2eb48d51b150 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 25 Feb 2018 00:13:04 +0900 Subject: [PATCH 65/73] Refactor dask_array_ops.rolling_window and np_utils.rolling_window --- asv_bench/benchmarks/rolling.py | 4 +-- xarray/core/dask_array_ops.py | 50 +++++++++++++++++++++-------- xarray/core/duck_array_ops.py | 8 +++-- xarray/core/nputils.py | 15 ++++++++- xarray/core/variable.py | 31 +++++++++--------- xarray/tests/test_duck_array_ops.py | 11 +++++-- xarray/tests/test_nputils.py | 17 +++++++--- xarray/tests/test_variable.py | 11 +++++++ 8 files changed, 106 insertions(+), 41 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index c28e5a54f2a..79d06019c00 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -38,8 +38,8 @@ def time_rolling_np(self, window_, min_periods): @parameterized(['center', 'stride'], ([True, False], [1, 200])) - def time_rolling_to_dataset(self, center, stride): - self.ds.rolling(x=window, center=center).to_dataset( + def time_rolling_construct(self, center, stride): + self.ds.rolling(x=window, center=center).construct( 'window_dim', stride=stride).mean(dim='window_dim') diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index f56c1d3cb36..ad02c4a642d 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -28,44 +28,68 @@ def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): return result -def rolling_window(a, window, axis=-1): +def rolling_window(a, axis, window, center, fill_value): """ Dask's equivalence to np.utils.rolling_window """ + orig_shape = a.shape # inputs for ghost if axis < 0: axis = a.ndim + axis depth = {d: 0 for d in range(a.ndim)} - if window % 2 == 0: - depth[axis] = int((window - 1) / 2 + 1) - offset = 1 + depth[axis] = int(window / 2) + + offset = 1 if window % 2 == 0 else 0 + + # pad the original array before the operation in order to avoid copying + # the output array (output array is just a view). + if center: + start = int(window / 2) # 10 -> 5, 9 -> 4 + end = window - 1 - start else: - depth[axis] = int((window - 1) / 2) - offset = 0 + start, end = window - 1, 0 + + drop_size = depth[axis] - offset - np.maximum(start, end) + if drop_size < 0: + # ghosting requires each chunk should be larger than depth. + if -drop_size < depth[axis]: + pad_size = depth[axis] + drop_size = depth[axis] + drop_size + else: + pad_size = -drop_size + drop_size = 0 + shape = list(a.shape) + shape[axis] = pad_size + chunks = list(a.chunks) + chunks[axis] = (pad_size, ) + fill_array = da.full(shape, fill_value, dtype=a.dtype, chunks=chunks) + a = da.concatenate([fill_array, a], axis=axis) if depth[axis] > min(a.chunks[axis]): raise ValueError( - "The window size %d is larger than your\n" - "smallest chunk size %d + 1. Rechunk your array\n" + "For window size %d, every chunk should be larger than %d, " + "but the smallest chunk size is %d. Rechunk your array\n" "with a larger chunk size or a chunk size that\n" "more evenly divides the shape of your array." % - (window, min(a.chunks[axis]))) + (window, depth[axis], min(a.chunks[axis]))) # We temporary use `reflect` boundary here, but the edge portion is # truncated later. 
- boundary = {d: 'reflect' for d in range(a.ndim)} + boundary = {d: fill_value for d in range(a.ndim)} + # create ghosted arrays ag = da.ghost.ghost(a, depth=depth, boundary=boundary) # apply rolling func def func(x, window, axis=-1): x = np.asarray(x) - rolling = nputils.rolling_window(x, window, axis) + rolling = nputils._rolling_window(x, window, axis) return rolling[(slice(None), ) * axis + (slice(offset, None), )] chunks = list(a.chunks) chunks.append(window) out = ag.map_blocks(func, dtype=a.dtype, new_axis=a.ndim, chunks=chunks, window=window, axis=axis) + # crop the edge points - index = (slice(None),) * axis + (slice(depth[axis] - offset, - - depth[axis]),) + index = (slice(None),) * axis + (slice(drop_size, + drop_size + orig_shape[axis]), ) return out[index] diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 876c2d14830..ac11f00b0f4 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -380,12 +380,14 @@ def last(values, axis, skipna=None): return take(values, -1, axis=axis) -def rolling_window(array, window, axis=-1): +def rolling_window(array, axis, window, center, fill_value): """ Make an ndarray with a rolling window of axis-th dimension. The rolling dimension will be placed at the last dimension. """ if isinstance(array, dask_array_type): - return dask_array_ops.rolling_window(array, window, axis=axis) + return dask_array_ops.rolling_window( + array, axis, window, center, fill_value) else: # np.ndarray - return nputils.rolling_window(array, window, axis=axis) + return nputils.rolling_window( + array, axis, window, center, fill_value) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 3e872082d39..6bafb4b10d0 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -136,7 +136,20 @@ def __setitem__(self, key, value): mixed_positions) -def rolling_window(a, window, axis=-1): +def rolling_window(a, axis, window, center, fill_value): + """ rolling window with padding. """ + pads = [(0, 0) for s in a.shape] + if center: + start = int(window / 2) # 10 -> 5, 9 -> 4 + end = window - 1 - start + pads[axis] = (start, end) + else: + pads[axis] = (window - 1, 0) + a = np.pad(a, pads, mode='constant', constant_values=fill_value) + return _rolling_window(a, window, axis) + + +def _rolling_window(a, window, axis=-1): """ Make an ndarray with a rolling window along axis. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bbec6ed959e..726b81da725 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1519,9 +1519,10 @@ def rolling_window(self, dim, window, window_dim, center=False, window_dim: str New name of the window dimension. center: boolean. default False. - If True, pad np.nan for both ends. Otherwise, pad in the head of - the axis. - + If True, pad fill_value for both ends. Otherwise, pad in the head + of the axis. + fill_value: + value to be filled. 
Returns ------- @@ -1535,25 +1536,25 @@ def rolling_window(self, dim, window, window_dim, center=False, >>> v=Variable(('a', 'b'), np.arange(8).reshape((2,4))) >>> v.rolling_window(x, 'b', 3, 'window_dim') - array([[[np.nan, np.nan, 0], [np.nan, 0, 1], [0, 1, 2], [1, 2, 3]], - [[np.nan, np.nan, 4], [np.nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) + array([[[nan, nan, 0], [nan, 0, 1], [0, 1, 2], [1, 2, 3]], + [[nan, nan, 4], [nan, 4, 5], [4, 5, 6], [5, 6, 7]]]) >>> v.rolling_window(x, 'b', 3, 'window_dim', center=True) - array([[[np.nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, np.nan]], - [[np.nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, np.nan]]]) + array([[[nan, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, nan]], + [[nan, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, nan]]]) """ - new_dims = self.dims + (window_dim, ) - if center: - start = -int(-window / 2) - end = window - 1 - start - pads = (start, end) + if fill_value is dtypes.NA: # np.nan is passed + dtype, fill_value = dtypes.maybe_promote(self.dtype) + array = self.astype(dtype).data else: - pads = (window - 1, 0) + dtype = self.dtype + array = self.data - array = self.pad_with_fill_value(fill_value=fill_value, **{dim: pads}) + new_dims = self.dims + (window_dim, ) return Variable(new_dims, duck_array_ops.rolling_window( - array.data, axis=self.get_axis_num(dim), window=window)) + array, axis=self.get_axis_num(dim), window=window, + center=center, fill_value=fill_value)) @property def real(self): diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index b63a96e1510..19bf6d6bde8 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -312,12 +312,17 @@ def test_argmin_max_error(): @pytest.mark.skipif(not has_dask, reason='This is for dask.') @pytest.mark.parametrize('axis', [0, -1]) @pytest.mark.parametrize('window', [3, 8, 11]) -def test_dask_rolling(axis, window): +@pytest.mark.parametrize('center', [True, False]) +def test_dask_rolling(axis, window, center): x = np.array(np.random.randn(100, 40), dtype=float) dx = da.from_array(x, chunks=[(6, 30, 30, 20, 14), 8]) - expected = rolling_window(x, axis=axis, window=window) - actual = rolling_window(dx, axis=axis, window=window) + expected = rolling_window(x, axis=axis, window=window, center=center, + fill_value=np.nan) + actual = rolling_window(dx, axis=axis, window=window, center=center, + fill_value=np.nan) assert isinstance(actual, da.Array) + print(actual.compute()[0, :5]) + print(expected[0, :5]) assert_array_equal(actual, expected) assert actual.shape == expected.shape diff --git a/xarray/tests/test_nputils.py b/xarray/tests/test_nputils.py index 9821b0c0ad3..684f56dfd5b 100644 --- a/xarray/tests/test_nputils.py +++ b/xarray/tests/test_nputils.py @@ -32,15 +32,24 @@ def test_vindex(): def test_rolling(): - x = np.array([0, 1, 2, 3, 4], dtype=float) + x = np.array([1, 2, 3, 4], dtype=float) - actual = rolling_window(x, axis=-1, window=3) - expected = np.array([[0, 1, 2], + actual = rolling_window(x, axis=-1, window=3, center=True, + fill_value=np.nan) + expected = np.array([[np.nan, 1, 2], + [1, 2, 3], + [2, 3, 4], + [3, 4, np.nan]], dtype=float) + assert_array_equal(actual, expected) + + actual = rolling_window(x, axis=-1, window=3, center=False, fill_value=0.0) + expected = np.array([[0, 0, 1], + [0, 1, 2], [1, 2, 3], [2, 3, 4]], dtype=float) assert_array_equal(actual, expected) x = np.stack([x, x * 1.1]) - actual = rolling_window(x, axis=-1, window=3) + actual = rolling_window(x, axis=-1, window=3, center=False, fill_value=0.0) expected = 
np.stack([expected, expected * 1.1], axis=0) assert_array_equal(actual, expected) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index cb42ae18c65..3608274938c 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1684,6 +1684,17 @@ def test_getitem_with_mask_nd_indexer(self): assert_identical(v._getitem_with_mask(indexer, fill_value=-1), self.cls(('x', 'y'), [[0, -1], [-1, 2]])) + def test_rolling_window_chunk(self): + # we need to take care of window size if chunk size is small + import dask.array as da + v = Variable(['x'], da.arange(100, chunks=20)) + # should not raise + rw = v.rolling_window(dim='x', window=10, window_dim='x_w', + center=True) + # window/2 should be smaller than the smallest chunk size. + with pytest.raises(ValueError): + rw = v.rolling_window(dim='x', window=100, window_dim='x_w', + center=True) class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From a074df35dda52a887ebe8ff482f41b1fa2bf6f49 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 25 Feb 2018 00:17:40 +0900 Subject: [PATCH 66/73] flake8 --- xarray/tests/test_variable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 3608274938c..cfe736e97ee 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1693,8 +1693,8 @@ def test_rolling_window_chunk(self): center=True) # window/2 should be smaller than the smallest chunk size. with pytest.raises(ValueError): - rw = v.rolling_window(dim='x', window=100, window_dim='x_w', - center=True) + v.rolling_window(dim='x', window=100, window_dim='x_w', + center=True) class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From f6f78a5d940506aca5a8a7f5c19015b1b3e8cc17 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 25 Feb 2018 06:04:47 +0900 Subject: [PATCH 67/73] Simplify tests --- xarray/tests/test_duck_array_ops.py | 6 ++++++ xarray/tests/test_variable.py | 11 ----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 19bf6d6bde8..8774ea09d14 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -326,3 +326,9 @@ def test_dask_rolling(axis, window, center): print(expected[0, :5]) assert_array_equal(actual, expected) assert actual.shape == expected.shape + + # we need to take care of window size if chunk size is small + # window/2 should be smaller than the smallest chunk size. + with pytest.raises(ValueError): + rolling_window(dx, axis=axis, window=100, center=center, + fill_value=np.nan) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index cfe736e97ee..cb42ae18c65 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1684,17 +1684,6 @@ def test_getitem_with_mask_nd_indexer(self): assert_identical(v._getitem_with_mask(indexer, fill_value=-1), self.cls(('x', 'y'), [[0, -1], [-1, 2]])) - def test_rolling_window_chunk(self): - # we need to take care of window size if chunk size is small - import dask.array as da - v = Variable(['x'], da.arange(100, chunks=20)) - # should not raise - rw = v.rolling_window(dim='x', window=10, window_dim='x_w', - center=True) - # window/2 should be smaller than the smallest chunk size. 
- with pytest.raises(ValueError): - v.rolling_window(dim='x', window=100, window_dim='x_w', - center=True) class TestIndexVariable(TestCase, VariableSubclassTestCases): cls = staticmethod(IndexVariable) From 0ec8aba2a1c9ab225e5095ef4b32e9ef26bd3395 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Sun, 25 Feb 2018 11:45:34 +0900 Subject: [PATCH 68/73] flake8 again. --- xarray/tests/test_variable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index cb42ae18c65..408de5f1fb1 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -765,7 +765,8 @@ def test_pad(self): def test_rolling_window(self): # Just a working test. See test_nputils for the algorithm validation - v = self.cls(['x', 'y', 'z'], np.arange(40*30*2).reshape(40, 30, 2)) + v = self.cls(['x', 'y', 'z'], + np.arange(40 * 30 * 2).reshape(40, 30, 2)) for (d, w) in [('x', 3), ('y', 5)]: v_rolling = v.rolling_window(d, w, d + '_window') assert v_rolling.dims == ('x', 'y', 'z', d + '_window') From 0261cfe4f659ee449ee8cc40ab361bbbbb28eefd Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 25 Feb 2018 19:16:39 +0900 Subject: [PATCH 69/73] cleanup roling_window for dask. --- xarray/core/dask_array_ops.py | 46 ++++++++++++++--------------- xarray/core/variable.py | 6 ++-- xarray/tests/test_duck_array_ops.py | 2 -- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index ad02c4a642d..b13adf48c08 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -36,26 +36,36 @@ def rolling_window(a, axis, window, center, fill_value): axis = a.ndim + axis depth = {d: 0 for d in range(a.ndim)} depth[axis] = int(window / 2) - + # For evenly sized window, we need to crop the first point of each block. offset = 1 if window % 2 == 0 else 0 - # pad the original array before the operation in order to avoid copying - # the output array (output array is just a view). + if depth[axis] > min(a.chunks[axis]): + raise ValueError( + "For window size %d, every chunk should be larger than %d, " + "but the smallest chunk size is %d. Rechunk your array\n" + "with a larger chunk size or a chunk size that\n" + "more evenly divides the shape of your array." % + (window, depth[axis], min(a.chunks[axis]))) + + # Although dask.ghost pads values to boundaries of the array, + # the size of the generated array is smaller than what we want + # if center == False. if center: start = int(window / 2) # 10 -> 5, 9 -> 4 end = window - 1 - start else: start, end = window - 1, 0 - - drop_size = depth[axis] - offset - np.maximum(start, end) - if drop_size < 0: - # ghosting requires each chunk should be larger than depth. - if -drop_size < depth[axis]: + pad_size = max(start, end) + offset - depth[axis] + drop_size = 0 + # pad_size becomes more than 0 when the ghosted array is smaller than + # needed. In this case, we need to enlarge the original array by padding + # before ghosting. + if pad_size > 0: + if pad_size < depth[axis]: + # Ghosting requires each chunk larger than depth. If pad_size is + # smaller than the depth, we enlarge this and truncate it later. 
+ drop_size = depth[axis] - pad_size pad_size = depth[axis] - drop_size = depth[axis] + drop_size - else: - pad_size = -drop_size - drop_size = 0 shape = list(a.shape) shape[axis] = pad_size chunks = list(a.chunks) @@ -63,16 +73,6 @@ def rolling_window(a, axis, window, center, fill_value): fill_array = da.full(shape, fill_value, dtype=a.dtype, chunks=chunks) a = da.concatenate([fill_array, a], axis=axis) - if depth[axis] > min(a.chunks[axis]): - raise ValueError( - "For window size %d, every chunk should be larger than %d, " - "but the smallest chunk size is %d. Rechunk your array\n" - "with a larger chunk size or a chunk size that\n" - "more evenly divides the shape of your array." % - (window, depth[axis], min(a.chunks[axis]))) - - # We temporary use `reflect` boundary here, but the edge portion is - # truncated later. boundary = {d: fill_value for d in range(a.ndim)} # create ghosted arrays @@ -89,7 +89,7 @@ def func(x, window, axis=-1): out = ag.map_blocks(func, dtype=a.dtype, new_axis=a.ndim, chunks=chunks, window=window, axis=axis) - # crop the edge points + # crop boundary. index = (slice(None),) * axis + (slice(drop_size, drop_size + orig_shape[axis]), ) return out[index] diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 726b81da725..c536b7374ab 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -983,8 +983,8 @@ def pad_with_fill_value(self, fill_value=dtypes.NA, **pad_widths): else: pads = [(0, 0) if d not in pad_widths else pad_widths[d] for d in self.dims] - array = np.pad(self.data.astype(dtype), pads, mode='constant', - constant_values=fill_value) + array = np.pad(self.data.astype(dtype, copy=False), pads, + mode='constant', constant_values=fill_value) return type(self)(self.dims, array) def _roll_one_dim(self, dim, count): @@ -1546,7 +1546,7 @@ def rolling_window(self, dim, window, window_dim, center=False, """ if fill_value is dtypes.NA: # np.nan is passed dtype, fill_value = dtypes.maybe_promote(self.dtype) - array = self.astype(dtype).data + array = self.astype(dtype, copy=False).data else: dtype = self.dtype array = self.data diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 8774ea09d14..7a4ec134b04 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -322,8 +322,6 @@ def test_dask_rolling(axis, window, center): actual = rolling_window(dx, axis=axis, window=window, center=center, fill_value=np.nan) assert isinstance(actual, da.Array) - print(actual.compute()[0, :5]) - print(expected[0, :5]) assert_array_equal(actual, expected) assert actual.shape == expected.shape From c83d58851cbaf8538f3278dd0da908afe73027c7 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 26 Feb 2018 13:59:16 +0900 Subject: [PATCH 70/73] remove duplicates --- doc/examples/multidimensional_lookup.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 doc/examples/multidimensional_lookup.rst diff --git a/doc/examples/multidimensional_lookup.rst b/doc/examples/multidimensional_lookup.rst new file mode 100644 index 00000000000..918bfd11294 --- /dev/null +++ b/doc/examples/multidimensional_lookup.rst @@ -0,0 +1,20 @@ +.. _examples.multidim_lookup: + +Multidimensional Lookup with Vectorized Indexing +================================================= + +Author: `Keisuke Fujii `__ + +:ref:`vectorized_indexing` can be used to project object to another coordinate by nearest neighbor lookup. + +.. 
ipython:: python + + import numpy as np + import pandas as pd + import xarray as xr + import netCDF4 + import cartopy.crs as ccrs + import matplotlib.pyplot as plt + + +ds.sel(latitude=latitude_grid, longitude=longitude_grid, method='nearest', tolerance=0.1). From 3bb46689c62cc2ff8ac08af3f84b96a5b43d6794 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 26 Feb 2018 14:24:58 +0900 Subject: [PATCH 71/73] remvove duplicate --- xarray/tests/test_duck_array_ops.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 51c5ec284c1..e54a477a653 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -326,26 +326,3 @@ def test_dask_rolling(axis, window, center): with pytest.raises(ValueError): rolling_window(dx, axis=axis, window=100, center=center, fill_value=np.nan) - - -@pytest.mark.skipif(not has_dask, reason='This is for dask.') -@pytest.mark.parametrize('axis', [0, -1]) -@pytest.mark.parametrize('window', [3, 8, 11]) -@pytest.mark.parametrize('center', [True, False]) -def test_dask_rolling(axis, window, center): - x = np.array(np.random.randn(100, 40), dtype=float) - dx = da.from_array(x, chunks=[(6, 30, 30, 20, 14), 8]) - - expected = rolling_window(x, axis=axis, window=window, center=center, - fill_value=np.nan) - actual = rolling_window(dx, axis=axis, window=window, center=center, - fill_value=np.nan) - assert isinstance(actual, da.Array) - assert_array_equal(actual, expected) - assert actual.shape == expected.shape - - # we need to take care of window size if chunk size is small - # window/2 should be smaller than the smallest chunk size. - with pytest.raises(ValueError): - rolling_window(dx, axis=axis, window=100, center=center, - fill_value=np.nan) From d0d89ce67fa96ca0953301817e1c09b538305287 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 26 Feb 2018 17:06:53 +0900 Subject: [PATCH 72/73] flake8 --- xarray/tests/test_duck_array_ops.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index e54a477a653..f256fc7156c 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -10,12 +10,11 @@ first, last, count, mean, array_notnull_equiv, where, stack, concatenate, rolling_window ) -from xarray.core.pycompat import dask_array_type from xarray import DataArray -from xarray.testing import assert_allclose, assert_equal +from xarray.testing import assert_allclose from xarray import concat -from . import TestCase, raises_regex, has_dask, requires_dask +from . import TestCase, raises_regex, has_dask try: import dask.array as da From eaba563116b957ebdd056ef259015776d40548da Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Mon, 26 Feb 2018 18:05:17 +0900 Subject: [PATCH 73/73] delete unnecessary file. --- doc/examples/multidimensional_lookup.rst | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 doc/examples/multidimensional_lookup.rst diff --git a/doc/examples/multidimensional_lookup.rst b/doc/examples/multidimensional_lookup.rst deleted file mode 100644 index 918bfd11294..00000000000 --- a/doc/examples/multidimensional_lookup.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. 
_examples.multidim_lookup: - -Multidimensional Lookup with Vectorized Indexing -================================================= - -Author: `Keisuke Fujii `__ - -:ref:`vectorized_indexing` can be used to project object to another coordinate by nearest neighbor lookup. - -.. ipython:: python - - import numpy as np - import pandas as pd - import xarray as xr - import netCDF4 - import cartopy.crs as ccrs - import matplotlib.pyplot as plt - - -ds.sel(latitude=latitude_grid, longitude=longitude_grid, method='nearest', tolerance=0.1).
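
Editor's note: a minimal usage sketch of the API this series converges on. This is a reviewer's illustration, not part of any patch above; the calls and the expected values are taken from the tests and the doc/computation.rst change in the diffs (patches 63 and 65), and the exact behaviour should be checked against the merged code.

    import numpy as np
    import xarray as xr
    from xarray.core.nputils import rolling_window

    # Low-level helper added in xarray/core/nputils.py: pad along `axis`
    # with `fill_value`, then return a strided view with the window as the
    # last dimension (values mirror the test added in test_nputils.py).
    x = np.array([1, 2, 3, 4], dtype=float)
    rolling_window(x, axis=-1, window=3, center=True, fill_value=np.nan)
    # [[nan, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, nan]]

    # User-facing API: `construct` stacks the rolling window as a new
    # dimension appended at the last position, giving a view that supports
    # strided and windowed reductions (as in the doc/computation.rst example).
    da = xr.DataArray(np.arange(10, dtype=float), dims='x')
    r = da.rolling(x=3, center=True)
    windowed = r.construct('window_dim')            # dims: ('x', 'window_dim')
    strided_mean = r.construct('window_dim', stride=2).mean('window_dim',
                                                            skipna=False)

For dask-backed arrays the same `construct` call goes through dask_array_ops.rolling_window, which pads and ghosts each chunk; as the ValueError added in patch 65 states, every chunk along the rolled dimension must be at least int(window / 2) long, otherwise the array has to be rechunked first.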