From 0e3089c12ddc32eeed26f5c1bfebb36447146659 Mon Sep 17 00:00:00 2001 From: Spencer Jones <41342785+cspencerjones@users.noreply.github.com> Date: Fri, 23 Aug 2019 10:00:39 -0700 Subject: [PATCH 01/43] Updater to testing environment name (#3253) The testing environment name has been updated to `xarray-tests` in the package and we should do this in the documentation as well. --- doc/contributing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/contributing.rst b/doc/contributing.rst index 429c282a95f..9017c3dde7c 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -152,10 +152,10 @@ We'll now kick off a two-step process: # Create and activate the build environment conda env create -f ci/requirements/py36.yml - conda activate test_env + conda activate xarray-tests # or with older versions of Anaconda: - source activate test_env + source activate xarray-tests # Build and install xarray pip install -e . From 55b33cd95b1967f6518bbd1ecf648a5370ee5a6c Mon Sep 17 00:00:00 2001 From: Rick Russotto Date: Fri, 23 Aug 2019 14:56:13 -0400 Subject: [PATCH 02/43] Clarify apply_ufunc error message (Issue #2078) (#3119) * Clarify error message (#2078) * Update whats-new * Add issue number to whats-new * Update xarray/core/computation.py Co-Authored-By: Deepak Cherian * black computation.py for formatting conventions --- doc/whats-new.rst | 5 ++++- xarray/core/computation.py | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 705c54b2d30..f73e7700750 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -80,6 +80,9 @@ Enhancements Bug fixes ~~~~~~~~~ +- Improve "missing dimensions" error message for :py:func:`~xarray.apply_ufunc` + (:issue:`2078`). + By `Rick Russotto `_. - :py:meth:`~xarray.DataArray.assign_coords` now supports dictionary arguments (:issue:`3231`). By `Gregory Gundersen `_. @@ -106,7 +109,7 @@ Bug fixes - Fix error that arises when using open_mfdataset on a series of netcdf files having differing values for a variable attribute of type list. (:issue:`3034`) By `Hasan Ahmad `_. - + .. 
_whats-new.0.12.3: Documentation diff --git a/xarray/core/computation.py b/xarray/core/computation.py index cb3a0d5db7d..bb1e4648b30 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -502,9 +502,10 @@ def broadcast_compat_data(variable, broadcast_dims, core_dims): missing_core_dims = [d for d in core_dims if d not in set_old_dims] if missing_core_dims: raise ValueError( - "operand to apply_ufunc has required core dimensions %r, but " - "some of these are missing on the input variable: %r" - % (list(core_dims), missing_core_dims) + "operand to apply_ufunc has required core dimensions {}, but " + "some of these dimensions are absent on an input variable: {}".format( + list(core_dims), missing_core_dims + ) ) set_new_dims = set(new_dims) From 79dc7dc461c7540cc0b84a98543c6f7796c05268 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 23 Aug 2019 22:33:03 +0200 Subject: [PATCH 03/43] One-off isort run (#3196) * One-off, manually vetted and tweaked isort run * More isort tweaks * Resilience to isort * isort run * Another isort run; fix sparse test * Clean up unused imports --- asv_bench/benchmarks/combine.py | 1 + setup.cfg | 2 - xarray/backends/api.py | 11 ++-- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/netCDF4_.py | 2 +- xarray/core/alignment.py | 6 +- xarray/core/combine.py | 4 +- xarray/core/computation.py | 3 +- xarray/core/concat.py | 2 +- xarray/core/coordinates.py | 6 +- xarray/core/dask_array_compat.py | 1 - xarray/core/dataarray.py | 6 +- xarray/core/dataset.py | 9 +-- xarray/core/duck_array_ops.py | 2 +- xarray/core/groupby.py | 4 +- xarray/core/indexing.py | 2 +- xarray/core/merge.py | 2 +- xarray/core/npcompat.py | 2 +- xarray/core/pdcompat.py | 2 +- xarray/core/utils.py | 1 - xarray/core/variable.py | 4 +- xarray/testing.py | 5 +- xarray/tests/__init__.py | 2 +- xarray/tests/test_backends.py | 4 +- xarray/tests/test_coding_times.py | 2 +- xarray/tests/test_combine.py | 18 ++++-- xarray/tests/test_concat.py | 1 + xarray/tests/test_dataarray.py | 1 - xarray/tests/test_dataset.py | 2 +- xarray/tests/test_duck_array_ops.py | 2 +- xarray/tests/test_formatting.py | 2 +- xarray/tests/test_plot.py | 1 - xarray/tests/test_sparse.py | 86 ++++++++++++++++------------- xarray/util/print_versions.py | 1 - 34 files changed, 104 insertions(+), 97 deletions(-) diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 9314361e998..aa9662d44f9 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -1,4 +1,5 @@ import numpy as np + import xarray as xr diff --git a/setup.cfg b/setup.cfg index 6cb58d2b9a2..114f71f4a9f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,8 +25,6 @@ ignore= E731 # line break before binary operator W503 - # Unused imports; TODO: Allow typing to work without triggering errors - F401 exclude= doc diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 887af0023fb..9ad1db1829b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -6,6 +6,7 @@ from pathlib import Path from textwrap import dedent from typing import ( + TYPE_CHECKING, Callable, Dict, Hashable, @@ -13,21 +14,19 @@ Mapping, Tuple, Union, - TYPE_CHECKING, ) import numpy as np -from .. import Dataset, DataArray, backends, conventions, coding +from .. import DataArray, Dataset, auto_combine, backends, coding, conventions from ..core import indexing -from .. 
import auto_combine from ..core.combine import ( - combine_by_coords, - _nested_combine, _infer_concat_order_from_positions, + _nested_combine, + combine_by_coords, ) from ..core.utils import close_on_error, is_grib_path, is_remote_uri -from .common import ArrayWriter, AbstractDataStore +from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler if TYPE_CHECKING: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index edc28c7b0ff..0c5fe9087d2 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -5,7 +5,7 @@ from .. import Variable from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error +from ..core.utils import FrozenOrderedDict from .common import WritableCFDataStore from .file_manager import CachingFileManager from .locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 9866a2fe344..57317a7a1a5 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -10,7 +10,7 @@ from .. import Variable, coding from ..coding.variables import pop_to from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error, is_remote_uri +from ..core.utils import FrozenOrderedDict, is_remote_uri from .common import ( BackendArray, WritableCFDataStore, diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index bb44f48fb9b..9aeef63e891 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -3,7 +3,7 @@ import warnings from collections import OrderedDict, defaultdict from contextlib import suppress -from typing import Any, Dict, Hashable, Mapping, Optional, Tuple, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Hashable, Mapping, Optional, Tuple, Union import numpy as np import pandas as pd @@ -14,8 +14,8 @@ from .variable import IndexVariable, Variable if TYPE_CHECKING: - from .dataarray import DataArray - from .dataset import Dataset + from .dataarray import DataArray # noqa: F401 + from .dataset import Dataset # noqa: F401 def _get_joiner(join): diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3aae12c3b66..c24be88b19e 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -5,10 +5,10 @@ import pandas as pd +from . import dtypes +from .concat import concat from .dataarray import DataArray from .dataset import Dataset -from .concat import concat -from . import dtypes from .merge import merge diff --git a/xarray/core/computation.py b/xarray/core/computation.py index bb1e4648b30..da97106098f 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -7,6 +7,7 @@ from collections import Counter, OrderedDict from distutils.version import LooseVersion from typing import ( + TYPE_CHECKING, AbstractSet, Any, Callable, @@ -17,7 +18,6 @@ Sequence, Tuple, Union, - TYPE_CHECKING, ) import numpy as np @@ -649,7 +649,6 @@ def func(*arrays): def _apply_blockwise( func, args, input_dims, output_dims, signature, output_dtypes, output_sizes=None ): - import dask.array as da from .dask_array_compat import blockwise if signature.num_outputs > 1: diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 014b615f2a7..9c7c622a31c 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -3,7 +3,7 @@ import pandas as pd -from . import utils, dtypes +from . 
import dtypes, utils from .alignment import align from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 562d30dd6c7..82488f252f4 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -4,12 +4,12 @@ TYPE_CHECKING, Any, Hashable, - Mapping, Iterator, - Union, + Mapping, + Sequence, Set, Tuple, - Sequence, + Union, cast, ) diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py index 5d4ff849b57..fe2cdc5c553 100644 --- a/xarray/core/dask_array_compat.py +++ b/xarray/core/dask_array_compat.py @@ -4,7 +4,6 @@ import numpy as np from dask import __version__ as dask_version - try: blockwise = da.blockwise except AttributeError: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 52c11429e2b..f147a97d39b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4,6 +4,7 @@ from collections import OrderedDict from numbers import Number from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -17,7 +18,6 @@ Union, cast, overload, - TYPE_CHECKING, ) import numpy as np @@ -38,9 +38,9 @@ from .accessor_dt import DatetimeAccessor from .accessor_str import StringAccessor from .alignment import ( - align, _broadcast_helper, _get_broadcast_dims_map_common_coords, + align, reindex_like_indexers, ) from .common import AbstractArray, DataWithCoords @@ -54,7 +54,7 @@ from .formatting import format_item from .indexes import Indexes, default_indexes from .options import OPTIONS -from .utils import _check_inplace, either_dict_or_kwargs, ReprObject +from .utils import ReprObject, _check_inplace, either_dict_or_kwargs from .variable import ( IndexVariable, Variable, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 076b97e8623..e54b9ad3ba5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7,6 +7,7 @@ from numbers import Number from pathlib import Path from typing import ( + TYPE_CHECKING, Any, Callable, DefaultDict, @@ -24,14 +25,15 @@ Union, cast, overload, - TYPE_CHECKING, ) import numpy as np import pandas as pd + import xarray as xr from ..coding.cftimeindex import _parse_array_of_cftime_strings +from ..plot.dataset_plot import _Dataset_PlotMethods from . import ( alignment, dtypes, @@ -45,7 +47,7 @@ rolling, utils, ) -from .alignment import align, _broadcast_helper, _get_broadcast_dims_map_common_coords +from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align from .common import ( ALL_DIMS, DataWithCoords, @@ -53,8 +55,8 @@ _contains_datetime_like_objects, ) from .coordinates import ( - DatasetCoordinates, DataArrayCoordinates, + DatasetCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers, @@ -79,7 +81,6 @@ maybe_wrap_array, ) from .variable import IndexVariable, Variable, as_variable, broadcast_variables -from ..plot.dataset_plot import _Dataset_PlotMethods if TYPE_CHECKING: from ..backends import AbstractDataStore, ZarrStore diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 3d7e7cc64bc..fcd0400566f 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,7 +13,7 @@ from . 
import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast -from .pycompat import dask_array_type, sparse_array_type +from .pycompat import dask_array_type try: import dask.array as dask_array diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 3ed3491b582..5d81b13983d 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -7,16 +7,16 @@ from . import dtypes, duck_array_ops, nputils, ops, utils from .arithmetic import SupportsArithmetic -from .concat import concat from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce +from .concat import concat from .options import _get_keep_attrs from .pycompat import integer_types from .utils import ( + either_dict_or_kwargs, hashable, maybe_wrap_array, peek_at, safe_cast_to_index, - either_dict_or_kwargs, ) from .variable import IndexVariable, Variable, as_variable diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c5c3cadf7a2..d5cd5eb9e8f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -3,7 +3,7 @@ from collections import defaultdict from contextlib import suppress from datetime import timedelta -from typing import Any, Tuple, Sequence, Union +from typing import Any, Sequence, Tuple, Union import numpy as np import pandas as pd diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 882667dbaaa..225507b9204 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -1,5 +1,6 @@ from collections import OrderedDict from typing import ( + TYPE_CHECKING, Any, Dict, Hashable, @@ -11,7 +12,6 @@ Set, Tuple, Union, - TYPE_CHECKING, ) import pandas as pd diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index ecaadae726e..22c14d9ff40 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -29,10 +29,10 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import builtins +import operator from distutils.version import LooseVersion from typing import Union -import operator import numpy as np try: diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 654a43b505e..91998482e3e 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -38,10 +38,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from distutils.version import LooseVersion + import numpy as np import pandas as pd - # allow ourselves to type checks for Panel even after it's removed if LooseVersion(pd.__version__) < "0.25.0": Panel = pd.Panel diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ba478686d61..bf8c9a264e3 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -31,7 +31,6 @@ from .pycompat import dask_array_type - K = TypeVar("K") V = TypeVar("V") T = TypeVar("T") diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 4c095f3a062..aea4b211bbd 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -3,7 +3,7 @@ from collections import OrderedDict, defaultdict from datetime import timedelta from distutils.version import LooseVersion -from typing import Any, Hashable, Mapping, MutableMapping, Union +from typing import Any, Hashable, Mapping, Union import numpy as np import pandas as pd @@ -18,9 +18,9 @@ VectorizedIndexer, as_indexable, ) +from .npcompat import IS_NEP18_ACTIVE from .options import _get_keep_attrs from .pycompat import dask_array_type, integer_types -from .npcompat import IS_NEP18_ACTIVE from .utils import ( OrderedSet, decode_numpy_dict_values, diff --git a/xarray/testing.py b/xarray/testing.py index 3c92eef04c6..fbb5904c678 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -5,12 +5,11 @@ import numpy as np import pandas as pd -from xarray.core import duck_array_ops -from xarray.core import formatting +from xarray.core import duck_array_ops, formatting from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset -from xarray.core.variable import IndexVariable, Variable from xarray.core.indexes import default_indexes +from xarray.core.variable import IndexVariable, Variable def _decode_string_data(data): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 044ba75e87f..fb4f8200e08 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -1,4 +1,5 @@ import importlib +import platform import re import warnings from contextlib import contextmanager @@ -32,7 +33,6 @@ except ImportError: pass -import platform arm_xfail = pytest.mark.xfail( platform.machine() == "aarch64" or "arm" in platform.machine(), diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dd102f8e2e1..e76cb3aecf7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -38,6 +38,7 @@ from xarray.tests import mock from . import ( + arm_xfail, assert_allclose, assert_array_equal, assert_equal, @@ -61,14 +62,13 @@ requires_scipy, requires_scipy_or_netCDF4, requires_zarr, - arm_xfail, ) from .test_coding_times import ( _ALL_CALENDARS, _NON_STANDARD_CALENDARS, _STANDARD_CALENDARS, ) -from .test_dataset import create_test_data, create_append_test_data +from .test_dataset import create_append_test_data, create_test_data try: import netCDF4 as nc4 diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index ab5ed20d531..615a7e00172 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -19,13 +19,13 @@ from xarray.testing import assert_equal from . 
import ( + arm_xfail, assert_array_equal, has_cftime, has_cftime_or_netCDF4, has_dask, requires_cftime, requires_cftime_or_netCDF4, - arm_xfail, ) try: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index e3801d02bc8..f786a851e62 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,23 +1,29 @@ from collections import OrderedDict -from itertools import product from datetime import datetime +from itertools import product import numpy as np import pytest -from xarray import DataArray, Dataset, concat, combine_by_coords, combine_nested -from xarray import auto_combine +from xarray import ( + DataArray, + Dataset, + auto_combine, + combine_by_coords, + combine_nested, + concat, +) from xarray.core import dtypes from xarray.core.combine import ( - _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, - _infer_concat_order_from_positions, _infer_concat_order_from_coords, + _infer_concat_order_from_positions, + _new_tile_id, ) -from . import assert_identical, assert_equal, raises_regex +from . import assert_equal, assert_identical, raises_regex from .test_dataset import create_test_data diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 4adcc0d5c49..b8ab89e926c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -6,6 +6,7 @@ from xarray import DataArray, Dataset, Variable, concat from xarray.core import dtypes + from . import ( InaccessibleArray, assert_array_equal, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 506c437c2bf..d2355e28f6e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -24,7 +24,6 @@ assert_identical, raises_regex, requires_bottleneck, - requires_cftime, requires_dask, requires_iris, requires_np113, diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 55358e47e41..f76dea86776 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -31,8 +31,8 @@ from xarray.core.pycompat import integer_types from . import ( - LooseVersion, InaccessibleArray, + LooseVersion, UnexpectedDataAccess, assert_allclose, assert_array_equal, diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index ec63c9651eb..725cfe3d506 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -25,13 +25,13 @@ from xarray.testing import assert_allclose, assert_equal from . 
import ( + arm_xfail, assert_array_equal, has_dask, has_np113, raises_regex, requires_cftime, requires_dask, - arm_xfail, ) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 56fba20ffc0..c518f528537 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from textwrap import dedent import sys +from textwrap import dedent import numpy as np import pandas as pd diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 36e7a38151d..a1c05971ec4 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -8,7 +8,6 @@ import xarray as xr import xarray.plot as xplt from xarray import DataArray, Dataset -from xarray.coding.times import _import_cftime from xarray.plot.dataset_plot import _infer_meta_data from xarray.plot.plot import _infer_interval_breaks from xarray.plot.utils import ( diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 74805b225fa..c94ee1b2978 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -1,16 +1,16 @@ -from textwrap import dedent import pickle +from textwrap import dedent + import numpy as np import pandas as pd +import pytest -from xarray import DataArray, Variable -from xarray.core.npcompat import IS_NEP18_ACTIVE import xarray as xr import xarray.ufuncs as xu +from xarray import DataArray, Variable +from xarray.core.npcompat import IS_NEP18_ACTIVE -from . import assert_equal, assert_identical, LooseVersion - -import pytest +from . import assert_equal, assert_identical param = pytest.param xfail = pytest.mark.xfail @@ -21,8 +21,6 @@ ) sparse = pytest.importorskip("sparse") -from sparse.utils import assert_eq as assert_sparse_eq # noqa -from sparse import COO, SparseArray # noqa def make_ndarray(shape): @@ -239,7 +237,7 @@ def test_variable_method(func, sparse_output): ret_d = func(var_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -265,7 +263,7 @@ def test_1d_variable_method(func, sparse_output): ret_d = func(var_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data) else: assert np.allclose(ret_s, ret_d) @@ -278,16 +276,16 @@ def setUp(self): self.var = xr.Variable(("x", "y"), self.data) def test_unary_op(self): - assert_sparse_eq(-self.var.data, -self.data) - assert_sparse_eq(abs(self.var).data, abs(self.data)) - assert_sparse_eq(self.var.round().data, self.data.round()) + sparse.utils.assert_eq(-self.var.data, -self.data) + sparse.utils.assert_eq(abs(self.var).data, abs(self.data)) + sparse.utils.assert_eq(self.var.round().data, self.data.round()) def test_univariate_ufunc(self): - assert_sparse_eq(np.sin(self.data), xu.sin(self.var).data) + sparse.utils.assert_eq(np.sin(self.data), xu.sin(self.var).data) def test_bivariate_ufunc(self): - assert_sparse_eq(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) - assert_sparse_eq(np.maximum(self.data, 0), xu.maximum(0, self.var).data) + sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) + sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(0, self.var).data) def test_repr(self): expected = dedent( @@ -300,12 +298,12 @@ def test_repr(self): def test_pickle(self): v1 = self.var v2 = 
pickle.loads(pickle.dumps(v1)) - assert_sparse_eq(v1.data, v2.data) + sparse.utils.assert_eq(v1.data, v2.data) @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_missing_values(self): a = np.array([0, 1, np.nan, 3]) - s = COO.from_numpy(a) + s = sparse.COO.from_numpy(a) var_s = Variable("x", s) assert np.all(var_s.fillna(2).data.todense() == np.arange(4)) assert np.all(var_s.count() == 3) @@ -558,7 +556,7 @@ def test_dataarray_method(func, sparse_output): ret_d = func(arr_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -582,7 +580,7 @@ def test_datarray_1d_method(func, sparse_output): ret_d = func(arr_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -607,10 +605,14 @@ def test_to_dataset_roundtrip(self): def test_align(self): a1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "c", "d"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "c", "d"]}, ) b1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "d", "e"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "d", "e"]}, ) a2, b2 = xr.align(a1, b1, join="inner") assert isinstance(a2.data, sparse.SparseArray) @@ -650,10 +652,14 @@ def test_align_2d(self): @pytest.mark.xfail(reason="fill value leads to sparse-dense operation") def test_align_outer(self): a1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "c", "d"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "c", "d"]}, ) b1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "d", "e"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "d", "e"]}, ) a2, b2 = xr.align(a1, b1, join="outer") assert isinstance(a2.data, sparse.SparseArray) @@ -667,13 +673,13 @@ def test_concat(self): ds2 = xr.Dataset(data_vars={"d": self.sp_xr}) ds3 = xr.Dataset(data_vars={"d": self.sp_xr}) out = xr.concat([ds1, ds2, ds3], dim="x") - assert_sparse_eq( + sparse.utils.assert_eq( out["d"].data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=0), ) out = xr.concat([self.sp_xr, self.sp_xr, self.sp_xr], dim="y") - assert_sparse_eq( + sparse.utils.assert_eq( out.data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=1) ) @@ -698,9 +704,9 @@ def test_ufuncs(self): def test_dataarray_repr(self): a = xr.DataArray( - COO.from_numpy(np.ones(4)), + sparse.COO.from_numpy(np.ones(4)), dims=["x"], - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) expected = dedent( """\ @@ -714,8 +720,8 @@ def test_dataarray_repr(self): def test_dataset_repr(self): ds = xr.Dataset( - data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}, - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) expected = dedent( """\ @@ -731,7 +737,9 @@ def test_dataset_repr(self): def test_sparse_dask_dataset_repr(self): pytest.importorskip("dask", minversion="2.0") - ds = 
xr.Dataset(data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}).chunk() + ds = xr.Dataset( + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))} + ).chunk() expected = dedent( """\ @@ -744,17 +752,17 @@ def test_sparse_dask_dataset_repr(self): def test_dataarray_pickle(self): a1 = xr.DataArray( - COO.from_numpy(np.ones(4)), + sparse.COO.from_numpy(np.ones(4)), dims=["x"], - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) a2 = pickle.loads(pickle.dumps(a1)) assert_identical(a1, a2) def test_dataset_pickle(self): ds1 = xr.Dataset( - data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}, - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) ds2 = pickle.loads(pickle.dumps(ds1)) assert_identical(ds1, ds2) @@ -829,7 +837,7 @@ def test_resample(self): dims="time", ) t2 = t1.copy() - t2.data = COO(t2.data) + t2.data = sparse.COO(t2.data) m1 = t1.resample(time="QS-DEC").mean() m2 = t2.resample(time="QS-DEC").mean() assert isinstance(m2.data, sparse.SparseArray) @@ -860,7 +868,7 @@ def test_where(self): cond = a > 3 xr.DataArray(a).where(cond) - s = COO.from_numpy(a) + s = sparse.COO.from_numpy(a) cond = s > 3 xr.DataArray(s).where(cond) @@ -873,9 +881,9 @@ class TestSparseCoords: @pytest.mark.xfail(reason="Coercion of coords to dense") def test_sparse_coords(self): xr.DataArray( - COO.from_numpy(np.arange(4)), + sparse.COO.from_numpy(np.arange(4)), dims=["x"], - coords={"x": COO.from_numpy([1, 2, 3, 4])}, + coords={"x": sparse.COO.from_numpy([1, 2, 3, 4])}, ) diff --git a/xarray/util/print_versions.py b/xarray/util/print_versions.py index 85bb9db8360..4ba327913bc 100755 --- a/xarray/util/print_versions.py +++ b/xarray/util/print_versions.py @@ -1,5 +1,4 @@ """Utility functions for printing version information.""" -import codecs import importlib import locale import os From 487b0876e014d7e97caeff9737ae2154c7645a73 Mon Sep 17 00:00:00 2001 From: Gregory Gundersen Date: Fri, 23 Aug 2019 22:26:21 +0100 Subject: [PATCH 04/43] Drop keyword support: small fixes (#3233) * Amended docs for new API and deprecation warnings; deprecated dropping DataArrayCoordinates (#2910). * Removed okwarning now that docs have non-deprecated API. * More clear warnings and more tests. * Moved both warnings to top of function for code clarity. * Removed unused imports. * Update dataset.py --- doc/indexing.rst | 3 +-- doc/whats-new.rst | 4 +++- xarray/core/dataset.py | 30 +++++++++++++++++++----------- xarray/tests/test_dataset.py | 9 ++++++++- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 4c5b93db0b4..58700d7b572 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -236,9 +236,8 @@ The :py:meth:`~xarray.Dataset.drop` method returns a new object with the listed index labels along a dimension dropped: .. ipython:: python - :okwarning: - ds.drop(['IN', 'IL'], dim='space') + ds.drop(space=['IN', 'IL']) ``drop`` is both a ``Dataset`` and ``DataArray`` method. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f73e7700750..4101268be36 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -69,7 +69,9 @@ Enhancements By `Ulrich Herter `_. - :py:meth:`~xarray.Dataset.drop` now supports keyword arguments; dropping index - labels by specifying both ``dim`` and ``labels`` is deprecated (:issue:`2910`). 
+ labels by using both ``dim`` and ``labels`` or using a + :py:class:`~xarray.core.coordinates.DataArrayCoordinates` object are + deprecated (:issue:`2910`). By `Gregory Gundersen `_. - Added examples of :py:meth:`Dataset.set_index` and diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e54b9ad3ba5..9c1a2559c18 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -55,7 +55,6 @@ _contains_datetime_like_objects, ) from .coordinates import ( - DataArrayCoordinates, DatasetCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, @@ -79,6 +78,8 @@ either_dict_or_kwargs, hashable, maybe_wrap_array, + is_dict_like, + is_list_like, ) from .variable import IndexVariable, Variable, as_variable, broadcast_variables @@ -3555,9 +3556,23 @@ def drop( # noqa: F811 if errors not in ["raise", "ignore"]: raise ValueError('errors must be either "raise" or "ignore"') - labels_are_coords = isinstance(labels, DataArrayCoordinates) - if labels_kwargs or (utils.is_dict_like(labels) and not labels_are_coords): - labels_kwargs = utils.either_dict_or_kwargs(labels, labels_kwargs, "drop") + if is_dict_like(labels) and not isinstance(labels, dict): + warnings.warn( + "dropping coordinates using key values of dict-like labels is " + "deprecated; use drop_vars or a list of coordinates.", + FutureWarning, + stacklevel=2, + ) + if dim is not None and is_list_like(labels): + warnings.warn( + "dropping dimensions using list-like labels is deprecated; use " + "dict-like arguments.", + DeprecationWarning, + stacklevel=2, + ) + + if labels_kwargs or isinstance(labels, dict): + labels_kwargs = either_dict_or_kwargs(labels, labels_kwargs, "drop") if dim is not None: raise ValueError("cannot specify dim and dict-like arguments.") ds = self @@ -3571,13 +3586,6 @@ def drop( # noqa: F811 labels = set(labels) return self._drop_vars(labels, errors=errors) else: - if utils.is_list_like(labels): - warnings.warn( - "dropping dimensions using list-like labels is deprecated; " - "use dict-like arguments.", - DeprecationWarning, - stacklevel=2, - ) return self._drop_labels(labels, dim, errors=errors) def _drop_labels(self, labels=None, dim=None, errors="raise"): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f76dea86776..18d1dade30d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2214,7 +2214,7 @@ def test_drop_labels_by_keyword(self): # Basic functionality. assert len(data.coords["x"]) == 2 - # This API is allowed but deprecated. + # In the future, this will break. with pytest.warns(DeprecationWarning): ds1 = data.drop(["a"], dim="x") ds2 = data.drop(x="a") @@ -2222,6 +2222,13 @@ def test_drop_labels_by_keyword(self): ds4 = data.drop(x=["a", "b"]) ds5 = data.drop(x=["a", "b"], y=range(0, 6, 2)) + # In the future, this will result in different behavior. 
+ arr = DataArray(range(3), dims=["c"]) + with pytest.warns(FutureWarning): + data.drop(arr.coords) + with pytest.warns(FutureWarning): + data.drop(arr.indexes) + assert_array_equal(ds1.coords["x"], ["b"]) assert_array_equal(ds2.coords["x"], ["b"]) assert_array_equal(ds3.coords["x"], ["b"]) From 011f7ccbdb69d82d8e513589b9d03be613f09979 Mon Sep 17 00:00:00 2001 From: Spencer Jones <41342785+cspencerjones@users.noreply.github.com> Date: Fri, 23 Aug 2019 16:25:59 -0700 Subject: [PATCH 05/43] Update filter_by_attrs to use 'variables' instead of 'data_vars' (#3247) * Update filter_by_attrs to use 'variables' instead of 'data_vars' This will allow `filter_by_attrs` to filter coordinates as well as variables. @ocefpaf * added tests for filter_by_attrs acting on coords * fixed pep8 issues * Change standard_name to long_name in filter_by_attrs test * remove whitespace --- xarray/core/dataset.py | 2 +- xarray/tests/test_dataset.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9c1a2559c18..a09a0076b4e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5148,7 +5148,7 @@ def filter_by_attrs(self, **kwargs): """ # noqa selection = [] - for var_name, variable in self.data_vars.items(): + for var_name, variable in self.variables.items(): has_value_flag = False for attr_name, pattern in kwargs.items(): attr_value = variable.attrs.get(attr_name) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 18d1dade30d..4636b963acd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4927,7 +4927,7 @@ def test_filter_by_attrs(self): "temperature_10": (["t"], [0], temp10), "precipitation": (["t"], [0], precip), }, - coords={"time": (["t"], [0], dict(axis="T"))}, + coords={"time": (["t"], [0], dict(axis="T", long_name="time_in_seconds"))}, ) # Test return empty Dataset. @@ -4941,6 +4941,11 @@ def test_filter_by_attrs(self): assert_equal(new_ds["precipitation"], ds["precipitation"]) + # Test filter coordinates + new_ds = ds.filter_by_attrs(long_name="time_in_seconds") + assert new_ds["time"].long_name == "time_in_seconds" + assert not bool(new_ds.data_vars) + # Test return more than one DataArray. new_ds = ds.filter_by_attrs(standard_name="air_potential_temperature") assert len(new_ds.data_vars) == 2 From 3faee2bfe06518dd301bd14eccd58166b4296bbc Mon Sep 17 00:00:00 2001 From: Spencer Jones <41342785+cspencerjones@users.noreply.github.com> Date: Fri, 23 Aug 2019 20:20:59 -0700 Subject: [PATCH 06/43] New feature of filter_by_attrs added (#3259) I missed this in my previous pull request. @dcherian --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4101268be36..d029f1263ea 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -78,6 +78,8 @@ Enhancements :py:meth:`DataArray.set_index`, as well are more specific error messages when the user passes invalid arguments (:issue:`3176`). By `Gregory Gundersen `_. + +- :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. By `Spencer Jones `_. Bug fixes ~~~~~~~~~ From 3f1b879c93426825f8c378f043afff8259dc8d28 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 23 Aug 2019 22:08:56 -0700 Subject: [PATCH 07/43] Fix sparse ops that were calling bottleneck (#3254) min and max are now working. notnull was already fixed by one of my earlier PRs. 
std/var/median are still broken, but only because sparse hasn't implemented the corresponding NumPy functions yet (nanstd, nanvar and nanmedian). rank needs pure NumPy implementation (not via bottleneck) if we want it to work on sparse or dask arrays. --- xarray/core/nputils.py | 1 + xarray/core/variable.py | 12 ++++++++--- xarray/tests/test_sparse.py | 40 ++++++++++++++++++++++--------------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index a9971e7125a..769af03fe6a 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -209,6 +209,7 @@ def f(values, axis=None, **kwargs): if ( _USE_BOTTLENECK + and isinstance(values, np.ndarray) and bn_func is not None and not isinstance(axis, tuple) and values.dtype.kind in "uifc" diff --git a/xarray/core/variable.py b/xarray/core/variable.py index aea4b211bbd..bc8da10dd0c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1697,18 +1697,24 @@ def rank(self, dim, pct=False): """ import bottleneck as bn - if isinstance(self.data, dask_array_type): + data = self.data + + if isinstance(data, dask_array_type): raise TypeError( "rank does not work for arrays stored as dask " "arrays. Load the data via .compute() or .load() " "prior to calling this method." ) + elif not isinstance(data, np.ndarray): + raise TypeError( + "rank is not implemented for {} objects.".format(type(data)) + ) axis = self.get_axis_num(dim) func = bn.nanrankdata if self.dtype.kind == "f" else bn.rankdata - ranked = func(self.data, axis=axis) + ranked = func(data, axis=axis) if pct: - count = np.sum(~np.isnan(self.data), axis=axis, keepdims=True) + count = np.sum(~np.isnan(data), axis=axis, keepdims=True) ranked /= count return Variable(self.dims, ranked) diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index c94ee1b2978..36decf49713 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -171,11 +171,13 @@ def test_variable_property(prop): False, marks=xfail(reason="'COO' object has no attribute 'item'"), ), - param(do("max"), False, marks=xfail(reason="Coercion to dense via bottleneck")), param( - do("median"), False, marks=xfail(reason="Coercion to dense via bottleneck") + do("median"), + False, + marks=xfail(reason="Missing implementation for np.nanmedian"), ), - param(do("min"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param(do("max"), False), + param(do("min"), False), param( do("no_conflicts", other=make_xrvar({"x": 10, "y": 5})), True, @@ -199,7 +201,7 @@ def test_variable_property(prop): param( do("rank", dim="x"), False, - marks=xfail(reason="Coercion to dense via bottleneck"), + marks=xfail(reason="Only implemented for NumPy arrays (via bottleneck)"), ), param( do("reduce", func=np.sum, dim="x"), @@ -214,13 +216,17 @@ def test_variable_property(prop): param( do("shift", x=2), True, marks=xfail(reason="mixed sparse-dense operation") ), - param(do("std"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param( + do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") + ), param( do("sum"), False, marks=xfail(reason="Missing implementation for np.result_type"), ), - param(do("var"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param( + do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") + ), param(do("to_dict"), False, marks=xfail(reason="Coercion to dense")), param( do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), @@ -476,16 
+482,14 @@ def test_dataarray_property(prop): False, marks=xfail(reason="'COO' object has no attribute 'item'"), ), - param(do("max"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param(do("max"), False), + param(do("min"), False), param( - do("median"), False, marks=xfail(reason="Coercion to dense via bottleneck") - ), - param(do("min"), False, marks=xfail(reason="Coercion to dense via bottleneck")), - param( - do("notnull"), + do("median"), False, - marks=xfail(reason="'COO' object has no attribute 'notnull'"), + marks=xfail(reason="Missing implementation for np.nanmedian"), ), + param(do("notnull"), True), param( do("pipe", np.sum, axis=1), True, @@ -504,7 +508,7 @@ def test_dataarray_property(prop): param( do("rank", "x"), False, - marks=xfail(reason="Coercion to dense via bottleneck"), + marks=xfail(reason="Only implemented for NumPy arrays (via bottleneck)"), ), param( do("reduce", np.sum, dim="x"), @@ -532,13 +536,17 @@ def test_dataarray_property(prop): True, marks=xfail(reason="Indexing COO with more than one iterable index"), ), # noqa - param(do("std"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param( + do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") + ), param( do("sum"), False, marks=xfail(reason="Missing implementation for np.result_type"), ), - param(do("var"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param( + do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") + ), param( do("where", make_xrarray({"x": 10, "y": 5}) > 0.5), False, From 5f55d41a05618e6091061dfb83fe745ed6008997 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sun, 25 Aug 2019 13:55:46 -0400 Subject: [PATCH 08/43] Remove sel_points (#3261) * remove .[i ]sel_points * dangling decorator * whatsnew --- doc/indexing.rst | 8 -- doc/whats-new.rst | 8 ++ xarray/core/dataarray.py | 26 ---- xarray/core/dataset.py | 209 --------------------------------- xarray/tests/test_dataarray.py | 58 --------- xarray/tests/test_dataset.py | 110 ----------------- 6 files changed, 8 insertions(+), 411 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 58700d7b572..9ee8f1dddf8 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -392,14 +392,6 @@ These methods may also be applied to ``Dataset`` objects You may find increased performance by loading your data into memory first, e.g., with :py:meth:`~xarray.Dataset.load`. -.. note:: - - Vectorized indexing is a new feature in v0.10. - In older versions of xarray, dimensions of indexers are ignored. - Dedicated methods for some advanced indexing use cases, - ``isel_points`` and ``sel_points`` are now deprecated. - See :ref:`more_advanced_indexing` for their alternative. - .. note:: If an indexer is a :py:meth:`~xarray.DataArray`, its coordinates should not diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d029f1263ea..7ce60dc6e81 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,14 @@ v0.13.0 (unreleased) This release increases the minimum required Python version from 3.5.0 to 3.5.3 (:issue:`3089`). By `Guido Imperiale `_. +Breaking changes +~~~~~~~~~~~~~~~~ + + The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated + since v0.10.0. These are redundant with the ``isel`` / ``sel`` methods. 
+ See :ref:`vectorized_indexing` for the details + By `Maximilian Roos `_ + New functions/methods ~~~~~~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f147a97d39b..4bd80553588 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1026,32 +1026,6 @@ def sel( ) return self._from_temp_dataset(ds) - def isel_points(self, dim="points", **indexers) -> "DataArray": - """Return a new DataArray whose data is given by pointwise integer - indexing along the specified dimension(s). - - See Also - -------- - Dataset.isel_points - """ - ds = self._to_temp_dataset().isel_points(dim=dim, **indexers) - return self._from_temp_dataset(ds) - - def sel_points( - self, dim="points", method=None, tolerance=None, **indexers - ) -> "DataArray": - """Return a new DataArray whose dataset is given by pointwise selection - of index labels along the specified dimension(s). - - See Also - -------- - Dataset.sel_points - """ - ds = self._to_temp_dataset().sel_points( - dim=dim, method=method, tolerance=tolerance, **indexers - ) - return self._from_temp_dataset(ds) - def broadcast_like( self, other: Union["DataArray", Dataset], exclude: Iterable[Hashable] = None ) -> "DataArray": diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a09a0076b4e..4cb5d1454ce 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -40,7 +40,6 @@ duck_array_ops, formatting, groupby, - indexing, ops, pdcompat, resample, @@ -1999,214 +1998,6 @@ def sel( result = self.isel(indexers=pos_indexers, drop=drop) return result._overwrite_indexes(new_indexes) - def isel_points(self, dim: Any = "points", **indexers: Any) -> "Dataset": - """Returns a new dataset with each array indexed pointwise along the - specified dimension(s). - - This method selects pointwise values from each array and is akin to - the NumPy indexing behavior of `arr[[0, 1], [0, 1]]`, except this - method does not require knowing the order of each array's dimensions. - - Parameters - ---------- - dim : hashable or DataArray or pandas.Index or other list-like object, - optional - Name of the dimension to concatenate along. If dim is provided as a - hashable, it must be a new dimension name, in which case it is added - along axis=0. If dim is provided as a DataArray or Index or - list-like object, its name, which must not be present in the - dataset, is used as the dimension to concatenate along and the - values are added as a coordinate. - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by array-like objects. All indexers must be the same length and - 1 dimensional. - - Returns - ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - array and dimension is indexed by the appropriate indexers. With - pointwise indexing, the new Dataset will always be a copy of the - original. 
- - See Also - -------- - Dataset.sel - Dataset.isel - Dataset.sel_points - DataArray.isel_points - """ # noqa - warnings.warn( - "Dataset.isel_points is deprecated: use Dataset.isel()" "instead.", - DeprecationWarning, - stacklevel=2, - ) - - indexer_dims = set(indexers) - - def take(variable, slices): - # Note: remove helper function when once when numpy - # supports vindex https://github.com/numpy/numpy/pull/6075 - if hasattr(variable.data, "vindex"): - # Special case for dask backed arrays to use vectorised list - # indexing - sel = variable.data.vindex[slices] - else: - # Otherwise assume backend is numpy array with 'fancy' indexing - sel = variable.data[slices] - return sel - - def relevant_keys(mapping): - return [ - k for k, v in mapping.items() if any(d in indexer_dims for d in v.dims) - ] - - coords = relevant_keys(self.coords) - indexers = {k: np.asarray(v) for k, v in indexers.items()} - non_indexed_dims = set(self.dims) - indexer_dims - non_indexed_coords = set(self.coords) - set(coords) - - # All the indexers should be iterables - # Check that indexers are valid dims, integers, and 1D - for k, v in indexers.items(): - if k not in self.dims: - raise ValueError("dimension %s does not exist" % k) - if v.dtype.kind != "i": # type: ignore - raise TypeError("Indexers must be integers") - if v.ndim != 1: # type: ignore - raise ValueError("Indexers must be 1 dimensional") - - # all the indexers should have the same length - lengths = {len(v) for k, v in indexers.items()} - if len(lengths) > 1: - raise ValueError("All indexers must be the same length") - - # Existing dimensions are not valid choices for the dim argument - if isinstance(dim, str): - if dim in self.dims: - # dim is an invalid string - raise ValueError( - "Existing dimension names are not valid " - "choices for the dim argument in sel_points" - ) - - elif hasattr(dim, "dims"): - # dim is a DataArray or Coordinate - if dim.name in self.dims: - # dim already exists - raise ValueError( - "Existing dimensions are not valid choices " - "for the dim argument in sel_points" - ) - - # Set the new dim_name, and optionally the new dim coordinate - # dim is either an array-like or a string - if not utils.is_scalar(dim): - # dim is array like get name or assign 'points', get as variable - dim_name = "points" if not hasattr(dim, "name") else dim.name - dim_coord = as_variable(dim, name=dim_name) - else: - # dim is a string - dim_name = dim - dim_coord = None # type: ignore - - reordered = self.transpose(*list(indexer_dims), *list(non_indexed_dims)) - - variables = OrderedDict() # type: ignore - - for name, var in reordered.variables.items(): - if name in indexers or any(d in indexer_dims for d in var.dims): - # slice if var is an indexer or depends on an indexed dim - slc = [indexers.get(k, slice(None)) for k in var.dims] - - var_dims = [dim_name] + [d for d in var.dims if d in non_indexed_dims] - selection = take(var, tuple(slc)) - var_subset = type(var)(var_dims, selection, var.attrs) - variables[name] = var_subset - else: - # If not indexed just add it back to variables or coordinates - variables[name] = var - - coord_names = (set(coords) & set(variables)) | non_indexed_coords - - dset = self._replace_vars_and_dims(variables, coord_names=coord_names) - # Add the dim coord to the new dset. 
Must be done after creation - # because_replace_vars_and_dims can only access existing coords, - # not add new ones - if dim_coord is not None: - dset.coords[dim_name] = dim_coord - return dset - - def sel_points( - self, - dim: Any = "points", - method: str = None, - tolerance: Number = None, - **indexers: Any - ): - """Returns a new dataset with each array indexed pointwise by tick - labels along the specified dimension(s). - - In contrast to `Dataset.isel_points`, indexers for this method should - use labels instead of integers. - - In contrast to `Dataset.sel`, this method selects points along the - diagonal of multi-dimensional arrays, not the intersection. - - Parameters - ---------- - dim : hashable or DataArray or pandas.Index or other list-like object, - optional - Name of the dimension to concatenate along. If dim is provided as a - hashable, it must be a new dimension name, in which case it is added - along axis=0. If dim is provided as a DataArray or Index or - list-like object, its name, which must not be present in the - dataset, is used as the dimension to concatenate along and the - values are added as a coordinate. - method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional - Method to use for inexact matches (requires pandas>=0.16): - - * None (default): only exact matches - * pad / ffill: propagate last valid index value forward - * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations must - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by array-like objects. All indexers must be the same length and - 1 dimensional. - - Returns - ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - array and dimension is indexed by the appropriate indexers. With - pointwise indexing, the new Dataset will always be a copy of the - original. 
- - See Also - -------- - Dataset.sel - Dataset.isel - Dataset.isel_points - DataArray.sel_points - """ # noqa - warnings.warn( - "Dataset.sel_points is deprecated: use Dataset.sel()" "instead.", - DeprecationWarning, - stacklevel=2, - ) - - pos_indexers, _ = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance - ) - return self.isel_points(dim=dim, **pos_indexers) - def broadcast_like( self, other: Union["Dataset", "DataArray"], exclude: Iterable[Hashable] = None ) -> "Dataset": diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d2355e28f6e..6a68fb73837 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1001,64 +1001,6 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_isel_points(self): - shape = (10, 5, 6) - np_array = np.random.random(shape) - da = DataArray( - np_array, dims=["time", "y", "x"], coords={"time": np.arange(0, 100, 10)} - ) - y = [1, 3] - x = [3, 0] - - expected = da.values[:, y, x] - - actual = da.isel_points(y=y, x=x, dim="test_coord") - assert actual.coords["test_coord"].shape == (len(y),) - assert list(actual.coords) == ["time"] - assert actual.dims == ("test_coord", "time") - - actual = da.isel_points(y=y, x=x) - assert "points" in actual.dims - # Note that because xarray always concatenates along the first - # dimension, We must transpose the result to match the numpy style of - # concatenation. - np.testing.assert_equal(actual.T, expected) - - # a few corner cases - da.isel_points(time=[1, 2], x=[2, 2], y=[3, 4]) - np.testing.assert_allclose( - da.isel_points(time=[1], x=[2], y=[4]).values.squeeze(), - np_array[1, 4, 2].squeeze(), - ) - da.isel_points(time=[1, 2]) - y = [-1, 0] - x = [-2, 2] - expected = da.values[:, y, x] - actual = da.isel_points(x=x, y=y).values - np.testing.assert_equal(actual.T, expected) - - # test that the order of the indexers doesn't matter - assert_identical(da.isel_points(y=y, x=x), da.isel_points(x=x, y=y)) - - # make sure we're raising errors in the right places - with raises_regex(ValueError, "All indexers must be the same length"): - da.isel_points(y=[1, 2], x=[1, 2, 3]) - with raises_regex(ValueError, "dimension bad_key does not exist"): - da.isel_points(bad_key=[1, 2]) - with raises_regex(TypeError, "Indexers must be integers"): - da.isel_points(y=[1.5, 2.2]) - with raises_regex(TypeError, "Indexers must be integers"): - da.isel_points(x=[1, 2, 3], y=slice(3)) - with raises_regex(ValueError, "Indexers must be 1 dimensional"): - da.isel_points(y=1, x=2) - with raises_regex(ValueError, "Existing dimension names are not"): - da.isel_points(y=[1, 2], x=[1, 2], dim="x") - - # using non string dims - actual = da.isel_points(y=[1, 2], x=[1, 2], dim=["A", "B"]) - assert "points" in actual.coords - def test_loc(self): self.ds["x"] = ("x", np.array(list("abcdefghij"))) da = self.ds["foo"] diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 4636b963acd..82ae65d955f 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1410,116 +1410,6 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_isel_points(self): - data = create_test_data() - - pdim1 = [1, 2, 3] - pdim2 = [4, 5, 1] - pdim3 = [1, 2, 3] - actual = data.isel_points(dim1=pdim1, dim2=pdim2, 
dim3=pdim3, dim="test_coord") - assert "test_coord" in actual.dims - assert actual.coords["test_coord"].shape == (len(pdim1),) - - actual = data.isel_points(dim1=pdim1, dim2=pdim2) - assert "points" in actual.dims - assert "dim3" in actual.dims - assert "dim3" not in actual.data_vars - np.testing.assert_array_equal(data["dim2"][pdim2], actual["dim2"]) - - # test that the order of the indexers doesn't matter - assert_identical( - data.isel_points(dim1=pdim1, dim2=pdim2), - data.isel_points(dim2=pdim2, dim1=pdim1), - ) - - # make sure we're raising errors in the right places - with raises_regex(ValueError, "All indexers must be the same length"): - data.isel_points(dim1=[1, 2], dim2=[1, 2, 3]) - with raises_regex(ValueError, "dimension bad_key does not exist"): - data.isel_points(bad_key=[1, 2]) - with raises_regex(TypeError, "Indexers must be integers"): - data.isel_points(dim1=[1.5, 2.2]) - with raises_regex(TypeError, "Indexers must be integers"): - data.isel_points(dim1=[1, 2, 3], dim2=slice(3)) - with raises_regex(ValueError, "Indexers must be 1 dimensional"): - data.isel_points(dim1=1, dim2=2) - with raises_regex(ValueError, "Existing dimension names are not valid"): - data.isel_points(dim1=[1, 2], dim2=[1, 2], dim="dim2") - - # test to be sure we keep around variables that were not indexed - ds = Dataset({"x": [1, 2, 3, 4], "y": 0}) - actual = ds.isel_points(x=[0, 1, 2]) - assert_identical(ds["y"], actual["y"]) - - # tests using index or DataArray as a dim - stations = Dataset() - stations["station"] = ("station", ["A", "B", "C"]) - stations["dim1s"] = ("station", [1, 2, 3]) - stations["dim2s"] = ("station", [4, 5, 1]) - - actual = data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=stations["station"] - ) - assert "station" in actual.coords - assert "station" in actual.dims - assert_identical(actual["station"].drop(["dim2"]), stations["station"]) - - # make sure we get the default 'points' coordinate when passed a list - actual = data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=["A", "B", "C"] - ) - assert "points" in actual.coords - assert actual.coords["points"].values.tolist() == ["A", "B", "C"] - - # test index - actual = data.isel_points( - dim1=stations["dim1s"].values, - dim2=stations["dim2s"].values, - dim=pd.Index(["A", "B", "C"], name="letters"), - ) - assert "letters" in actual.coords - - # can pass a numpy array - data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=np.array([4, 5, 6]) - ) - - @pytest.mark.filterwarnings("ignore:Dataset.sel_points") - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_sel_points(self): - data = create_test_data() - - # add in a range() index - data["dim1"] = data.dim1 - - pdim1 = [1, 2, 3] - pdim2 = [4, 5, 1] - pdim3 = [1, 2, 3] - expected = data.isel_points( - dim1=pdim1, dim2=pdim2, dim3=pdim3, dim="test_coord" - ) - actual = data.sel_points( - dim1=data.dim1[pdim1], - dim2=data.dim2[pdim2], - dim3=data.dim3[pdim3], - dim="test_coord", - ) - assert_identical(expected, actual) - - data = Dataset({"foo": (("x", "y"), np.arange(9).reshape(3, 3))}) - expected = Dataset({"foo": ("points", [0, 4, 8])}) - actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2]) - assert_identical(expected, actual) - - data.coords.update({"x": [0, 1, 2], "y": [0, 1, 2]}) - expected.coords.update({"x": ("points", [0, 1, 2]), "y": ("points", [0, 1, 2])}) - actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], method="pad") - assert_identical(expected, actual) - - with 
pytest.raises(KeyError): - data.sel_points(x=[2.5], y=[2.0], method="pad", tolerance=1e-3) - @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sel_fancy(self): data = create_test_data() From 48c680f8e631b0786989356e8360968bef551195 Mon Sep 17 00:00:00 2001 From: Norman Barker Date: Mon, 26 Aug 2019 07:49:58 -0700 Subject: [PATCH 09/43] added support for rasterio geotiff tags (#3249) --- xarray/backends/rasterio_.py | 7 +++++-- xarray/tests/test_backends.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 1d832d4f671..316f13470b7 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -322,11 +322,14 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc attrs["units"] = riods.units # Parse extra metadata from tags, if supported - parsers = {"ENVI": _parse_envi} + parsers = {"ENVI": _parse_envi, "GTiff": lambda m: m} driver = riods.driver if driver in parsers: - meta = parsers[driver](riods.tags(ns=driver)) + if driver == "GTiff": + meta = parsers[driver](riods.tags()) + else: + meta = parsers[driver](riods.tags(ns=driver)) for k, v in meta.items(): # Add values as coordinates if they match the band count, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e76cb3aecf7..a5c42fd368c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3923,6 +3923,12 @@ def test_ENVI_tags(self): assert isinstance(rioda.attrs["map_info"], str) assert isinstance(rioda.attrs["samples"], str) + def test_geotiff_tags(self): + # Create a geotiff file with some tags + with create_tmp_geotiff() as (tmp_file, _): + with xr.open_rasterio(tmp_file) as rioda: + assert isinstance(rioda.attrs["AREA_OR_POINT"], str) + def test_no_mftime(self): # rasterio can accept "filename" urguments that are actually urls, # including paths to remote files. From e3b3bed2c2e27eb74adc2b7f80c365c2928cd78b Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 26 Aug 2019 14:43:51 -0400 Subject: [PATCH 10/43] Raise on inplace=True (#3260) * raise on inplace=True * remove from docstrings * missed (at least) one * remove from internal methods in dataarray * TypeError * typeerror dataarray * whatsnew * inplace returns no value --- doc/whats-new.rst | 5 ++- xarray/core/dataarray.py | 59 ++++++-------------------- xarray/core/dataset.py | 75 ++++++++-------------------------- xarray/core/utils.py | 16 +++----- xarray/tests/test_dataarray.py | 8 ++-- xarray/tests/test_dataset.py | 24 +++-------- 6 files changed, 48 insertions(+), 139 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7ce60dc6e81..dbd3ebbfe7e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,10 +24,13 @@ This release increases the minimum required Python version from 3.5.0 to 3.5.3 Breaking changes ~~~~~~~~~~~~~~~~ - The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated +- The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated since v0.10.0. These are redundant with the ``isel`` / ``sel`` methods. See :ref:`vectorized_indexing` for the details By `Maximilian Roos `_ +- The ``inplace`` kwarg for public methods now raises an error, having been deprecated + since v0.11.0. 
+ By `Maximilian Roos `_ New functions/methods ~~~~~~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4bd80553588..e3e543b6621 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -700,30 +700,17 @@ def reset_coords( drop : bool, optional If True, remove coordinates instead of converting them into variables. - inplace : bool, optional - If True, modify this object in place. Otherwise, create a new - object. Returns ------- - Dataset, or DataArray if ``drop == True``, or None if - ``inplace == True`` + Dataset, or DataArray if ``drop == True`` """ - inplace = _check_inplace(inplace) - if inplace and not drop: - raise ValueError( - "cannot reset coordinates in-place on a " - "DataArray without ``drop == True``" - ) + _check_inplace(inplace) if names is None: names = set(self.coords) - set(self.dims) dataset = self.coords.to_dataset().reset_coords(names, drop) if drop: - if inplace: - self._coords = dataset._variables - return None - else: - return self._replace(coords=dataset._variables) + return self._replace(coords=dataset._variables) else: if self.name is None: raise ValueError( @@ -1485,9 +1472,6 @@ def set_index( append : bool, optional If True, append the supplied index(es) to the existing index(es). Otherwise replace the existing index(es) (default). - inplace : bool, optional - If True, set new index(es) in-place. Otherwise, return a new - DataArray object. **indexes_kwargs: optional The keyword arguments form of ``indexes``. One of indexes or indexes_kwargs must be provided. @@ -1496,7 +1480,6 @@ def set_index( ------- obj : DataArray Another DataArray, with this data but replaced coordinates. - Return None if inplace=True. Example ------- @@ -1526,14 +1509,10 @@ def set_index( -------- DataArray.reset_index """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) indexes = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") coords, _ = merge_indexes(indexes, self._coords, set(), append=append) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def reset_index( self, @@ -1551,36 +1530,29 @@ def reset_index( drop : bool, optional If True, remove the specified indexes and/or multi-index levels instead of extracting them as new coordinates (default: False). - inplace : bool, optional - If True, modify the dataarray in-place. Otherwise, return a new - DataArray object. Returns ------- obj : DataArray Another dataarray, with this dataarray's data but replaced - coordinates. If ``inplace == True``, return None. + coordinates. See Also -------- DataArray.set_index """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) coords, _ = split_indexes( dims_or_levels, self._coords, set(), self._level_coords, drop=drop ) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def reorder_levels( self, dim_order: Mapping[Hashable, Sequence[int]] = None, inplace: bool = None, **dim_order_kwargs: Sequence[int] - ) -> Optional["DataArray"]: + ) -> "DataArray": """Rearrange index levels using input order. Parameters @@ -1589,9 +1561,6 @@ def reorder_levels( Mapping from names matching dimensions and values given by lists representing new level orders. Every given dimension must have a multi-index. - inplace : bool, optional - If True, modify the dataarray in-place. Otherwise, return a new - DataArray object. 
**dim_order_kwargs: optional The keyword arguments form of ``dim_order``. One of dim_order or dim_order_kwargs must be provided. @@ -1600,9 +1569,9 @@ def reorder_levels( ------- obj : DataArray Another dataarray, with this dataarray's data but replaced - coordinates. If ``inplace == True``, return None. + coordinates. """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) dim_order = either_dict_or_kwargs(dim_order, dim_order_kwargs, "reorder_levels") replace_coords = {} for dim, order in dim_order.items(): @@ -1613,11 +1582,7 @@ def reorder_levels( replace_coords[dim] = IndexVariable(coord.dims, index.reorder_levels(order)) coords = self._coords.copy() coords.update(replace_coords) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def stack( self, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4cb5d1454ce..e62b6612ae6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1354,9 +1354,6 @@ def set_coords( ---------- names : hashable or iterable of hashables Name(s) of variables in this dataset to convert into coordinates. - inplace : bool, optional - If True, modify this dataset inplace. Otherwise, create a new - object. Returns ------- @@ -1370,13 +1367,13 @@ def set_coords( # DataFrame.set_index? # nb. check in self._variables, not self.data_vars to insure that the # operation is idempotent - inplace = _check_inplace(inplace) + _check_inplace(inplace) if isinstance(names, str) or not isinstance(names, Iterable): names = [names] else: names = list(names) self._assert_all_in_dataset(names) - obj = self if inplace else self.copy() + obj = self.copy() obj._coord_names.update(names) return obj @@ -1396,15 +1393,12 @@ def reset_coords( drop : bool, optional If True, remove coordinates instead of converting them into variables. - inplace : bool, optional - If True, modify this dataset inplace. Otherwise, create a new - object. Returns ------- Dataset """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) if names is None: names = self._coord_names - set(self.dims) else: @@ -1418,7 +1412,7 @@ def reset_coords( raise ValueError( "cannot remove index coordinates with reset_coords: %s" % bad_coords ) - obj = self if inplace else self.copy() + obj = self.copy() obj._coord_names.difference_update(names) if drop: for name in names: @@ -2397,9 +2391,6 @@ def rename( name_dict : dict-like, optional Dictionary whose keys are current variable or dimension names and whose values are the desired names. - inplace : bool, optional - If True, rename variables and dimensions in-place. Otherwise, - return a new dataset object. **names, optional Keyword form of ``name_dict``. One of name_dict or names must be provided. 
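Across this patch the ``inplace`` keyword is stripped from docstrings and return-value notes, and ``_check_inplace`` (updated later in this diff, in ``xarray/core/utils.py``) raises ``TypeError`` whenever the keyword is supplied. A minimal sketch of the resulting behaviour, using a throwaway dataset that is not part of the patch::

    import pytest
    import xarray as xr

    ds = xr.Dataset({"z": ("x", [2, 3, 4])})
    renamed = ds.rename({"x": "y"})          # supported: returns a new Dataset
    with pytest.raises(TypeError):
        ds.rename({"x": "y"}, inplace=True)  # the keyword now raises
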
@@ -2416,7 +2407,7 @@ def rename( Dataset.rename_dims DataArray.rename """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) name_dict = either_dict_or_kwargs(name_dict, names, "rename") for k in name_dict.keys(): if k not in self and k not in self.dims: @@ -2428,9 +2419,7 @@ def rename( variables, coord_names, dims, indexes = self._rename_all( name_dict=name_dict, dims_dict=name_dict ) - return self._replace( - variables, coord_names, dims=dims, indexes=indexes, inplace=inplace - ) + return self._replace(variables, coord_names, dims=dims, indexes=indexes) def rename_dims( self, dims_dict: Mapping[Hashable, Hashable] = None, **dims: Hashable @@ -2520,9 +2509,6 @@ def swap_dims( Dictionary whose keys are current dimension names and whose values are new names. Each value must already be a variable in the dataset. - inplace : bool, optional - If True, swap dimensions in-place. Otherwise, return a new dataset - object. Returns ------- @@ -2537,7 +2523,7 @@ def swap_dims( """ # TODO: deprecate this method in favor of a (less confusing) # rename_dims() method that only renames dimensions. - inplace = _check_inplace(inplace) + _check_inplace(inplace) for k, v in dims_dict.items(): if k not in self.dims: raise ValueError( @@ -2570,9 +2556,7 @@ def swap_dims( var.dims = dims variables[k] = var - return self._replace_with_new_dims( - variables, coord_names, indexes=indexes, inplace=inplace - ) + return self._replace_with_new_dims(variables, coord_names, indexes=indexes) def expand_dims( self, @@ -2746,9 +2730,6 @@ def set_index( append : bool, optional If True, append the supplied index(es) to the existing index(es). Otherwise replace the existing index(es) (default). - inplace : bool, optional - If True, set new index(es) in-place. Otherwise, return a new - Dataset object. **indexes_kwargs: optional The keyword arguments form of ``indexes``. One of indexes or indexes_kwargs must be provided. @@ -2790,14 +2771,12 @@ def set_index( Dataset.reset_index Dataset.swap_dims """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) indexes = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") variables, coord_names = merge_indexes( indexes, self._variables, self._coord_names, append=append ) - return self._replace_vars_and_dims( - variables, coord_names=coord_names, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names=coord_names) def reset_index( self, @@ -2815,9 +2794,6 @@ def reset_index( drop : bool, optional If True, remove the specified indexes and/or multi-index levels instead of extracting them as new coordinates (default: False). - inplace : bool, optional - If True, modify the dataset in-place. Otherwise, return a new - Dataset object. Returns ------- @@ -2828,7 +2804,7 @@ def reset_index( -------- Dataset.set_index """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) variables, coord_names = split_indexes( dims_or_levels, self._variables, @@ -2836,9 +2812,7 @@ def reset_index( cast(Mapping[Hashable, Hashable], self._level_coords), drop=drop, ) - return self._replace_vars_and_dims( - variables, coord_names=coord_names, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names=coord_names) def reorder_levels( self, @@ -2854,9 +2828,6 @@ def reorder_levels( Mapping from names matching dimensions and values given by lists representing new level orders. Every given dimension must have a multi-index. - inplace : bool, optional - If True, modify the dataset in-place. Otherwise, return a new - DataArray object. 
**dim_order_kwargs: optional The keyword arguments form of ``dim_order``. One of dim_order or dim_order_kwargs must be provided. @@ -2867,7 +2838,7 @@ def reorder_levels( Another dataset, with this dataset's data but replaced coordinates. """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) dim_order = either_dict_or_kwargs(dim_order, dim_order_kwargs, "reorder_levels") variables = self._variables.copy() indexes = OrderedDict(self.indexes) @@ -2880,7 +2851,7 @@ def reorder_levels( variables[dim] = IndexVariable(coord.dims, new_index) indexes[dim] = new_index - return self._replace(variables, indexes=indexes, inplace=inplace) + return self._replace(variables, indexes=indexes) def _stack_once(self, dims, new_dim): variables = OrderedDict() @@ -3176,9 +3147,6 @@ def update(self, other: "DatasetLike", inplace: bool = None) -> "Dataset": - mapping {var name: (dimension name, array-like)} - mapping {var name: (tuple of dimension names, array-like)} - inplace : bool, optional - If True, merge the other dataset into this dataset in-place. - Otherwise, return a new dataset object. Returns ------- @@ -3191,12 +3159,10 @@ def update(self, other: "DatasetLike", inplace: bool = None) -> "Dataset": If any dimensions would have inconsistent sizes in the updated dataset. """ - inplace = _check_inplace(inplace, default=True) + _check_inplace(inplace) variables, coord_names, dims = dataset_update_method(self, other) - return self._replace_vars_and_dims( - variables, coord_names, dims, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names, dims, inplace=True) def merge( self, @@ -3218,9 +3184,6 @@ def merge( ---------- other : Dataset or castable to Dataset Dataset or variables to merge with this dataset. - inplace : bool, optional - If True, merge the other dataset into this dataset in-place. - Otherwise, return a new dataset object. overwrite_vars : Hashable or iterable of Hashable, optional If provided, update variables of these name(s) without checking for conflicts in this dataset. @@ -3257,7 +3220,7 @@ def merge( MergeError If any variables conflict (see ``compat``). """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) variables, coord_names, dims = dataset_merge_method( self, other, @@ -3267,9 +3230,7 @@ def merge( fill_value=fill_value, ) - return self._replace_vars_and_dims( - variables, coord_names, dims, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names, dims) def _assert_all_in_dataset( self, names: Iterable[Hashable], virtual_okay: bool = False diff --git a/xarray/core/utils.py b/xarray/core/utils.py index bf8c9a264e3..a8bb5532f1f 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -36,19 +36,13 @@ T = TypeVar("T") -def _check_inplace(inplace: Optional[bool], default: bool = False) -> bool: - if inplace is None: - inplace = default - else: - warnings.warn( - "The inplace argument has been deprecated and will be " - "removed in a future version of xarray.", - FutureWarning, - stacklevel=3, +def _check_inplace(inplace: Optional[bool]) -> None: + if inplace is not None: + raise TypeError( + "The `inplace` argument has been removed from xarray. " + "You can achieve an identical effect with python's standard assignment." ) - return inplace - def alias_message(old_name: str, new_name: str) -> str: return "%s has been deprecated. Use %s instead." 
% (old_name, new_name) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 6a68fb73837..798ef66c82e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1291,9 +1291,8 @@ def test_reset_coords(self): ) assert_identical(actual, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): - with raises_regex(ValueError, "cannot reset coord"): - data = data.reset_coords(inplace=True) + with pytest.raises(TypeError): + data = data.reset_coords(inplace=True) with raises_regex(ValueError, "cannot be found"): data.reset_coords("foo", drop=True) with raises_regex(ValueError, "cannot be found"): @@ -1702,10 +1701,9 @@ def test_reorder_levels(self): obj = self.mda.reorder_levels(x=["level_2", "level_1"]) assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): array = self.mda.copy() array.reorder_levels(x=["level_2", "level_1"], inplace=True) - assert_identical(array, expected) array = DataArray([1, 2], dims="x") with pytest.raises(KeyError): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 82ae65d955f..a5d9a65d020 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2325,18 +2325,11 @@ def test_rename_same_name(self): renamed = data.rename(newnames) assert_identical(renamed, data) - @pytest.mark.filterwarnings("ignore:The inplace argument") def test_rename_inplace(self): times = pd.date_range("2000-01-01", periods=3) data = Dataset({"z": ("x", [2, 3, 4]), "t": ("t", times)}) - copied = data.copy() - renamed = data.rename({"x": "y"}) - data.rename({"x": "y"}, inplace=True) - assert_identical(data, renamed) - assert not data.equals(copied) - assert data.dims == {"y": 3, "t": 3} - # check virtual variables - assert_array_equal(data["t.dayofyear"], [1, 2, 3]) + with pytest.raises(TypeError): + data.rename({"x": "y"}, inplace=True) def test_rename_dims(self): original = Dataset({"x": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42}) @@ -2599,7 +2592,7 @@ def test_set_index(self): obj = ds.set_index(x=mindex.names) assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.set_index(x=mindex.names, inplace=True) assert_identical(ds, expected) @@ -2624,9 +2617,8 @@ def test_reset_index(self): obj = ds.reset_index("x") assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.reset_index("x", inplace=True) - assert_identical(ds, expected) def test_reorder_levels(self): ds = create_test_multiindex() @@ -2637,9 +2629,8 @@ def test_reorder_levels(self): reindexed = ds.reorder_levels(x=["level_2", "level_1"]) assert_identical(reindexed, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.reorder_levels(x=["level_2", "level_1"], inplace=True) - assert_identical(ds, expected) ds = Dataset({}, coords={"x": [1, 2]}) with raises_regex(ValueError, "has no MultiIndex"): @@ -2779,11 +2770,8 @@ def test_update(self): assert actual_result is actual assert_identical(expected, actual) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): actual = data.update(data, inplace=False) - expected = data - assert actual is not expected - assert_identical(expected, actual) other = Dataset(attrs={"new": "attr"}) actual = data.copy() From 
3c020e58d3bf40101b0f4968f155e4eaa9d70e62 Mon Sep 17 00:00:00 2001 From: Gerardo Rivera Date: Mon, 26 Aug 2019 15:36:35 -0500 Subject: [PATCH 11/43] Initialize empty or full DataArray (#3159) * TST: add test for DataArray init with a single value * ENH: add empty and full DataArray initialization * Update whats-new * Remove ValueError test * Add function to verify and fill array according to coordinates * Use item in numpy array to compare with None * ENH: add empty and full DataArray initialization * Add function to verify and fill array according to coordinates * Handle coords being a list of tuples * Use .shape to identify scalar arrays * Better handling of dims * Remove conditionals over shape value * Ignore 0d arrays * Fill array with NaN when no data given * Add more tests * black * black2 * Type check for ExplicitlyIndexed objects * Change parameter name * Remove Optional * Remove abbreviation * Use as_variable * Pass tuples explicitly to coords in test * Tests for 0d * Move ExplicitlyIndexed check into is_scalar * Update utils.py --- doc/whats-new.rst | 5 +++++ xarray/core/dataarray.py | 21 ++++++++++++++++++++- xarray/core/utils.py | 12 +++++++----- xarray/tests/test_dataarray.py | 26 ++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index dbd3ebbfe7e..fcb14a7c29a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,6 +75,11 @@ Enhancements - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if ``append_dim`` is set, as it will automatically be set to ``'a'`` internally. By `David Brochart `_. + +- Added the ability to initialize an empty or full DataArray + with a single value. (:issue:`277`) + By `Gerardo Rivera `_. + - :py:func:`~xarray.Dataset.to_netcdf()` now supports the ``invalid_netcdf`` kwarg when used with ``engine="h5netcdf"``. It is passed to :py:func:`h5netcdf.File`. By `Ulrich Herter `_. 
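The ``dataarray.py`` changes below add ``_check_data_shape``, which broadcasts a scalar (or omitted) ``data`` argument against the supplied coordinates. A small sketch of the usage this enables, with values chosen purely for illustration::

    import numpy as np
    import xarray as xr

    # scalar data is broadcast to the shape implied by the coords
    full = xr.DataArray(3, dims=["x", "y"], coords=[range(3), range(4)])

    # omitting ``data`` gives an array filled with NaN
    empty = xr.DataArray(coords=[("x", np.arange(10)), ("y", ["a", "b"])])
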
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e3e543b6621..4b8d8acb513 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -158,6 +158,24 @@ def _infer_coords_and_dims( return new_coords, dims +def _check_data_shape(data, coords, dims): + if data is dtypes.NA: + data = np.nan + if coords is not None and utils.is_scalar(data, include_0d=False): + if utils.is_dict_like(coords): + if dims is None: + return data + else: + data_shape = tuple( + as_variable(coords[k], k).size if k in coords.keys() else 1 + for k in dims + ) + else: + data_shape = tuple(as_variable(coord, "foo").size for coord in coords) + data = np.full(data_shape, data) + return data + + class _LocIndexer: def __init__(self, data_array: "DataArray"): self.data_array = data_array @@ -234,7 +252,7 @@ class DataArray(AbstractArray, DataWithCoords): def __init__( self, - data: Any, + data: Any = dtypes.NA, coords: Union[Sequence[Tuple], Mapping[Hashable, Any], None] = None, dims: Union[Hashable, Sequence[Hashable], None] = None, name: Hashable = None, @@ -323,6 +341,7 @@ def __init__( if encoding is None: encoding = getattr(data, "encoding", None) + data = _check_data_shape(data, coords, dims) data = as_compatible_data(data) coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, encoding, fastpath=True) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index a8bb5532f1f..9e0037b4da0 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -29,8 +29,6 @@ import numpy as np import pandas as pd -from .pycompat import dask_array_type - K = TypeVar("K") V = TypeVar("V") T = TypeVar("T") @@ -269,16 +267,20 @@ def either_dict_or_kwargs( return cast(Mapping[Hashable, T], kw_kwargs) -def is_scalar(value: Any) -> bool: +def is_scalar(value: Any, include_0d: bool = True) -> bool: """Whether to treat a value as a scalar. 
Any non-iterable, string, or 0-D array """ + from .variable import NON_NUMPY_SUPPORTED_ARRAY_TYPES + + if include_0d: + include_0d = getattr(value, "ndim", None) == 0 return ( - getattr(value, "ndim", None) == 0 + include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + dask_array_type) + isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES) or hasattr(value, "__array_function__") ) ) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 798ef66c82e..f623ec9976f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1446,6 +1446,32 @@ def test_rename(self): renamed_kwargs = self.dv.x.rename(x="z").rename("z") assert_identical(renamed, renamed_kwargs) + def test_init_value(self): + expected = DataArray( + np.full((3, 4), 3), dims=["x", "y"], coords=[range(3), range(4)] + ) + actual = DataArray(3, dims=["x", "y"], coords=[range(3), range(4)]) + assert_identical(expected, actual) + + expected = DataArray( + np.full((1, 10, 2), 0), + dims=["w", "x", "y"], + coords={"x": np.arange(10), "y": ["north", "south"]}, + ) + actual = DataArray(0, dims=expected.dims, coords=expected.coords) + assert_identical(expected, actual) + + expected = DataArray( + np.full((10, 2), np.nan), coords=[("x", np.arange(10)), ("y", ["a", "b"])] + ) + actual = DataArray(coords=[("x", np.arange(10)), ("y", ["a", "b"])]) + assert_identical(expected, actual) + + with pytest.raises(KeyError): + DataArray(np.array(1), coords={"x": np.arange(10)}, dims=["x"]) + with raises_regex(ValueError, "does not match the 0 dim"): + DataArray(np.array(1), coords=[("x", np.arange(10))]) + def test_swap_dims(self): array = DataArray(np.random.randn(3), {"y": ("x", list("abc"))}, "x") expected = DataArray(array.values, {"y": list("abc")}, dims="y") From 851f763aedd09f9cbaa163aec693f864ed9b5efe Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 26 Aug 2019 18:00:43 -0600 Subject: [PATCH 12/43] Sparse fixes for reindex (#3255) * Fix sparse warnings. * Fix reindex with sparse. * remove unnecessary todense * refactor. * pep8 * whitespace. * Fix tests. * Fix docstring. * Review comments. * Fix test. * Remove some xfail marks. * whats-new * Better assertion func. * silence warning. --- doc/whats-new.rst | 4 +- xarray/core/indexing.py | 50 +++++++----- xarray/core/variable.py | 3 +- xarray/tests/test_indexing.py | 10 ++- xarray/tests/test_sparse.py | 138 ++++++++++------------------------ 5 files changed, 80 insertions(+), 125 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index fcb14a7c29a..34f002086da 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,8 +39,8 @@ New functions/methods `NEP18 `_ compliant numpy-like library (important: read notes about NUMPY_EXPERIMENTAL_ARRAY_FUNCTION in the above link). Added explicit test coverage for - `sparse `_. (:issue:`3117`, :issue:`3202`) - By `Nezar Abdennur `_ + `sparse `_. (:issue:`3117`, :issue:`3202`). + This requires `sparse>=0.8.0`. By `Nezar Abdennur `_ and `Guido Imperiale `_. - The xarray package is now discoverable by mypy (although typing hints coverage is not diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d5cd5eb9e8f..f6570149484 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -10,7 +10,7 @@ from . 
import duck_array_ops, nputils, utils from .npcompat import DTypeLike -from .pycompat import dask_array_type, integer_types +from .pycompat import dask_array_type, integer_types, sparse_array_type from .utils import is_dict_like, maybe_cast_to_coords_dtype @@ -1076,19 +1076,30 @@ def _logical_any(args): return functools.reduce(operator.or_, args) -def _masked_result_drop_slice(key, chunks_hint=None): +def _masked_result_drop_slice(key, data=None): + key = (k for k in key if not isinstance(k, slice)) - if chunks_hint is not None: - key = [ - _dask_array_with_chunks_hint(k, chunks_hint) - if isinstance(k, np.ndarray) - else k - for k in key - ] - return _logical_any(k == -1 for k in key) + chunks_hint = getattr(data, "chunks", None) + + new_keys = [] + for k in key: + if isinstance(k, np.ndarray): + if isinstance(data, dask_array_type): + new_keys.append(_dask_array_with_chunks_hint(k, chunks_hint)) + elif isinstance(data, sparse_array_type): + import sparse + + new_keys.append(sparse.COO.from_numpy(k)) + else: + new_keys.append(k) + else: + new_keys.append(k) + + mask = _logical_any(k == -1 for k in new_keys) + return mask -def create_mask(indexer, shape, chunks_hint=None): +def create_mask(indexer, shape, data=None): """Create a mask for indexing with a fill-value. Parameters @@ -1098,25 +1109,24 @@ def create_mask(indexer, shape, chunks_hint=None): the result that should be masked. shape : tuple Shape of the array being indexed. - chunks_hint : tuple, optional - Optional tuple indicating desired chunks for the result. If provided, - used as a hint for chunks on the resulting dask. Must have a hint for - each dimension on the result array. + data : optional + Data for which mask is being created. If data is a dask arrays, its chunks + are used as a hint for chunks on the resulting mask. If data is a sparse + array, the returned mask is also a sparse array. Returns ------- - mask : bool, np.ndarray or dask.array.Array with dtype=bool - Dask array if chunks_hint is provided, otherwise a NumPy array. Has the - same shape as the indexing result. + mask : bool, np.ndarray, SparseArray or dask.array.Array with dtype=bool + Same type as data. Has the same shape as the indexing result. 
""" if isinstance(indexer, OuterIndexer): key = _outer_to_vectorized_indexer(indexer, shape).tuple assert not any(isinstance(k, slice) for k in key) - mask = _masked_result_drop_slice(key, chunks_hint) + mask = _masked_result_drop_slice(key, data) elif isinstance(indexer, VectorizedIndexer): key = indexer.tuple - base_mask = _masked_result_drop_slice(key, chunks_hint) + base_mask = _masked_result_drop_slice(key, data) slice_shape = tuple( np.arange(*k.indices(size)).size for k, size in zip(key, shape) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bc8da10dd0c..c64dd8af6c6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -710,8 +710,7 @@ def _getitem_with_mask(self, key, fill_value=dtypes.NA): actual_indexer = indexer data = as_indexable(self._data)[actual_indexer] - chunks_hint = getattr(data, "chunks", None) - mask = indexing.create_mask(indexer, self.shape, chunks_hint) + mask = indexing.create_mask(indexer, self.shape, data) data = duck_array_ops.where(mask, fill_value, data) else: # array cannot be indexed along dimensions of size 0, so just diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index f37f8d98ca8..82ee9b63f9d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -708,7 +708,9 @@ def test_create_mask_dask(): indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]))) expected = np.array(2 * [[False, True, False]]) - actual = indexing.create_mask(indexer, (5, 5, 5), chunks_hint=((1, 1), (2, 1))) + actual = indexing.create_mask( + indexer, (5, 5, 5), da.empty((2, 3), chunks=((1, 1), (2, 1))) + ) assert actual.chunks == ((1, 1), (2, 1)) np.testing.assert_array_equal(expected, actual) @@ -716,12 +718,14 @@ def test_create_mask_dask(): (np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])) ) expected = np.array([[False, True, True]] * 2).T - actual = indexing.create_mask(indexer, (5, 2), chunks_hint=((3,), (2,))) + actual = indexing.create_mask( + indexer, (5, 2), da.empty((3, 2), chunks=((3,), (2,))) + ) assert isinstance(actual, da.Array) np.testing.assert_array_equal(expected, actual) with pytest.raises(ValueError): - indexing.create_mask(indexer, (5, 2), chunks_hint=()) + indexing.create_mask(indexer, (5, 2), da.empty((5,), chunks=(1,))) def test_create_mask_error(): diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 36decf49713..80f80a93a1c 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -9,6 +9,7 @@ import xarray.ufuncs as xu from xarray import DataArray, Variable from xarray.core.npcompat import IS_NEP18_ACTIVE +from xarray.core.pycompat import sparse_array_type from . 
import assert_equal, assert_identical @@ -23,6 +24,12 @@ sparse = pytest.importorskip("sparse") +def assert_sparse_equal(a, b): + assert isinstance(a, sparse_array_type) + assert isinstance(b, sparse_array_type) + np.testing.assert_equal(a.todense(), b.todense()) + + def make_ndarray(shape): return np.arange(np.prod(shape)).reshape(shape) @@ -105,21 +112,9 @@ def test_variable_property(prop): (do("to_base_variable"), True), (do("transpose"), True), (do("unstack", dimensions={"x": {"x1": 5, "x2": 2}}), True), - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("identical", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), + (do("identical", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -161,11 +156,7 @@ def test_variable_property(prop): True, marks=xfail(reason="Missing implementation for np.nancumsum"), ), - param( - do("fillna", 0), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("fillna", 0), True), param( do("item", (1, 1)), False, @@ -188,11 +179,7 @@ def test_variable_property(prop): True, # noqa marks=xfail(reason="Missing implementation for np.pad"), ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("prod"), False), param( do("quantile", q=0.5), True, @@ -219,20 +206,12 @@ def test_variable_property(prop): param( do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") ), - param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("sum"), False), param( do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") ), param(do("to_dict"), False, marks=xfail(reason="Coercion to dense")), - param( - do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), - True, - marks=xfail(reason="Coercion of dense to sparse when using sparse mask"), - ), # noqa + (do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), True), ], ids=repr, ) @@ -282,16 +261,18 @@ def setUp(self): self.var = xr.Variable(("x", "y"), self.data) def test_unary_op(self): - sparse.utils.assert_eq(-self.var.data, -self.data) - sparse.utils.assert_eq(abs(self.var).data, abs(self.data)) - sparse.utils.assert_eq(self.var.round().data, self.data.round()) + assert_sparse_equal(-self.var.data, -self.data) + assert_sparse_equal(abs(self.var).data, abs(self.data)) + assert_sparse_equal(self.var.round().data, self.data.round()) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_univariate_ufunc(self): - sparse.utils.assert_eq(np.sin(self.data), xu.sin(self.var).data) + assert_sparse_equal(np.sin(self.data), xu.sin(self.var).data) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_bivariate_ufunc(self): - sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) - sparse.utils.assert_eq(np.maximum(self.data, 0), xu.maximum(0, self.var).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(0, self.var).data) def test_repr(self): expected = 
dedent( @@ -304,9 +285,8 @@ def test_repr(self): def test_pickle(self): v1 = self.var v2 = pickle.loads(pickle.dumps(v1)) - sparse.utils.assert_eq(v1.data, v2.data) + assert_sparse_equal(v1.data, v2.data) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_missing_values(self): a = np.array([0, 1, np.nan, 3]) s = sparse.COO.from_numpy(a) @@ -384,16 +364,8 @@ def test_dataarray_property(prop): # TODO # set_index # swap_dims - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -414,11 +386,7 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.flip"), ), - param( - do("combine_first", make_xrarray({"x": 10, "y": 5})), - True, - marks=xfail(reason="mixed sparse-dense operation"), - ), + (do("combine_first", make_xrarray({"x": 10, "y": 5})), True), param( do("conjugate"), False, @@ -445,16 +413,8 @@ def test_dataarray_property(prop): marks=xfail(reason="Missing implementation for np.einsum"), ), param(do("dropna", "x"), False, marks=xfail(reason="Coercion to dense")), - param( - do("ffill", "x"), - False, - marks=xfail(reason="Coercion to dense via bottleneck.push"), - ), - param( - do("fillna", 0), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + param(do("ffill", "x"), False, marks=xfail(reason="Coercion to dense")), + (do("fillna", 0), True), param( do("interp", coords={"x": np.arange(10) + 0.5}), True, @@ -489,17 +449,9 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.nanmedian"), ), - param(do("notnull"), True), - param( - do("pipe", np.sum, axis=1), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("notnull"), True), + (do("pipe", np.sum, axis=1), True), + (do("prod"), False), param( do("quantile", q=0.5), False, @@ -526,11 +478,7 @@ def test_dataarray_property(prop): True, marks=xfail(reason="Indexing COO with more than one iterable index"), ), # noqa - param( - do("roll", x=2), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("roll", x=2, roll_coords=True), True), param( do("sel", x=[0, 1, 2], y=[2, 3]), True, @@ -539,11 +487,7 @@ def test_dataarray_property(prop): param( do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") ), - param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("sum"), False), param( do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") ), @@ -606,7 +550,6 @@ def setUp(self): self.ds_ar, coords={"x": range(4)}, dims=("x", "y"), name="foo" ) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_to_dataset_roundtrip(self): x = self.sp_xr assert_equal(x, x.to_dataset("x").to_array("x")) @@ -657,7 +600,6 @@ def test_align_2d(self): assert np.all(B1.coords["x"] == B2.coords["x"]) assert np.all(B1.coords["y"] == B2.coords["y"]) - @pytest.mark.xfail(reason="fill value leads to sparse-dense operation") def test_align_outer(self): a1 = 
xr.DataArray( sparse.COO.from_numpy(np.arange(4)), @@ -672,22 +614,21 @@ def test_align_outer(self): a2, b2 = xr.align(a1, b1, join="outer") assert isinstance(a2.data, sparse.SparseArray) assert isinstance(b2.data, sparse.SparseArray) - assert np.all(a2.coords["x"].data == ["a", "b", "c", "d"]) - assert np.all(b2.coords["x"].data == ["a", "b", "c", "d"]) + assert np.all(a2.coords["x"].data == ["a", "b", "c", "d", "e"]) + assert np.all(b2.coords["x"].data == ["a", "b", "c", "d", "e"]) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_concat(self): ds1 = xr.Dataset(data_vars={"d": self.sp_xr}) ds2 = xr.Dataset(data_vars={"d": self.sp_xr}) ds3 = xr.Dataset(data_vars={"d": self.sp_xr}) out = xr.concat([ds1, ds2, ds3], dim="x") - sparse.utils.assert_eq( + assert_sparse_equal( out["d"].data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=0), ) out = xr.concat([self.sp_xr, self.sp_xr, self.sp_xr], dim="y") - sparse.utils.assert_eq( + assert_sparse_equal( out.data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=1) ) @@ -706,6 +647,7 @@ def test_stack(self): roundtripped = stacked.unstack() assert arr.identical(roundtripped) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_ufuncs(self): x = self.sp_xr assert_equal(np.sin(x), xu.sin(x)) @@ -830,8 +772,8 @@ def test_groupby_first(self): def test_groupby_bins(self): x1 = self.ds_xr x2 = self.sp_xr - m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum() - m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum() + m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) + m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) assert isinstance(m2.data, sparse.SparseArray) assert np.allclose(m1.data, m2.data.todense()) From 3f4b0250e386f08233f4e11dc1cd4b12cfa953b0 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 27 Aug 2019 02:54:25 -0600 Subject: [PATCH 13/43] sparse=True option for from_dataframe and from_series (#3210) sparse=True option for from_dataframe and from_series Fixes https://github.com/pydata/xarray/issues/3206 --- doc/whats-new.rst | 6 ++ xarray/core/dataarray.py | 28 +++++--- xarray/core/dataset.py | 101 +++++++++++++++++++++++----- xarray/tests/__init__.py | 1 + xarray/tests/test_dataarray.py | 14 ++++ xarray/tests/test_dataset.py | 23 +++++++ xarray/tests/test_duck_array_ops.py | 4 +- 7 files changed, 148 insertions(+), 29 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 34f002086da..75a273bfdb4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,6 +43,12 @@ New functions/methods This requires `sparse>=0.8.0`. By `Nezar Abdennur `_ and `Guido Imperiale `_. +- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now + support ``sparse=True`` for converting pandas objects into xarray objects + wrapping sparse arrays. This is particularly useful with sparsely populated + hierarchical indexes. (:issue:`3206`) + By `Stephan Hoyer `_. + - The xarray package is now discoverable by mypy (although typing hints coverage is not complete yet). mypy type checking is now enforced by CI. 
Libraries that depend on xarray and use mypy can now remove from their setup.cfg the lines:: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4b8d8acb513..4f78ae7d021 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -733,7 +733,7 @@ def reset_coords( else: if self.name is None: raise ValueError( - "cannot reset_coords with drop=False " "on an unnamed DataArrray" + "cannot reset_coords with drop=False on an unnamed DataArrray" ) dataset[self.name] = self.variable return dataset @@ -1448,9 +1448,7 @@ def expand_dims( This object, but with an additional dimension(s). """ if isinstance(dim, int): - raise TypeError( - "dim should be hashable or sequence/mapping of " "hashables" - ) + raise TypeError("dim should be hashable or sequence/mapping of hashables") elif isinstance(dim, Sequence) and not isinstance(dim, str): if len(dim) != len(set(dim)): raise ValueError("dims should not contain duplicate values.") @@ -2277,19 +2275,27 @@ def from_dict(cls, d: dict) -> "DataArray": return obj @classmethod - def from_series(cls, series: pd.Series) -> "DataArray": + def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray": """Convert a pandas.Series into an xarray.DataArray. If the series's index is a MultiIndex, it will be expanded into a tensor product of one-dimensional coordinates (filling in missing values with NaN). Thus this operation should be the inverse of the `to_series` method. + + If sparse=True, creates a sparse array instead of a dense NumPy array. + Requires the pydata/sparse package. + + See also + -------- + xarray.Dataset.from_dataframe """ - # TODO: add a 'name' parameter - name = series.name - df = pd.DataFrame({name: series}) - ds = Dataset.from_dataframe(df) - return ds[name] + temp_name = "__temporary_name" + df = pd.DataFrame({temp_name: series}) + ds = Dataset.from_dataframe(df, sparse=sparse) + result = cast(DataArray, ds[temp_name]) + result.name = series.name + return result def to_cdms2(self) -> "cdms2_Variable": """Convert this array into a cdms2.Variable @@ -2704,7 +2710,7 @@ def dot( """ if isinstance(other, Dataset): raise NotImplementedError( - "dot products are not yet supported " "with Dataset objects." + "dot products are not yet supported with Dataset objects." ) if not isinstance(other, DataArray): raise TypeError("dot only operates on DataArrays.") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e62b6612ae6..14237a244fd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1214,12 +1214,13 @@ def loc(self) -> _LocIndexer: """ return _LocIndexer(self) - def __getitem__(self, key: object) -> "Union[DataArray, Dataset]": + def __getitem__(self, key: Any) -> "Union[DataArray, Dataset]": """Access variables or coordinates this dataset as a :py:class:`~xarray.DataArray`. Indexing with a list of names will return a new ``Dataset`` object. """ + # TODO(shoyer): type this properly: https://github.com/python/mypy/issues/7328 if utils.is_dict_like(key): return self.isel(**cast(Mapping, key)) @@ -3916,8 +3917,61 @@ def to_dataframe(self): """ return self._to_dataframe(self.dims) + def _set_sparse_data_from_dataframe( + self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...] 
+ ) -> None: + from sparse import COO + + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + try: + codes = idx.codes + except AttributeError: + # deprecated since pandas 0.24 + codes = idx.labels + coords = np.stack([np.asarray(code) for code in codes], axis=0) + is_sorted = idx.is_lexsorted + else: + coords = np.arange(idx.size).reshape(1, -1) + is_sorted = True + + for name, series in dataframe.items(): + # Cast to a NumPy array first, in case the Series is a pandas + # Extension array (which doesn't have a valid NumPy dtype) + values = np.asarray(series) + + # In virtually all real use cases, the sparse array will now have + # missing values and needs a fill_value. For consistency, don't + # special case the rare exceptions (e.g., dtype=int without a + # MultiIndex). + dtype, fill_value = dtypes.maybe_promote(values.dtype) + values = np.asarray(values, dtype=dtype) + + data = COO( + coords, + values, + shape, + has_duplicates=False, + sorted=is_sorted, + fill_value=fill_value, + ) + self[name] = (dims, data) + + def _set_numpy_data_from_dataframe( + self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...] + ) -> None: + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + # expand the DataFrame to include the product of all levels + full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) + dataframe = dataframe.reindex(full_idx) + + for name, series in dataframe.items(): + data = np.asarray(series).reshape(shape) + self[name] = (dims, data) + @classmethod - def from_dataframe(cls, dataframe): + def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset": """Convert a pandas.DataFrame into an xarray.Dataset Each column will be converted into an independent variable in the @@ -3926,7 +3980,24 @@ def from_dataframe(cls, dataframe): values with NaN). This method will produce a Dataset very similar to that on which the 'to_dataframe' method was called, except with possibly redundant dimensions (since all dataset variables will have - the same dimensionality). + the same dimensionality) + + Parameters + ---------- + dataframe : pandas.DataFrame + DataFrame from which to copy data and indices. + sparse : bool + If true, create a sparse arrays instead of dense numpy arrays. This + can potentially save a large amount of memory if the DataFrame has + a MultiIndex. Requires the sparse package (sparse.pydata.org). + + Returns + ------- + New Dataset. 
+ + See also + -------- + xarray.DataArray.from_series """ # TODO: Add an option to remove dimensions along which the variables # are constant, to enable consistent serialization to/from a dataframe, @@ -3939,25 +4010,23 @@ def from_dataframe(cls, dataframe): obj = cls() if isinstance(idx, pd.MultiIndex): - # it's a multi-index - # expand the DataFrame to include the product of all levels - full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) - dataframe = dataframe.reindex(full_idx) - dims = [ + dims = tuple( name if name is not None else "level_%i" % n for n, name in enumerate(idx.names) - ] + ) for dim, lev in zip(dims, idx.levels): obj[dim] = (dim, lev) - shape = [lev.size for lev in idx.levels] + shape = tuple(lev.size for lev in idx.levels) else: - dims = (idx.name if idx.name is not None else "index",) - obj[dims[0]] = (dims, idx) - shape = -1 + index_name = idx.name if idx.name is not None else "index" + dims = (index_name,) + obj[index_name] = (dims, idx) + shape = (idx.size,) - for name, series in dataframe.items(): - data = np.asarray(series).reshape(shape) - obj[name] = (dims, data) + if sparse: + obj._set_sparse_data_from_dataframe(dataframe, dims, shape) + else: + obj._set_numpy_data_from_dataframe(dataframe, dims, shape) return obj def to_dask_dataframe(self, dim_order=None, set_index=False): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fb4f8200e08..ab1d2714b9d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -84,6 +84,7 @@ def LooseVersion(vstring): has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") +has_sparse, requires_sparse = _importorskip("sparse") # some special cases has_h5netcdf07, requires_h5netcdf07 = _importorskip("h5netcdf", minversion="0.7") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index f623ec9976f..532cc32376a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -29,6 +29,7 @@ requires_np113, requires_numbagg, requires_scipy, + requires_sparse, source_ndarray, ) @@ -3398,6 +3399,19 @@ def test_to_and_from_series(self): expected_da = self.dv.rename(None) assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"])) + @requires_sparse + def test_from_series_sparse(self): + import sparse + + series = pd.Series([1, 2], index=[("a", 1), ("b", 2)]) + + actual_sparse = DataArray.from_series(series, sparse=True) + actual_dense = DataArray.from_series(series, sparse=False) + + assert isinstance(actual_sparse.data, sparse.COO) + actual_sparse.data = actual_sparse.data.todense() + assert_identical(actual_sparse, actual_dense) + def test_to_and_from_empty_series(self): # GH697 expected = pd.Series([]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a5d9a65d020..3c2b9b6ce8f 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -46,6 +46,7 @@ requires_dask, requires_numbagg, requires_scipy, + requires_sparse, source_ndarray, ) @@ -3653,6 +3654,28 @@ def test_to_and_from_dataframe(self): expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) + @requires_sparse + def test_from_dataframe_sparse(self): + import sparse + + df_base = pd.DataFrame( + {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)} + ) + + ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True) + ds_dense = 
Dataset.from_dataframe(df_base.set_index("x"), sparse=False) + assert isinstance(ds_sparse["y"].data, sparse.COO) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["y"].data = ds_sparse["y"].data.todense() + ds_sparse["z"].data = ds_sparse["z"].data.todense() + assert_identical(ds_dense, ds_sparse) + + ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True) + ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["z"].data = ds_sparse["z"].data.todense() + assert_identical(ds_dense, ds_sparse) + def test_to_and_from_empty_dataframe(self): # GH697 expected = pd.DataFrame({"foo": []}) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 725cfe3d506..766a391b57f 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -245,9 +245,9 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): def from_series_or_scalar(se): - try: + if isinstance(se, pd.Series): return DataArray.from_series(se) - except AttributeError: # scalar case + else: # scalar case return DataArray(se) From 8969b5a974723dc13ce93db1e878f791764f539e Mon Sep 17 00:00:00 2001 From: Gregory Gundersen Date: Wed, 28 Aug 2019 01:02:01 +0100 Subject: [PATCH 14/43] Issue 3227: Remove unwanted files created by ipython examples. (#3270) --- doc/dask.rst | 13 +++++++------ doc/io.rst | 7 +++++++ doc/weather-climate.rst | 6 ++++++ doc/whats-new.rst | 4 ++++ 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/doc/dask.rst b/doc/dask.rst index b0ffd0c449d..adf0a6bf585 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -132,6 +132,13 @@ A dataset can also be converted to a Dask DataFrame using :py:meth:`~xarray.Data Dask DataFrames do not support multi-indexes so the coordinate variables from the dataset are included as columns in the Dask DataFrame. +.. ipython:: python + :suppress: + + import os + os.remove('example-data.nc') + os.remove('manipulated-example-data.nc') + Using Dask with xarray ---------------------- @@ -373,12 +380,6 @@ one million elements (e.g., a 1000x1000 matrix). With large arrays (10+ GB), the cost of queueing up Dask operations can be noticeable, and you may need even larger chunksizes. -.. ipython:: python - :suppress: - - import os - os.remove('example-data.nc') - Optimization Tips ----------------- diff --git a/doc/io.rst b/doc/io.rst index 4a61b59ac2a..f7ac8c095b9 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -743,6 +743,13 @@ be done directly from zarr, as described in the .. _io.cfgrib: +.. ipython:: python + :suppress: + + import shutil + shutil.rmtree('foo.zarr') + shutil.rmtree('path/to/directory.zarr') + GRIB format via cfgrib ---------------------- diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index a17ecd2f2a4..96641c2b97e 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -137,6 +137,12 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.to_netcdf('example-no-leap.nc') xr.open_dataset('example-no-leap.nc') +.. ipython:: python + :suppress: + + import os + os.remove('example-no-leap.nc') + - And resampling along the time dimension for data indexed by a :py:class:`~xarray.CFTimeIndex`: .. ipython:: python diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 75a273bfdb4..22971bb9955 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -145,6 +145,10 @@ Documentation or pushing new commits. By `Gregory Gundersen `_. 
+- Fixed documentation to clean up unwanted files created in ``ipython`` examples + (:issue:`3227`). + By `Gregory Gundersen `_. + v0.12.3 (10 July 2019) ---------------------- From aaeea6250b89e3605ee1d1a160ad50d6ed657c7e Mon Sep 17 00:00:00 2001 From: Aidan Heerdegen Date: Wed, 28 Aug 2019 16:45:34 +1000 Subject: [PATCH 15/43] BUG: Fixes GH3215 (#3220) * BUG: Fixes GH3215 Explicit cast to numpy array to avoid np.ravel calling out to dask * Added test * Added assertion to test * Removed assertion. Didn't work * PEP8 and flake fixes * Ran black * Implement suggested fixes * Added assert test * Added valid assert_identical test * add requires_dask --- xarray/core/formatting.py | 2 +- xarray/tests/test_conventions.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 51664fb3e32..c6b2537c958 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -96,7 +96,7 @@ def last_item(array): return [] indexer = (slice(-1, None),) * array.ndim - return np.ravel(array[indexer]).tolist() + return np.ravel(np.asarray(array[indexer])).tolist() def format_timestamp(t): diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 36c1d845f8e..5d80abb4661 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -278,6 +278,26 @@ def test_decode_cf_with_dask(self): ) assert_identical(decoded, conventions.decode_cf(original).compute()) + @requires_dask + def test_decode_dask_times(self): + original = Dataset.from_dict( + { + "coords": {}, + "dims": {"time": 5}, + "data_vars": { + "average_T1": { + "dims": ("time",), + "attrs": {"units": "days since 1958-01-01 00:00:00"}, + "data": [87659.0, 88024.0, 88389.0, 88754.0, 89119.0], + } + }, + } + ) + assert_identical( + conventions.decode_cf(original.chunk()), + conventions.decode_cf(original).chunk(), + ) + class CFEncodedInMemoryStore(WritableCFDataStore, InMemoryDataStore): def encode_variable(self, var): From f8647183c49c806896a2c94d12c5df56cb7f389e Mon Sep 17 00:00:00 2001 From: Gerardo Rivera Date: Thu, 29 Aug 2019 12:08:59 -0500 Subject: [PATCH 16/43] Raise proper error for scalar array when coords is a dict (#3271) * Dims and shape check when coords is a dict * Update test to the correct raise message * Move logic and change raise message --- xarray/core/dataarray.py | 5 +++++ xarray/tests/test_dataarray.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4f78ae7d021..4858e4d0e91 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -113,6 +113,11 @@ def _infer_coords_and_dims( coord = as_variable(coord, name=dims[n]).to_index_variable() dims[n] = coord.name dims = tuple(dims) + elif len(dims) != len(shape): + raise ValueError( + "different number of dimensions on data " + "and dims: %s vs %s" % (len(shape), len(dims)) + ) else: for d in dims: if not isinstance(d, str): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 532cc32376a..5ab05b94177 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1468,7 +1468,7 @@ def test_init_value(self): actual = DataArray(coords=[("x", np.arange(10)), ("y", ["a", "b"])]) assert_identical(expected, actual) - with pytest.raises(KeyError): + with raises_regex(ValueError, "different number of dim"): DataArray(np.array(1), coords={"x": np.arange(10)}, dims=["x"]) with raises_regex(ValueError, "does 
not match the 0 dim"): DataArray(np.array(1), coords=[("x", np.arange(10))]) From 41fecd8658ba50ddda0a52e04c21cec5e53415ac Mon Sep 17 00:00:00 2001 From: crusaderky Date: Thu, 29 Aug 2019 19:14:19 +0200 Subject: [PATCH 17/43] __slots__ (#3250) * Add __slots__ to most classes * Enforced __slots__for all classes; remove _initialized * Speed up __setattr__ * Fix accessors * DeprecationWarning -> FutureWarning * IndexingSupport enum * What's New * Unit tests * Trivial docstrings and comments tweak * Don't expose accessors in Dataset._replace() --- doc/whats-new.rst | 18 ++++++ xarray/backends/api.py | 2 + xarray/backends/common.py | 10 ++++ xarray/backends/netCDF4_.py | 18 ++++++ xarray/backends/zarr.py | 11 ++++ xarray/conventions.py | 4 ++ xarray/core/accessor_str.py | 2 + xarray/core/arithmetic.py | 2 + xarray/core/common.py | 89 ++++++++++++++++++++++----- xarray/core/computation.py | 8 +++ xarray/core/coordinates.py | 14 +++-- xarray/core/dataarray.py | 7 ++- xarray/core/dataset.py | 19 +++++- xarray/core/extensions.py | 15 +++-- xarray/core/groupby.py | 30 +++++++++- xarray/core/indexes.py | 2 + xarray/core/indexing.py | 106 ++++++++++++++++++++++++--------- xarray/core/rolling.py | 26 +++++--- xarray/core/utils.py | 12 +++- xarray/core/variable.py | 4 ++ xarray/plot/facetgrid.py | 1 - xarray/plot/plot.py | 2 + xarray/testing.py | 3 - xarray/tests/test_dataarray.py | 22 +++++++ xarray/tests/test_dataset.py | 22 +++++++ 25 files changed, 373 insertions(+), 76 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 22971bb9955..8796c79da4c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,6 +31,24 @@ Breaking changes - The ``inplace`` kwarg for public methods now raises an error, having been deprecated since v0.11.0. By `Maximilian Roos `_ +- Most xarray objects now define ``__slots__``. This reduces overall RAM usage by ~22% + (not counting the underlying numpy buffers); on CPython 3.7/x64, a trivial DataArray + has gone down from 1.9kB to 1.5kB. + + Caveats: + + - Pickle streams produced by older versions of xarray can't be loaded using this + release, and vice versa. + - Any user code that was accessing the ``__dict__`` attribute of + xarray objects will break. The best practice to attach custom metadata to xarray + objects is to use the ``attrs`` dictionary. + - Any user code that defines custom subclasses of xarray classes must now explicitly + define ``__slots__`` itself. Subclasses that don't add any attributes must state so + by defining ``__slots__ = ()`` right after the class header. + Omitting ``__slots__`` will now cause a ``FutureWarning`` to be logged, and a hard + crash in a later release. + + (:issue:`3250`) by `Guido Imperiale `_. 
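  As an illustration of the new subclassing requirement described in the entry above (an editorial sketch, not part of this patch; ``TaggedArray`` and its ``_tag`` attribute are hypothetical names)::

      import numpy as np
      import xarray as xr

      class TaggedArray(xr.DataArray):
          # Custom attributes must be declared in __slots__; a subclass that
          # adds nothing would instead state __slots__ = ().
          __slots__ = ("_tag",)

          def __init__(self, *args, tag=None, **kwargs):
              super().__init__(*args, **kwargs)
              self._tag = tag  # allowed because "_tag" is declared in __slots__ above

      arr = TaggedArray(np.arange(3), dims="x", tag="velocity")

  With ``__slots__`` defined, assigning an attribute that is not declared (e.g. ``arr.foo = ...``) raises ``AttributeError``, which is what catches accidental variable assignments that should have used ``__setitem__``.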
New functions/methods ~~~~~~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9ad1db1829b..a20d3c2a306 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -694,6 +694,8 @@ def open_dataarray( class _MultiFileCloser: + __slots__ = ("file_objs",) + def __init__(self, file_objs): self.file_objs = file_objs diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 7ee11052192..455b77907f9 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -68,12 +68,16 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): + __slots__ = () + def __array__(self, dtype=None): key = indexing.BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) class AbstractDataStore(Mapping): + __slots__ = () + def __iter__(self): return iter(self.variables) @@ -165,6 +169,8 @@ def __exit__(self, exception_type, exception_value, traceback): class ArrayWriter: + __slots__ = ("sources", "targets", "regions", "lock") + def __init__(self, lock=None): self.sources = [] self.targets = [] @@ -205,6 +211,8 @@ def sync(self, compute=True): class AbstractWritableDataStore(AbstractDataStore): + __slots__ = () + def encode(self, variables, attributes): """ Encode the variables and attributes in this store @@ -371,6 +379,8 @@ def set_dimensions(self, variables, unlimited_dims=None): class WritableCFDataStore(AbstractWritableDataStore): + __slots__ = () + def encode(self, variables, attributes): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 57317a7a1a5..813942c2f32 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -30,6 +30,8 @@ class BaseNetCDF4Array(BackendArray): + __slots__ = ("datastore", "dtype", "shape", "variable_name") + def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -52,8 +54,13 @@ def __setitem__(self, key, value): if self.datastore.autoclose: self.datastore.close(needs_lock=False) + def get_array(self, needs_lock=True): + raise NotImplementedError("Virtual Method") + class NetCDF4ArrayWrapper(BaseNetCDF4Array): + __slots__ = () + def get_array(self, needs_lock=True): ds = self.datastore._acquire(needs_lock) variable = ds.variables[self.variable_name] @@ -294,6 +301,17 @@ class NetCDF4DataStore(WritableCFDataStore): This store supports NetCDF3, NetCDF4 and OpenDAP datasets. 
""" + __slots__ = ( + "autoclose", + "format", + "is_remote", + "lock", + "_filename", + "_group", + "_manager", + "_mode", + ) + def __init__( self, manager, group=None, mode=None, lock=NETCDF4_PYTHON_LOCK, autoclose=False ): diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 31997d258c8..9a115de55ef 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -29,6 +29,8 @@ def _encode_zarr_attr_value(value): class ZarrArrayWrapper(BackendArray): + __slots__ = ("datastore", "dtype", "shape", "variable_name") + def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -231,6 +233,15 @@ class ZarrStore(AbstractWritableDataStore): """Store for reading and writing data via zarr """ + __slots__ = ( + "append_dim", + "ds", + "_consolidate_on_close", + "_group", + "_read_only", + "_synchronizer", + ) + @classmethod def open_group( cls, diff --git a/xarray/conventions.py b/xarray/conventions.py index c15e5c40e73..1e40d254e96 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -31,6 +31,8 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): dtype('int16') """ + __slots__ = ("array",) + def __init__(self, array): self.array = indexing.as_indexable(array) @@ -60,6 +62,8 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): dtype('bool') """ + __slots__ = ("array",) + def __init__(self, array): self.array = indexing.as_indexable(array) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 03a6d37b01e..8838e71e6ca 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -75,6 +75,8 @@ class StringAccessor: """ + __slots__ = ("_obj",) + def __init__(self, obj): self._obj = obj diff --git a/xarray/core/arithmetic.py b/xarray/core/arithmetic.py index 5e8c8758ef5..137db034c95 100644 --- a/xarray/core/arithmetic.py +++ b/xarray/core/arithmetic.py @@ -14,6 +14,8 @@ class SupportsArithmetic: Used by Dataset, DataArray, Variable and GroupBy. """ + __slots__ = () + # TODO: implement special methods for arithmetic here rather than injecting # them in xarray/core/ops.py. Ideally, do so by inheriting from # numpy.lib.mixins.NDArrayOperatorsMixin. diff --git a/xarray/core/common.py b/xarray/core/common.py index 2e834492521..ab9e7616ce1 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1,3 +1,4 @@ +import warnings from collections import OrderedDict from contextlib import suppress from textwrap import dedent @@ -35,6 +36,8 @@ class ImplementsArrayReduce: + __slots__ = () + @classmethod def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): if include_skipna: @@ -72,6 +75,8 @@ def wrapped_func(self, dim=None, axis=None, **kwargs): # type: ignore class ImplementsDatasetReduce: + __slots__ = () + @classmethod def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): if include_skipna: @@ -110,6 +115,8 @@ class AbstractArray(ImplementsArrayReduce): """Shared base class for DataArray and Variable. """ + __slots__ = () + def __bool__(self: Any) -> bool: return bool(self.values) @@ -180,7 +187,25 @@ class AttrAccessMixin: """Mixin class that allows getting keys with attribute access """ - _initialized = False + __slots__ = () + + def __init_subclass__(cls): + """Verify that all subclasses explicitly define ``__slots__``. If they don't, + raise error in the core xarray module and a FutureWarning in third-party + extensions. + This check is only triggered in Python 3.6+. 
+ """ + if not hasattr(object.__new__(cls), "__dict__"): + cls.__setattr__ = cls._setattr_slots + elif cls.__module__.startswith("xarray."): + raise AttributeError("%s must explicitly define __slots__" % cls.__name__) + else: + cls.__setattr__ = cls._setattr_dict + warnings.warn( + "xarray subclass %s should explicitly define __slots__" % cls.__name__, + FutureWarning, + stacklevel=2, + ) @property def _attr_sources(self) -> List[Mapping[Hashable, Any]]: @@ -195,7 +220,7 @@ def _item_sources(self) -> List[Mapping[Hashable, Any]]: return [] def __getattr__(self, name: str) -> Any: - if name != "__setstate__": + if name not in {"__dict__", "__setstate__"}: # this avoids an infinite loop when pickle looks for the # __setstate__ attribute before the xarray object is initialized for source in self._attr_sources: @@ -205,20 +230,52 @@ def __getattr__(self, name: str) -> Any: "%r object has no attribute %r" % (type(self).__name__, name) ) - def __setattr__(self, name: str, value: Any) -> None: - if self._initialized: - try: - # Allow setting instance variables if they already exist - # (e.g., _attrs). We use __getattribute__ instead of hasattr - # to avoid key lookups with attribute-style access. - self.__getattribute__(name) - except AttributeError: - raise AttributeError( - "cannot set attribute %r on a %r object. Use __setitem__ " - "style assignment (e.g., `ds['name'] = ...`) instead to " - "assign variables." % (name, type(self).__name__) - ) + # This complicated three-method design boosts overall performance of simple + # operations - particularly DataArray methods that perform a _to_temp_dataset() + # round-trip - by a whopping 8% compared to a single method that checks + # hasattr(self, "__dict__") at runtime before every single assignment (like + # _setattr_py35 does). All of this is just temporary until the FutureWarning can be + # changed into a hard crash. + def _setattr_dict(self, name: str, value: Any) -> None: + """Deprecated third party subclass (see ``__init_subclass__`` above) + """ object.__setattr__(self, name, value) + if name in self.__dict__: + # Custom, non-slotted attr, or improperly assigned variable? + warnings.warn( + "Setting attribute %r on a %r object. Explicitly define __slots__ " + "to suppress this warning for legitimate custom attributes and " + "raise an error when attempting variables assignments." + % (name, type(self).__name__), + FutureWarning, + stacklevel=2, + ) + + def _setattr_slots(self, name: str, value: Any) -> None: + """Objects with ``__slots__`` raise AttributeError if you try setting an + undeclared attribute. This is desirable, but the error message could use some + improvement. + """ + try: + object.__setattr__(self, name, value) + except AttributeError as e: + # Don't accidentally shadow custom AttributeErrors, e.g. + # DataArray.dims.setter + if str(e) != "%r object has no attribute %r" % (type(self).__name__, name): + raise + raise AttributeError( + "cannot set attribute %r on a %r object. Use __setitem__ style" + "assignment (e.g., `ds['name'] = ...`) instead of assigning variables." + % (name, type(self).__name__) + ) from e + + def _setattr_py35(self, name: str, value: Any) -> None: + if hasattr(self, "__dict__"): + return self._setattr_dict(name, value) + return self._setattr_slots(name, value) + + # Overridden in Python >=3.6 by __init_subclass__ + __setattr__ = _setattr_py35 def __dir__(self) -> List[str]: """Provide method name lookup and completion. 
Only provide 'public' @@ -283,6 +340,8 @@ def get_squeeze_dims( class DataWithCoords(SupportsArithmetic, AttrAccessMixin): """Shared base class for Dataset and DataArray.""" + __slots__ = () + _rolling_exp_cls = RollingExp def squeeze( diff --git a/xarray/core/computation.py b/xarray/core/computation.py index da97106098f..424ab5be87a 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -51,6 +51,14 @@ class _UFuncSignature: Core dimension names on each output variable. """ + __slots__ = ( + "input_core_dims", + "output_core_dims", + "_all_input_core_dims", + "_all_output_core_dims", + "_all_core_dims", + ) + def __init__(self, input_core_dims, output_core_dims=((),)): self.input_core_dims = tuple(tuple(a) for a in input_core_dims) self.output_core_dims = tuple(tuple(a) for a in output_core_dims) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 82488f252f4..ddea5739fff 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -35,7 +35,7 @@ class AbstractCoordinates(Mapping[Hashable, "DataArray"]): - _data = None # type: Union["DataArray", "Dataset"] + __slots__ = () def __getitem__(self, key: Hashable) -> "DataArray": raise NotImplementedError() @@ -53,7 +53,7 @@ def dims(self) -> Union[Mapping[Hashable, int], Tuple[Hashable, ...]]: @property def indexes(self) -> Indexes: - return self._data.indexes + return self._data.indexes # type: ignore @property def variables(self): @@ -108,9 +108,9 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: raise ValueError("no valid index for a 0-dimensional object") elif len(ordered_dims) == 1: (dim,) = ordered_dims - return self._data.get_index(dim) + return self._data.get_index(dim) # type: ignore else: - indexes = [self._data.get_index(k) for k in ordered_dims] + indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore names = list(ordered_dims) return pd.MultiIndex.from_product(indexes, names=names) @@ -187,7 +187,7 @@ class DatasetCoordinates(AbstractCoordinates): objects. """ - _data = None # type: Dataset + __slots__ = ("_data",) def __init__(self, dataset: "Dataset"): self._data = dataset @@ -258,7 +258,7 @@ class DataArrayCoordinates(AbstractCoordinates): dimensions and the values given by corresponding DataArray objects. """ - _data = None # type: DataArray + __slots__ = ("_data",) def __init__(self, dataarray: "DataArray"): self._data = dataarray @@ -314,6 +314,8 @@ class LevelCoordinatesSource(Mapping[Hashable, Any]): by any public methods. """ + __slots__ = ("_data",) + def __init__(self, data_object: "Union[DataArray, Dataset]"): self._data = data_object diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4858e4d0e91..e5d53b1943a 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -182,6 +182,8 @@ def _check_data_shape(data, coords, dims): class _LocIndexer: + __slots__ = ("data_array",) + def __init__(self, data_array: "DataArray"): self.data_array = data_array @@ -246,6 +248,8 @@ class DataArray(AbstractArray, DataWithCoords): Dictionary for holding arbitrary metadata. 
""" + __slots__ = ("_accessors", "_coords", "_file_obj", "_name", "_indexes", "_variable") + _groupby_cls = groupby.DataArrayGroupBy _rolling_cls = rolling.DataArrayRolling _coarsen_cls = rolling.DataArrayCoarsen @@ -356,6 +360,7 @@ def __init__( assert isinstance(coords, OrderedDict) self._coords = coords # type: OrderedDict[Any, Variable] self._name = name # type: Optional[Hashable] + self._accessors = None # type: Optional[Dict[str, Any]] # TODO(shoyer): document this argument, once it becomes part of the # public interface. @@ -363,8 +368,6 @@ def __init__( self._file_obj = None - self._initialized = True # type: bool - def _replace( self, variable: Variable = None, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 14237a244fd..f3ad4650b38 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -345,6 +345,8 @@ def as_dataset(obj: Any) -> "Dataset": class DataVariables(Mapping[Hashable, "DataArray"]): + __slots__ = ("_dataset",) + def __init__(self, dataset: "Dataset"): self._dataset = dataset @@ -384,6 +386,8 @@ def _ipython_key_completions_(self): class _LocIndexer: + __slots__ = ("dataset",) + def __init__(self, dataset: "Dataset"): self.dataset = dataset @@ -407,6 +411,17 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): coordinates used for label based indexing. """ + __slots__ = ( + "_accessors", + "_attrs", + "_coord_names", + "_dims", + "_encoding", + "_file_obj", + "_indexes", + "_variables", + ) + _groupby_cls = groupby.DatasetGroupBy _rolling_cls = rolling.DatasetRolling _coarsen_cls = rolling.DatasetCoarsen @@ -485,6 +500,7 @@ def __init__( self._variables = OrderedDict() # type: OrderedDict[Any, Variable] self._coord_names = set() # type: Set[Hashable] self._dims = {} # type: Dict[Any, int] + self._accessors = None # type: Optional[Dict[str, Any]] self._attrs = None # type: Optional[OrderedDict] self._file_obj = None if data_vars is None: @@ -500,7 +516,6 @@ def __init__( self._attrs = OrderedDict(attrs) self._encoding = None # type: Optional[Dict] - self._initialized = True def _set_init_vars_and_dims(self, data_vars, coords, compat): """Set the initial value of Dataset variables and dimensions @@ -839,7 +854,7 @@ def _construct_direct( obj._attrs = attrs obj._file_obj = file_obj obj._encoding = encoding - obj._initialized = True + obj._accessors = None return obj __default = object() diff --git a/xarray/core/extensions.py b/xarray/core/extensions.py index 302a7fb2ec6..f473eaa497d 100644 --- a/xarray/core/extensions.py +++ b/xarray/core/extensions.py @@ -19,6 +19,14 @@ def __get__(self, obj, cls): if obj is None: # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor + + try: + return obj._accessors[self._name] + except TypeError: + obj._accessors = {} + except KeyError: + pass + try: accessor_obj = self._accessor(obj) except AttributeError: @@ -26,11 +34,8 @@ def __get__(self, obj, cls): # raised when initializing the accessor, so we need to raise as # something else (GH933): raise RuntimeError("error initializing %r accessor." % self._name) - # Replace the property with the accessor object. Inspired by: - # http://www.pydanny.com/cached-property.html - # We need to use object.__setattr__ because we overwrite __setattr__ on - # AttrAccessMixin. 
- object.__setattr__(obj, self._name, accessor_obj) + + obj._accessors[self._name] = accessor_obj return accessor_obj diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 5d81b13983d..41de4846e81 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -139,13 +139,24 @@ class _DummyGroup: Should not be user visible. """ + __slots__ = ("name", "coords", "size") + def __init__(self, obj, name, coords): self.name = name self.coords = coords - self.dims = (name,) - self.ndim = 1 self.size = obj.sizes[name] - self.values = range(self.size) + + @property + def dims(self): + return (self.name,) + + @property + def ndim(self): + return 1 + + @property + def values(self): + return range(self.size) def _ensure_1d(group, obj): @@ -216,6 +227,19 @@ class GroupBy(SupportsArithmetic): DataArray.groupby """ + __slots__ = ( + "_full_index", + "_inserted_dims", + "_group", + "_group_dim", + "_group_indices", + "_groups", + "_obj", + "_restore_coord_dims", + "_stacked_dim", + "_unique_coord", + ) + def __init__( self, obj, diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 5917f7c7a2d..94188fabc92 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -11,6 +11,8 @@ class Indexes(collections.abc.Mapping): """Immutable proxy for Dataset or DataArrary indexes.""" + __slots__ = ("_indexes",) + def __init__(self, indexes): """Not for public consumption. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f6570149484..c6a8f6f35e4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1,9 +1,10 @@ +import enum import functools import operator from collections import defaultdict from contextlib import suppress from datetime import timedelta -from typing import Any, Sequence, Tuple, Union +from typing import Any, Callable, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -327,6 +328,8 @@ class ExplicitIndexer: sub-classes BasicIndexer, OuterIndexer or VectorizedIndexer. """ + __slots__ = ("_key",) + def __init__(self, key): if type(self) is ExplicitIndexer: # noqa raise TypeError("cannot instantiate base ExplicitIndexer objects") @@ -359,6 +362,8 @@ class BasicIndexer(ExplicitIndexer): indexed with an integer are dropped from the result. """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -389,6 +394,8 @@ class OuterIndexer(ExplicitIndexer): indexing works like MATLAB/Fortran. """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -432,6 +439,8 @@ class VectorizedIndexer(ExplicitIndexer): https://github.com/numpy/numpy/pull/6256 """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -468,10 +477,15 @@ def __init__(self, key): class ExplicitlyIndexed: - """Mixin to mark support for Indexer subclasses in indexing.""" + """Mixin to mark support for Indexer subclasses in indexing. 
+ """ + + __slots__ = () class ExplicitlyIndexedNDArrayMixin(utils.NDArrayMixin, ExplicitlyIndexed): + __slots__ = () + def __array__(self, dtype=None): key = BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) @@ -480,6 +494,8 @@ def __array__(self, dtype=None): class ImplicitToExplicitIndexingAdapter(utils.NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" + __slots__ = ("array", "indexer_cls") + def __init__(self, array, indexer_cls=BasicIndexer): self.array = as_indexable(array) self.indexer_cls = indexer_cls @@ -502,6 +518,8 @@ class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make basic and outer indexing lazy. """ + __slots__ = ("array", "key") + def __init__(self, array, key=None): """ Parameters @@ -577,6 +595,8 @@ class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make vectorized indexing lazy. """ + __slots__ = ("array", "key") + def __init__(self, array, key): """ Parameters @@ -631,6 +651,8 @@ def _wrap_numpy_scalars(array): class CopyOnWriteArray(ExplicitlyIndexedNDArrayMixin): + __slots__ = ("array", "_copied") + def __init__(self, array): self.array = as_indexable(array) self._copied = False @@ -655,6 +677,8 @@ def __setitem__(self, key, value): class MemoryCachedArray(ExplicitlyIndexedNDArrayMixin): + __slots__ = ("array",) + def __init__(self, array): self.array = _wrap_numpy_scalars(as_indexable(array)) @@ -783,18 +807,24 @@ def _combine_indexers(old_key, shape, new_key): ) -class IndexingSupport: # could inherit from enum.Enum on Python 3 +@enum.unique +class IndexingSupport(enum.Enum): # for backends that support only basic indexer - BASIC = "BASIC" + BASIC = 0 # for backends that support basic / outer indexer - OUTER = "OUTER" + OUTER = 1 # for backends that support outer indexer including at most 1 vector. - OUTER_1VECTOR = "OUTER_1VECTOR" + OUTER_1VECTOR = 2 # for backends that support full vectorized indexer. - VECTORIZED = "VECTORIZED" + VECTORIZED = 3 -def explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method): +def explicit_indexing_adapter( + key: ExplicitIndexer, + shape: Tuple[int, ...], + indexing_support: IndexingSupport, + raw_indexing_method: Callable, +) -> Any: """Support explicit indexing by delegating to a raw indexing method. 
Outer and/or vectorized indexers are supported by indexing a second time @@ -824,7 +854,9 @@ def explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method) return result -def decompose_indexer(indexer, shape, indexing_support): +def decompose_indexer( + indexer: ExplicitIndexer, shape: Tuple[int, ...], indexing_support: IndexingSupport +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: if isinstance(indexer, VectorizedIndexer): return _decompose_vectorized_indexer(indexer, shape, indexing_support) if isinstance(indexer, (BasicIndexer, OuterIndexer)): @@ -848,7 +880,11 @@ def _decompose_slice(key, size): return slice(start, stop, -step), slice(None, None, -1) -def _decompose_vectorized_indexer(indexer, shape, indexing_support): +def _decompose_vectorized_indexer( + indexer: VectorizedIndexer, + shape: Tuple[int, ...], + indexing_support: IndexingSupport, +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: """ Decompose vectorized indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -884,45 +920,49 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): if indexing_support is IndexingSupport.VECTORIZED: return indexer, BasicIndexer(()) - backend_indexer = [] - np_indexer = [] + backend_indexer_elems = [] + np_indexer_elems = [] # convert negative indices - indexer = [ + indexer_elems = [ np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k for k, s in zip(indexer.tuple, shape) ] - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, slice): # If it is a slice, then we will slice it as-is # (but make its step positive) in the backend, # and then use all of it (slice(None)) for the in-memory portion. bk_slice, np_slice = _decompose_slice(k, s) - backend_indexer.append(bk_slice) - np_indexer.append(np_slice) + backend_indexer_elems.append(bk_slice) + np_indexer_elems.append(np_slice) else: # If it is a (multidimensional) np.ndarray, just pickup the used # keys without duplication and store them as a 1d-np.ndarray. oind, vind = np.unique(k, return_inverse=True) - backend_indexer.append(oind) - np_indexer.append(vind.reshape(*k.shape)) + backend_indexer_elems.append(oind) + np_indexer_elems.append(vind.reshape(*k.shape)) - backend_indexer = OuterIndexer(tuple(backend_indexer)) - np_indexer = VectorizedIndexer(tuple(np_indexer)) + backend_indexer = OuterIndexer(tuple(backend_indexer_elems)) + np_indexer = VectorizedIndexer(tuple(np_indexer_elems)) if indexing_support is IndexingSupport.OUTER: return backend_indexer, np_indexer # If the backend does not support outer indexing, # backend_indexer (OuterIndexer) is also decomposed. 
- backend_indexer, np_indexer1 = _decompose_outer_indexer( + backend_indexer1, np_indexer1 = _decompose_outer_indexer( backend_indexer, shape, indexing_support ) np_indexer = _combine_indexers(np_indexer1, shape, np_indexer) - return backend_indexer, np_indexer + return backend_indexer1, np_indexer -def _decompose_outer_indexer(indexer, shape, indexing_support): +def _decompose_outer_indexer( + indexer: Union[BasicIndexer, OuterIndexer], + shape: Tuple[int, ...], + indexing_support: IndexingSupport, +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: """ Decompose outer indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -930,7 +970,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): Parameters ---------- - indexer: VectorizedIndexer + indexer: OuterIndexer or BasicIndexer indexing_support: One of the entries of IndexingSupport Returns @@ -968,7 +1008,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): pos_indexer.append(k + s) else: pos_indexer.append(k) - indexer = pos_indexer + indexer_elems = pos_indexer if indexing_support is IndexingSupport.OUTER_1VECTOR: # some backends such as h5py supports only 1 vector in indexers @@ -977,11 +1017,11 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): (np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) if isinstance(k, np.ndarray) else 0 - for k in indexer + for k in indexer_elems ] array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None - for i, (k, s) in enumerate(zip(indexer, shape)): + for i, (k, s) in enumerate(zip(indexer_elems, shape)): if isinstance(k, np.ndarray) and i != array_index: # np.ndarray key is converted to slice that covers the entire # entries of this key. @@ -1002,7 +1042,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): return (OuterIndexer(tuple(backend_indexer)), OuterIndexer(tuple(np_indexer))) if indexing_support == IndexingSupport.OUTER: - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, slice): # slice: convert positive step slice for backend bk_slice, np_slice = _decompose_slice(k, s) @@ -1024,7 +1064,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): # basic indexer assert indexing_support == IndexingSupport.BASIC - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, np.ndarray): # np.ndarray key is converted to slice that covers the entire # entries of this key. @@ -1199,6 +1239,8 @@ def posify_mask_indexer(indexer): class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a NumPy array to use explicit indexing.""" + __slots__ = ("array",) + def __init__(self, array): # In NumpyIndexingAdapter we only allow to store bare np.ndarray if not isinstance(array, np.ndarray): @@ -1249,6 +1291,8 @@ def __setitem__(self, key, value): class NdArrayLikeIndexingAdapter(NumpyIndexingAdapter): + __slots__ = ("array",) + def __init__(self, array): if not hasattr(array, "__array_function__"): raise TypeError( @@ -1261,6 +1305,8 @@ def __init__(self, array): class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a dask array to support explicit indexing.""" + __slots__ = ("array",) + def __init__(self, array): """ This adapter is created in Variable.__getitem__ in Variable._broadcast_indexes. @@ -1302,6 +1348,8 @@ class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing. 
""" + __slots__ = ("array", "_dtype") + def __init__(self, array: Any, dtype: DTypeLike = None): self.array = utils.safe_cast_to_index(array) if dtype is None: diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 592cae9007e..a812e7472ca 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -43,7 +43,8 @@ class Rolling: DataArray.rolling """ - _attributes = ["window", "min_periods", "center", "dim"] + __slots__ = ("obj", "window", "min_periods", "center", "dim") + _attributes = ("window", "min_periods", "center", "dim") def __init__(self, obj, windows, min_periods=None, center=False): """ @@ -93,17 +94,17 @@ def __init__(self, obj, windows, min_periods=None, center=False): # attributes self.window = window + if min_periods is not None and min_periods <= 0: + raise ValueError("min_periods must be greater than zero or None") self.min_periods = min_periods - if min_periods is None: - self._min_periods = window - else: - if min_periods <= 0: - raise ValueError("min_periods must be greater than zero or None") - self._min_periods = min_periods self.center = center self.dim = dim + @property + def _min_periods(self): + return self.min_periods if self.min_periods is not None else self.window + def __repr__(self): """provide a nice str repr of our rolling object""" @@ -152,6 +153,8 @@ def count(self): class DataArrayRolling(Rolling): + __slots__ = ("window_labels",) + def __init__(self, obj, windows, min_periods=None, center=False): """ Moving window object for DataArray. @@ -381,6 +384,8 @@ def _numpy_or_bottleneck_reduce( class DatasetRolling(Rolling): + __slots__ = ("rollings",) + def __init__(self, obj, windows, min_periods=None, center=False): """ Moving window object for Dataset. @@ -516,7 +521,8 @@ class Coarsen: DataArray.coarsen """ - _attributes = ["windows", "side", "trim_excess"] + __slots__ = ("obj", "boundary", "coord_func", "windows", "side", "trim_excess") + _attributes = ("windows", "side", "trim_excess") def __init__(self, obj, windows, boundary, side, coord_func): """ @@ -569,6 +575,8 @@ def __repr__(self): class DataArrayCoarsen(Coarsen): + __slots__ = () + @classmethod def _reduce_method(cls, func): """ @@ -599,6 +607,8 @@ def wrapped_func(self, **kwargs): class DatasetCoarsen(Coarsen): + __slots__ = () + @classmethod def _reduce_method(cls, func): """ diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 9e0037b4da0..0d730edeaeb 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -376,7 +376,7 @@ class Frozen(Mapping[K, V]): saved under the `mapping` attribute. """ - __slots__ = ["mapping"] + __slots__ = ("mapping",) def __init__(self, mapping: Mapping[K, V]): self.mapping = mapping @@ -407,7 +407,7 @@ class SortedKeysDict(MutableMapping[K, V]): mapping. """ - __slots__ = ["mapping"] + __slots__ = ("mapping",) def __init__(self, mapping: MutableMapping[K, V] = None): self.mapping = {} if mapping is None else mapping @@ -441,6 +441,8 @@ class OrderedSet(MutableSet[T]): elements, like an OrderedDict. """ + __slots__ = ("_ordered_dict",) + def __init__(self, values: AbstractSet[T] = None): self._ordered_dict = OrderedDict() # type: MutableMapping[T, None] if values is not None: @@ -481,6 +483,8 @@ class NdimSizeLenMixin: one that also defines ``ndim``, ``size`` and ``__len__``. """ + __slots__ = () + @property def ndim(self: Any) -> int: return len(self.shape) @@ -505,6 +509,8 @@ class NDArrayMixin(NdimSizeLenMixin): `dtype`, `shape` and `__getitem__`. 
""" + __slots__ = () + @property def dtype(self: Any) -> np.dtype: return self.array.dtype @@ -618,6 +624,8 @@ class HiddenKeyDict(MutableMapping[K, V]): """Acts like a normal dictionary, but hides certain keys. """ + __slots__ = ("_data", "_hidden_keys") + # ``__init__`` method required to create instance from class. def __init__(self, data: MutableMapping[K, V], hidden_keys: Iterable[K]): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c64dd8af6c6..ac4f7052f14 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -267,6 +267,8 @@ class Variable( they can use more complete metadata in context of coordinate labels. """ + __slots__ = ("_dims", "_data", "_attrs", "_encoding") + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): """ Parameters @@ -1936,6 +1938,8 @@ class IndexVariable(Variable): unless another name is given. """ + __slots__ = () + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): super().__init__(dims, data, attrs, encoding, fastpath) if self.ndim != 1: diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index 79f94077c8f..ec51ff26c07 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -67,7 +67,6 @@ class FacetGrid: Contains dictionaries mapping coordinate names to values. None is used as a sentinel value for axes which should remain empty, ie. sometimes the bottom right grid - """ def __init__( diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index 14f03d42fe7..8ca62ef58f1 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -452,6 +452,8 @@ class _PlotMethods: For example, DataArray.plot.imshow """ + __slots__ = ("_da",) + def __init__(self, darray): self._da = darray diff --git a/xarray/testing.py b/xarray/testing.py index fbb5904c678..9fa58b64001 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -197,8 +197,6 @@ def _assert_dataarray_invariants(da: DataArray): if da._indexes is not None: _assert_indexes_invariants_checks(da._indexes, da._coords, da.dims) - assert da._initialized is True - def _assert_dataset_invariants(ds: Dataset): assert isinstance(ds._variables, OrderedDict), type(ds._variables) @@ -235,7 +233,6 @@ def _assert_dataset_invariants(ds: Dataset): assert isinstance(ds._encoding, (type(None), dict)) assert isinstance(ds._attrs, (type(None), OrderedDict)) - assert ds._initialized is True def _assert_internal_invariants(xarray_obj: Union[DataArray, Dataset, Variable],): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5ab05b94177..2fc86d777aa 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4616,3 +4616,25 @@ def test_rolling_exp(da, dim, window_type, window): ) assert_allclose(expected.variable, result.variable) + + +def test_no_dict(): + d = DataArray() + with pytest.raises(AttributeError): + d.__dict__ + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_subclass_slots(): + """Test that DataArray subclasses must explicitly define ``__slots__``. + + .. note:: + As of 0.13.0, this is actually mitigated into a FutureWarning for any class + defined outside of the xarray package. 
+ """ + with pytest.raises(AttributeError) as e: + + class MyArray(DataArray): + pass + + assert str(e.value) == "MyArray must explicitly define __slots__" diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 3c2b9b6ce8f..3953e6c4146 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5706,3 +5706,25 @@ def test_trapz_datetime(dask, which_datetime): actual2 = da.integrate("time", datetime_unit="h") assert_allclose(actual, actual2 / 24.0) + + +def test_no_dict(): + d = Dataset() + with pytest.raises(AttributeError): + d.__dict__ + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_subclass_slots(): + """Test that Dataset subclasses must explicitly define ``__slots__``. + + .. note:: + As of 0.13.0, this is actually mitigated into a FutureWarning for any class + defined outside of the xarray package. + """ + with pytest.raises(AttributeError) as e: + + class MyDS(Dataset): + pass + + assert str(e.value) == "MyDS must explicitly define __slots__" From 683aaf66bb84d380f2f8c37b4e2c49a17e7148b9 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Sat, 31 Aug 2019 05:18:21 -0400 Subject: [PATCH 18/43] Don't set box-forced in Cartopy example. (#3273) It is deprecated in Matplotlib 2.2, removed in 3.1, and appears to have no effect on the result. --- doc/gallery/plot_cartopy_facetgrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/gallery/plot_cartopy_facetgrid.py b/doc/gallery/plot_cartopy_facetgrid.py index a0afa7ad92e..af04ad6856a 100644 --- a/doc/gallery/plot_cartopy_facetgrid.py +++ b/doc/gallery/plot_cartopy_facetgrid.py @@ -41,6 +41,6 @@ ax.set_extent([-160, -30, 5, 75]) # Without this aspect attributes the maps will look chaotic and the # "extent" attribute above will be ignored - ax.set_aspect("equal", "box-forced") + ax.set_aspect("equal") plt.show() From 5c6aebccf4eeedbe9de186836a0913f00ea157db Mon Sep 17 00:00:00 2001 From: Gerardo Rivera Date: Wed, 4 Sep 2019 23:22:23 -0500 Subject: [PATCH 19/43] Add head, tail and thin methods (#3278) * Add head, tail and thin methods * Update api and whats-new * Fix pep8 issues * Fix typo * Tests for DataArray --- doc/api.rst | 6 +++ doc/whats-new.rst | 5 +- xarray/core/dataarray.py | 49 ++++++++++++++++++++ xarray/core/dataset.py | 84 ++++++++++++++++++++++++++++++++++ xarray/tests/test_dataarray.py | 13 ++++++ xarray/tests/test_dataset.py | 32 +++++++++++++ 6 files changed, 188 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index 872e7786e1b..fb6e037a4f2 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -117,6 +117,9 @@ Indexing Dataset.loc Dataset.isel Dataset.sel + Dataset.head + Dataset.tail + Dataset.thin Dataset.squeeze Dataset.interp Dataset.interp_like @@ -279,6 +282,9 @@ Indexing DataArray.loc DataArray.isel DataArray.sel + Dataset.head + Dataset.tail + Dataset.thin DataArray.squeeze DataArray.interp DataArray.interp_like diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8796c79da4c..1e5855df51f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -87,6 +87,9 @@ New functions/methods Currently only :py:meth:`Dataset.plot.scatter` is implemented. By `Yohai Bar Sinai `_ and `Deepak Cherian `_ +- Added `head`, `tail` and `thin` methods to `Dataset` and `DataArray`. (:issue:`319`) + By `Gerardo Rivera `_. + Enhancements ~~~~~~~~~~~~ @@ -102,7 +105,7 @@ Enhancements - Added the ability to initialize an empty or full DataArray with a single value. 
(:issue:`277`) - By `Gerardo Rivera `_. + By `Gerardo Rivera `_. - :py:func:`~xarray.Dataset.to_netcdf()` now supports the ``invalid_netcdf`` kwarg when used with ``engine="h5netcdf"``. It is passed to :py:func:`h5netcdf.File`. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e5d53b1943a..8660fa952b1 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1040,6 +1040,55 @@ def sel( ) return self._from_temp_dataset(ds) + def head( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "DataArray": + """Return a new DataArray whose data is given by the the first `n` + values along the specified dimension(s). + + See Also + -------- + Dataset.head + DataArray.tail + DataArray.thin + """ + + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "head") + ds = self._to_temp_dataset().head(indexers=indexers) + return self._from_temp_dataset(ds) + + def tail( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "DataArray": + """Return a new DataArray whose data is given by the the last `n` + values along the specified dimension(s). + + See Also + -------- + Dataset.tail + DataArray.head + DataArray.thin + """ + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "tail") + ds = self._to_temp_dataset().tail(indexers=indexers) + return self._from_temp_dataset(ds) + + def thin( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "DataArray": + """Return a new DataArray whose data is given by each `n` value + along the specified dimension(s). + + See Also + -------- + Dataset.thin + DataArray.head + DataArray.tail + """ + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "thin") + ds = self._to_temp_dataset().thin(indexers=indexers) + return self._from_temp_dataset(ds) + def broadcast_like( self, other: Union["DataArray", Dataset], exclude: Iterable[Hashable] = None ) -> "DataArray": diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f3ad4650b38..1476c1ba646 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2008,6 +2008,90 @@ def sel( result = self.isel(indexers=pos_indexers, drop=drop) return result._overwrite_indexes(new_indexes) + def head( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with the first `n` values of each array + for the specified dimension(s). + + Parameters + ---------- + indexers : dict, optional + A dict with keys matching dimensions and integer values `n`. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. + + + See Also + -------- + Dataset.tail + Dataset.thin + DataArray.head + """ + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "head") + indexers = {k: slice(val) for k, val in indexers.items()} + return self.isel(indexers) + + def tail( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with the last `n` values of each array + for the specified dimension(s). + + Parameters + ---------- + indexers : dict, optional + A dict with keys matching dimensions and integer values `n`. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. 
+ + + See Also + -------- + Dataset.head + Dataset.thin + DataArray.tail + """ + + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "tail") + indexers = { + k: slice(-val, None) if val != 0 else slice(val) + for k, val in indexers.items() + } + return self.isel(indexers) + + def thin( + self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with each array indexed along every `n`th + value for the specified dimension(s) + + Parameters + ---------- + indexers : dict, optional + A dict with keys matching dimensions and integer values `n`. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. + + + See Also + -------- + Dataset.head + Dataset.tail + DataArray.thin + """ + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "thin") + if 0 in indexers.values(): + raise ValueError("step cannot be zero") + indexers = {k: slice(None, None, val) for k, val in indexers.items()} + return self.isel(indexers) + def broadcast_like( self, other: Union["Dataset", "DataArray"], exclude: Iterable[Hashable] = None ) -> "Dataset": diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 2fc86d777aa..27e6ab92f71 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1002,6 +1002,19 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) + def test_head(self): + assert_equal(self.dv.isel(x=slice(5)), self.dv.head(x=5)) + assert_equal(self.dv.isel(x=slice(0)), self.dv.head(x=0)) + + def test_tail(self): + assert_equal(self.dv.isel(x=slice(-5, None)), self.dv.tail(x=5)) + assert_equal(self.dv.isel(x=slice(0)), self.dv.tail(x=0)) + + def test_thin(self): + assert_equal(self.dv.isel(x=slice(None, None, 5)), self.dv.thin(x=5)) + with raises_regex(ValueError, "cannot be zero"): + self.dv.thin(time=0) + def test_loc(self): self.ds["x"] = ("x", np.array(list("abcdefghij"))) da = self.ds["foo"] diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 3953e6c4146..d9f0284969e 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1411,6 +1411,38 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) + def test_head(self): + data = create_test_data() + + expected = data.isel(time=slice(5), dim2=slice(6)) + actual = data.head(time=5, dim2=6) + assert_equal(expected, actual) + + expected = data.isel(time=slice(0)) + actual = data.head(time=0) + assert_equal(expected, actual) + + def test_tail(self): + data = create_test_data() + + expected = data.isel(time=slice(-5, None), dim2=slice(-6, None)) + actual = data.tail(time=5, dim2=6) + assert_equal(expected, actual) + + expected = data.isel(dim1=slice(0)) + actual = data.tail(dim1=0) + assert_equal(expected, actual) + + def test_thin(self): + data = create_test_data() + + expected = data.isel(time=slice(None, None, 5), dim2=slice(None, None, 6)) + actual = data.thin(time=5, dim2=6) + assert_equal(expected, actual) + + with raises_regex(ValueError, "cannot be zero"): + data.thin(time=0) + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sel_fancy(self): data = create_test_data() From 0a046dbdeff409728b4e9bf55fba9d2aae9acd07 Mon Sep 17 00:00:00 2001 From: ulijh Date: Sat, 7 Sep 2019 01:15:18 +0200 Subject: [PATCH 20/43] Make argmin/max work 
lazy with dask (#3244) * Make argmin/max work lazy with dask (#3237). * dask: Testing number of computes on reduce methods. * what's new updated * Fix typo Co-Authored-By: Stephan Hoyer * Be more explicit. Co-Authored-By: Stephan Hoyer * More explicit raise_if_dask_computes * nanargmin/max: only set fill_value when needed --- doc/whats-new.rst | 2 ++ xarray/core/nanops.py | 29 +++++---------------- xarray/core/nputils.py | 2 ++ xarray/tests/test_dask.py | 54 ++++++++++++++++++++++++++++++++++----- 4 files changed, 57 insertions(+), 30 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1e5855df51f..61a1fa59388 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -156,6 +156,8 @@ Bug fixes - Fix error that arises when using open_mfdataset on a series of netcdf files having differing values for a variable attribute of type list. (:issue:`3034`) By `Hasan Ahmad `_. +- Prevent :py:meth:`~xarray.DataArray.argmax` and :py:meth:`~xarray.DataArray.argmin` from calling + dask compute (:issue:`3237`). By `Ulrich Herter `_. .. _whats-new.0.12.3: diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index 9ba4eae29ae..17240faf007 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -88,38 +88,21 @@ def nanmax(a, axis=None, out=None): def nanargmin(a, axis=None): - fill_value = dtypes.get_pos_infinity(a.dtype) if a.dtype.kind == "O": + fill_value = dtypes.get_pos_infinity(a.dtype) return _nan_argminmax_object("argmin", fill_value, a, axis=axis) - a, mask = _replace_nan(a, fill_value) - if isinstance(a, dask_array_type): - res = dask_array.argmin(a, axis=axis) - else: - res = np.argmin(a, axis=axis) - if mask is not None: - mask = mask.all(axis=axis) - if mask.any(): - raise ValueError("All-NaN slice encountered") - return res + module = dask_array if isinstance(a, dask_array_type) else nputils + return module.nanargmin(a, axis=axis) def nanargmax(a, axis=None): - fill_value = dtypes.get_neg_infinity(a.dtype) if a.dtype.kind == "O": + fill_value = dtypes.get_neg_infinity(a.dtype) return _nan_argminmax_object("argmax", fill_value, a, axis=axis) - a, mask = _replace_nan(a, fill_value) - if isinstance(a, dask_array_type): - res = dask_array.argmax(a, axis=axis) - else: - res = np.argmax(a, axis=axis) - - if mask is not None: - mask = mask.all(axis=axis) - if mask.any(): - raise ValueError("All-NaN slice encountered") - return res + module = dask_array if isinstance(a, dask_array_type) else nputils + return module.nanargmax(a, axis=axis) def nansum(a, axis=None, dtype=None, out=None, min_count=None): diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 769af03fe6a..df36c98f94c 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -237,3 +237,5 @@ def f(values, axis=None, **kwargs): nanprod = _create_bottleneck_method("nanprod") nancumsum = _create_bottleneck_method("nancumsum") nancumprod = _create_bottleneck_method("nancumprod") +nanargmin = _create_bottleneck_method("nanargmin") +nanargmax = _create_bottleneck_method("nanargmax") diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index e3fc6f65e0f..d105765481e 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -27,14 +27,49 @@ dd = pytest.importorskip("dask.dataframe") +class CountingScheduler: + """ Simple dask scheduler counting the number of computes. 
+ + Reference: https://stackoverflow.com/questions/53289286/ """ + + def __init__(self, max_computes=0): + self.total_computes = 0 + self.max_computes = max_computes + + def __call__(self, dsk, keys, **kwargs): + self.total_computes += 1 + if self.total_computes > self.max_computes: + raise RuntimeError( + "Too many computes. Total: %d > max: %d." + % (self.total_computes, self.max_computes) + ) + return dask.get(dsk, keys, **kwargs) + + +def _set_dask_scheduler(scheduler=dask.get): + """ Backwards compatible way of setting scheduler. """ + if LooseVersion(dask.__version__) >= LooseVersion("0.18.0"): + return dask.config.set(scheduler=scheduler) + return dask.set_options(get=scheduler) + + +def raise_if_dask_computes(max_computes=0): + scheduler = CountingScheduler(max_computes) + return _set_dask_scheduler(scheduler) + + +def test_raise_if_dask_computes(): + data = da.from_array(np.random.RandomState(0).randn(4, 6), chunks=(2, 2)) + with raises_regex(RuntimeError, "Too many computes"): + with raise_if_dask_computes(): + data.compute() + + class DaskTestCase: def assertLazyAnd(self, expected, actual, test): - - with ( - dask.config.set(scheduler="single-threaded") - if LooseVersion(dask.__version__) >= LooseVersion("0.18.0") - else dask.set_options(get=dask.get) - ): + with _set_dask_scheduler(dask.get): + # dask.get is the syncronous scheduler, which get's set also by + # dask.config.set(scheduler="syncronous") in current versions. test(actual, expected) if isinstance(actual, Dataset): @@ -174,7 +209,12 @@ def test_reduce(self): v = self.lazy_var self.assertLazyAndAllClose(u.mean(), v.mean()) self.assertLazyAndAllClose(u.std(), v.std()) - self.assertLazyAndAllClose(u.argmax(dim="x"), v.argmax(dim="x")) + with raise_if_dask_computes(): + actual = v.argmax(dim="x") + self.assertLazyAndAllClose(u.argmax(dim="x"), actual) + with raise_if_dask_computes(): + actual = v.argmin(dim="x") + self.assertLazyAndAllClose(u.argmin(dim="x"), actual) self.assertLazyAndAllClose((u > 1).any(), (v > 1).any()) self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x")) with raises_regex(NotImplementedError, "dask"): From d1260443d065c3f2ec3f8eb3d999c59a695b35a2 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sun, 8 Sep 2019 18:58:15 -0400 Subject: [PATCH 21/43] Remove some deprecations (#3292) * remove some deprecations * whatsnew --- doc/whats-new.rst | 7 +++++++ xarray/__init__.py | 2 +- xarray/core/alignment.py | 21 ++------------------- xarray/core/dataarray.py | 12 +++--------- xarray/core/variable.py | 10 ---------- xarray/tests/test_dataarray.py | 12 ++++-------- xarray/tests/test_dataset.py | 6 ++---- 7 files changed, 19 insertions(+), 51 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 61a1fa59388..e65f052ca8c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -49,6 +49,13 @@ Breaking changes crash in a later release. (:issue:`3250`) by `Guido Imperiale `_. 
+- :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous + positional arguments were deprecated) +- Reindexing with variables of a different dimension now raise an error (previously deprecated) +- :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of + :py:func:`~xarray.broadcast`) +- :py:meth:`~Variable.expand_dims` is removed (previously deprecated in favor of + :py:meth:`~Variable.set_dims`) New functions/methods ~~~~~~~~~~~~~~~~~~~~~ diff --git a/xarray/__init__.py b/xarray/__init__.py index a3df034f7c7..cdca708e28c 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -6,7 +6,7 @@ __version__ = get_versions()["version"] del get_versions -from .core.alignment import align, broadcast, broadcast_arrays +from .core.alignment import align, broadcast from .core.common import full_like, zeros_like, ones_like from .core.concat import concat from .core.combine import combine_by_coords, combine_nested, auto_combine diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 9aeef63e891..d63718500bc 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -1,6 +1,5 @@ import functools import operator -import warnings from collections import OrderedDict, defaultdict from contextlib import suppress from typing import TYPE_CHECKING, Any, Dict, Hashable, Mapping, Optional, Tuple, Union @@ -387,14 +386,9 @@ def reindex_variables( for dim, indexer in indexers.items(): if isinstance(indexer, DataArray) and indexer.dims != (dim,): - warnings.warn( + raise ValueError( "Indexer has dimensions {:s} that are different " - "from that to be indexed along {:s}. " - "This will behave differently in the future.".format( - str(indexer.dims), dim - ), - FutureWarning, - stacklevel=3, + "from that to be indexed along {:s}".format(str(indexer.dims), dim) ) target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim]) @@ -592,14 +586,3 @@ def broadcast(*args, exclude=None): result.append(_broadcast_helper(arg, exclude, dims_map, common_coords)) return tuple(result) - - -def broadcast_arrays(*args): - import warnings - - warnings.warn( - "xarray.broadcast_arrays is deprecated: use " "xarray.broadcast instead", - DeprecationWarning, - stacklevel=2, - ) - return broadcast(*args) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8660fa952b1..a3655e2c4b2 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -471,7 +471,7 @@ def _to_dataset_whole( dataset = Dataset._from_vars_and_coord_names(variables, coord_names) return dataset - def to_dataset(self, dim: Hashable = None, name: Hashable = None) -> Dataset: + def to_dataset(self, dim: Hashable = None, *, name: Hashable = None) -> Dataset: """Convert a DataArray to a Dataset. Parameters @@ -489,15 +489,9 @@ def to_dataset(self, dim: Hashable = None, name: Hashable = None) -> Dataset: dataset : Dataset """ if dim is not None and dim not in self.dims: - warnings.warn( - "the order of the arguments on DataArray.to_dataset " - "has changed; you now need to supply ``name`` as " - "a keyword argument", - FutureWarning, - stacklevel=2, + raise TypeError( + "{} is not a dim. 
If supplying a ``name``, pass as a kwarg.".format(dim) ) - name = dim - dim = None if dim is not None: if name is not None: diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ac4f7052f14..2e9906ce5ae 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1226,16 +1226,6 @@ def transpose(self, *dims) -> "Variable": def T(self) -> "Variable": return self.transpose() - def expand_dims(self, *args): - import warnings - - warnings.warn( - "Variable.expand_dims is deprecated: use " "Variable.set_dims instead", - DeprecationWarning, - stacklevel=2, - ) - return self.expand_dims(*args) - def set_dims(self, dims, shape=None): """Return a new variable with given set of dimensions. This method might be used to attach new dimension(s) to variable. diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 27e6ab92f71..8c01ef9a68c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1408,13 +1408,11 @@ def test_reindex_like_no_index(self): with raises_regex(ValueError, "different size for unlabeled"): foo.reindex_like(bar) - @pytest.mark.filterwarnings("ignore:Indexer has dimensions") def test_reindex_regressions(self): - # regression test for #279 - expected = DataArray(np.random.randn(5), coords=[("time", range(5))]) + da = DataArray(np.random.randn(5), coords=[("time", range(5))]) time2 = DataArray(np.arange(5), dims="time2") - actual = expected.reindex(time=time2) - assert_identical(actual, expected) + with pytest.raises(ValueError): + da.reindex(time=time2) # regression test for #736, reindex can not change complex nums dtype x = np.array([1, 2, 3], dtype=np.complex) @@ -3685,10 +3683,8 @@ def test_to_dataset_whole(self): expected = Dataset({"foo": ("x", [1, 2])}) assert_identical(expected, actual) - expected = Dataset({"bar": ("x", [1, 2])}) - with pytest.warns(FutureWarning): + with pytest.raises(TypeError): actual = named.to_dataset("bar") - assert_identical(expected, actual) def test_to_dataset_split(self): array = DataArray([1, 2, 3], coords=[("x", list("abc"))], attrs={"a": 1}) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d9f0284969e..814fc31d734 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1689,9 +1689,8 @@ def test_reindex(self): # regression test for #279 expected = Dataset({"x": ("time", np.random.randn(5))}, {"time": range(5)}) time2 = DataArray(np.arange(5), dims="time2") - with pytest.warns(FutureWarning): + with pytest.raises(ValueError): actual = expected.reindex(time=time2) - assert_identical(actual, expected) # another regression test ds = Dataset( @@ -1707,11 +1706,10 @@ def test_reindex(self): def test_reindex_warning(self): data = create_test_data() - with pytest.warns(FutureWarning) as ws: + with pytest.raises(ValueError): # DataArray with different dimension raises Future warning ind = xr.DataArray([0.0, 1.0], dims=["new_dim"], name="ind") data.reindex(dim2=ind) - assert any(["Indexer has dimensions " in str(w.message) for w in ws]) # Should not warn ind = xr.DataArray([0.0, 1.0], dims=["dim2"], name="ind") From 9e1c690e6da93314acf801eba649c98a97649c58 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 9 Sep 2019 20:31:15 +0200 Subject: [PATCH 22/43] allow np-array levels and colors in 2D plots (#3295) * test if levels is None * allow np levels and color list * whats-new * Update doc/whats-new.rst Co-Authored-By: Deepak Cherian --- doc/whats-new.rst | 2 ++ xarray/plot/utils.py | 2 +- xarray/tests/test_plot.py | 
28 ++++++++++++++++++++-------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e65f052ca8c..4e975c55d47 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -165,6 +165,8 @@ Bug fixes By `Hasan Ahmad `_. - Prevent :py:meth:`~xarray.DataArray.argmax` and :py:meth:`~xarray.DataArray.argmin` from calling dask compute (:issue:`3237`). By `Ulrich Herter `_. +- Plots in 2 dimensions (pcolormesh, contour) now allow to specify levels as numpy + array (:issue:`3284`). By `Mathias Hauser `_. .. _whats-new.0.12.3: diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 2d50734f519..53bbe8bacb9 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -731,7 +731,7 @@ def _process_cmap_cbar_kwargs( # colors is only valid when levels is supplied or the plot is of type # contour or contourf - if colors and (("contour" not in func.__name__) and (not levels)): + if colors and (("contour" not in func.__name__) and (levels is None)): raise ValueError("Can only specify colors with contour or levels") # we should not be getting a list of colors in cmap anymore diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index a1c05971ec4..c9b041b3ba7 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -1283,26 +1283,38 @@ class TestContour(Common2dMixin, PlotTestCase): plotfunc = staticmethod(xplt.contour) + # matplotlib cmap.colors gives an rgbA ndarray + # when seaborn is used, instead we get an rgb tuple + @staticmethod + def _color_as_tuple(c): + return tuple(c[:3]) + def test_colors(self): - # matplotlib cmap.colors gives an rgbA ndarray - # when seaborn is used, instead we get an rgb tuple - def _color_as_tuple(c): - return tuple(c[:3]) # with single color, we don't want rgb array artist = self.plotmethod(colors="k") assert artist.cmap.colors[0] == "k" artist = self.plotmethod(colors=["k", "b"]) - assert _color_as_tuple(artist.cmap.colors[1]) == (0.0, 0.0, 1.0) + assert self._color_as_tuple(artist.cmap.colors[1]) == (0.0, 0.0, 1.0) artist = self.darray.plot.contour( levels=[-0.5, 0.0, 0.5, 1.0], colors=["k", "r", "w", "b"] ) - assert _color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) - assert _color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) + assert self._color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) + assert self._color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) + # the last color is now under "over" + assert self._color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) + + def test_colors_np_levels(self): + + # https://github.com/pydata/xarray/issues/3284 + levels = np.array([-0.5, 0.0, 0.5, 1.0]) + artist = self.darray.plot.contour(levels=levels, colors=["k", "r", "w", "b"]) + assert self._color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) + assert self._color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) # the last color is now under "over" - assert _color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) + assert self._color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) def test_cmap_and_color_both(self): with pytest.raises(ValueError): From e38ca0f168ebc2c52857a2abd45572a6e92beca8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Sep 2019 18:34:13 +0000 Subject: [PATCH 23/43] Remove deprecated concat kwargs. 
(#3288) --- doc/whats-new.rst | 5 ++++- xarray/core/concat.py | 38 +------------------------------------ xarray/tests/test_concat.py | 5 ----- 3 files changed, 5 insertions(+), 43 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4e975c55d47..f5e0f9c467f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -30,7 +30,10 @@ Breaking changes By `Maximilian Roos `_ - The ``inplace`` kwarg for public methods now raises an error, having been deprecated since v0.11.0. - By `Maximilian Roos `_ + By `Maximilian Roos `_ +- :py:func:`~xarray.concat` now requires the ``dim`` argument. Its ``indexers``, ``mode`` + and ``concat_over`` kwargs have now been removed. + By `Deepak Cherian `_ - Most xarray objects now define ``__slots__``. This reduces overall RAM usage by ~22% (not counting the underlying numpy buffers); on CPython 3.7/x64, a trivial DataArray has gone down from 1.9kB to 1.5kB. diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 9c7c622a31c..d5dfa49a8d5 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,4 +1,3 @@ -import warnings from collections import OrderedDict import pandas as pd @@ -11,14 +10,11 @@ def concat( objs, - dim=None, + dim, data_vars="all", coords="different", compat="equals", positions=None, - indexers=None, - mode=None, - concat_over=None, fill_value=dtypes.NA, join="outer", ): @@ -111,38 +107,6 @@ def concat( except StopIteration: raise ValueError("must supply at least one object to concatenate") - if dim is None: - warnings.warn( - "the `dim` argument to `concat` will be required " - "in a future version of xarray; for now, setting it to " - "the old default of 'concat_dim'", - FutureWarning, - stacklevel=2, - ) - dim = "concat_dims" - - if indexers is not None: # pragma: no cover - warnings.warn( - "indexers has been renamed to positions; the alias " - "will be removed in a future version of xarray", - FutureWarning, - stacklevel=2, - ) - positions = indexers - - if mode is not None: - raise ValueError( - "`mode` is no longer a valid argument to " - "xarray.concat; it has been split into the " - "`data_vars` and `coords` arguments" - ) - if concat_over is not None: - raise ValueError( - "`concat_over` is no longer a valid argument to " - "xarray.concat; it has been split into the " - "`data_vars` and `coords` arguments" - ) - if isinstance(first_obj, DataArray): f = _dataarray_concat elif isinstance(first_obj, Dataset): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index b8ab89e926c..ee99ca027d9 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -163,11 +163,6 @@ def test_concat_errors(self): with raises_regex(ValueError, "coordinate in some datasets but not others"): concat([Dataset({"x": 0}), Dataset({}, {"x": 1})], dim="z") - with raises_regex(ValueError, "no longer a valid"): - concat([data, data], "new_dim", mode="different") - with raises_regex(ValueError, "no longer a valid"): - concat([data, data], "new_dim", concat_over="different") - def test_concat_join_kwarg(self): ds1 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]}) ds2 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]}) From 69c7e01e5167a3137c285cb50d1978252bb8bcbf Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 9 Sep 2019 15:17:33 -0400 Subject: [PATCH 24/43] Compat and encoding deprecation to 0.14 (#3294) * push the removal of the compat and encoding arguments from Dataset/DataArray back to 0.14 * require 
dim argument to concat * Update whats-new.rst --- doc/whats-new.rst | 4 ++-- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f5e0f9c467f..d81986cb948 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -48,8 +48,8 @@ Breaking changes - Any user code that defines custom subclasses of xarray classes must now explicitly define ``__slots__`` itself. Subclasses that don't add any attributes must state so by defining ``__slots__ = ()`` right after the class header. - Omitting ``__slots__`` will now cause a ``FutureWarning`` to be logged, and a hard - crash in a later release. + Omitting ``__slots__`` will now cause a ``FutureWarning`` to be logged, and will raise an + error in a later release. (:issue:`3250`) by `Guido Imperiale `_. - :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a3655e2c4b2..807baddedf9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -315,7 +315,7 @@ def __init__( if encoding is not None: warnings.warn( "The `encoding` argument to `DataArray` is deprecated, and . " - "will be removed in 0.13. " + "will be removed in 0.14. " "Instead, specify the encoding when writing to disk or " "set the `encoding` attribute directly.", FutureWarning, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1476c1ba646..d6f0da42722 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -489,7 +489,7 @@ def __init__( if compat is not None: warnings.warn( "The `compat` argument to Dataset is deprecated and will be " - "removed in 0.13." + "removed in 0.14." "Instead, use `merge` to control how variables are combined", FutureWarning, stacklevel=2, From 732cf9afb434caeec34a29e91144da4783b6a670 Mon Sep 17 00:00:00 2001 From: Siyu Yang Date: Thu, 12 Sep 2019 19:07:10 -0700 Subject: [PATCH 25/43] Update why-xarray.rst with clearer expression (#3307) in one sentence. --- doc/why-xarray.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/why-xarray.rst b/doc/why-xarray.rst index d0a6c591b29..25d558d99d5 100644 --- a/doc/why-xarray.rst +++ b/doc/why-xarray.rst @@ -62,9 +62,8 @@ The power of the dataset over a plain dictionary is that, in addition to pulling out arrays by name, it is possible to select or combine data along a dimension across all arrays simultaneously. Like a :py:class:`~pandas.DataFrame`, datasets facilitate array operations with -heterogeneous data -- the difference is that the arrays in a dataset can not -only have different data types, but can also have different numbers of -dimensions. +heterogeneous data -- the difference is that the arrays in a dataset can have +not only different data types, but also different numbers of dimensions. This data model is borrowed from the netCDF_ file format, which also provides xarray with a natural and portable serialization format. NetCDF is very popular From e90e8bc06cf8e7c97c7dc4c0e8ff1bf87c49faf6 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Sep 2019 15:39:40 +0000 Subject: [PATCH 26/43] ignore h5py 2.10.0 warnings and fix invalid_netcdf warning test. (#3301) * ignore h5py 2.10.0 warnings and fix invalid_netcdf warning test. * Better fix. * fix fix. * remove comment. * Add docs. * Revert "Add docs." This reverts commit 14ae0b1153f56144c7a90966512f0a156355cf25. 
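
Before the diff: the updated test below stops relying on the raw length of the ``pytest.warns`` record, which with h5py 2.10.0 also picks up unrelated warnings, and instead counts only the warnings whose category and message match. A minimal, hypothetical sketch of that counting pattern (the helper name and warning messages here are illustrative only, not part of this patch)::

    import warnings

    import pytest


    def count_matching(record, category, text):
        # count only warnings of the expected category whose message contains
        # the expected text, ignoring any unrelated warnings in the record
        return sum(
            issubclass(w.category, category) and text in str(w.message)
            for w in record
        )


    def test_counts_only_relevant_warnings():
        with pytest.warns(FutureWarning) as record:
            warnings.warn("complex dtypes are not netCDF safe", FutureWarning)
            warnings.warn("unrelated library warning", UserWarning)  # noise
        assert count_matching(record, FutureWarning, "complex dtypes") == 1
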
--- xarray/tests/test_backends.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a5c42fd368c..f6254b32f4f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2163,6 +2163,7 @@ def test_encoding_unlimited_dims(self): @requires_h5netcdf @requires_netCDF4 +@pytest.mark.filterwarnings("ignore:use make_scale(name) instead") class TestH5NetCDFData(NetCDF4Base): engine = "h5netcdf" @@ -2173,16 +2174,25 @@ def create_store(self): @pytest.mark.filterwarnings("ignore:complex dtypes are supported by h5py") @pytest.mark.parametrize( - "invalid_netcdf, warns, num_warns", + "invalid_netcdf, warntype, num_warns", [(None, FutureWarning, 1), (False, FutureWarning, 1), (True, None, 0)], ) - def test_complex(self, invalid_netcdf, warns, num_warns): + def test_complex(self, invalid_netcdf, warntype, num_warns): expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) save_kwargs = {"invalid_netcdf": invalid_netcdf} - with pytest.warns(warns) as record: + with pytest.warns(warntype) as record: with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: assert_equal(expected, actual) - assert len(record) == num_warns + + recorded_num_warns = 0 + if warntype: + for warning in record: + if issubclass(warning.category, warntype) and ( + "complex dtypes" in str(warning.message) + ): + recorded_num_warns += 1 + + assert recorded_num_warns == num_warns def test_cross_engine_read_write_netcdf4(self): # Drop dim3, because its labels include strings. These appear to be @@ -2451,6 +2461,7 @@ def skip_if_not_engine(engine): @requires_dask +@pytest.mark.filterwarnings("ignore:use make_scale(name) instead") def test_open_mfdataset_manyfiles( readengine, nfiles, parallel, chunks, file_cache_maxsize ): From 7fb3b19d47e81afc5f7ff8506f1daeb3906b0fae Mon Sep 17 00:00:00 2001 From: Gerardo Rivera Date: Sat, 14 Sep 2019 16:46:15 -0500 Subject: [PATCH 27/43] Accept int value in head, thin and tail (#3298) * Accept int value in head, thin and tail * Fix typing * Remove thin def val and add suggestions * Fix typing and change raise message --- xarray/core/dataarray.py | 28 +++++----- xarray/core/dataset.py | 99 +++++++++++++++++++++++++++------- xarray/tests/test_dataarray.py | 35 ++++++++++++ xarray/tests/test_dataset.py | 40 ++++++++++++++ 4 files changed, 171 insertions(+), 31 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 807baddedf9..7937a352cc6 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1035,10 +1035,12 @@ def sel( return self._from_temp_dataset(ds) def head( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "DataArray": """Return a new DataArray whose data is given by the the first `n` - values along the specified dimension(s). + values along the specified dimension(s). 
Default `n` = 5 See Also -------- @@ -1046,16 +1048,16 @@ def head( DataArray.tail DataArray.thin """ - - indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "head") - ds = self._to_temp_dataset().head(indexers=indexers) + ds = self._to_temp_dataset().head(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) def tail( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "DataArray": """Return a new DataArray whose data is given by the the last `n` - values along the specified dimension(s). + values along the specified dimension(s). Default `n` = 5 See Also -------- @@ -1063,15 +1065,16 @@ def tail( DataArray.head DataArray.thin """ - indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "tail") - ds = self._to_temp_dataset().tail(indexers=indexers) + ds = self._to_temp_dataset().tail(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) def thin( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "DataArray": """Return a new DataArray whose data is given by each `n` value - along the specified dimension(s). + along the specified dimension(s). Default `n` = 5 See Also -------- @@ -1079,8 +1082,7 @@ def thin( DataArray.head DataArray.tail """ - indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "thin") - ds = self._to_temp_dataset().thin(indexers=indexers) + ds = self._to_temp_dataset().thin(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) def broadcast_like( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d6f0da42722..1eeb5350dfe 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2009,15 +2009,18 @@ def sel( return result._overwrite_indexes(new_indexes) def head( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "Dataset": """Returns a new dataset with the first `n` values of each array for the specified dimension(s). Parameters ---------- - indexers : dict, optional - A dict with keys matching dimensions and integer values `n`. + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. One of indexers or indexers_kwargs must be provided. **indexers_kwargs : {dim: n, ...}, optional The keyword arguments form of ``indexers``. 
@@ -2030,20 +2033,41 @@ def head( Dataset.thin DataArray.head """ + if not indexers_kwargs: + if indexers is None: + indexers = 5 + if not isinstance(indexers, int) and not is_dict_like(indexers): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "head") - indexers = {k: slice(val) for k, val in indexers.items()} - return self.isel(indexers) + for k, v in indexers.items(): + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) + ) + elif v < 0: + raise ValueError( + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) + ) + indexers_slices = {k: slice(val) for k, val in indexers.items()} + return self.isel(indexers_slices) def tail( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "Dataset": """Returns a new dataset with the last `n` values of each array for the specified dimension(s). Parameters ---------- - indexers : dict, optional - A dict with keys matching dimensions and integer values `n`. + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. One of indexers or indexers_kwargs must be provided. **indexers_kwargs : {dim: n, ...}, optional The keyword arguments form of ``indexers``. @@ -2056,24 +2080,44 @@ def tail( Dataset.thin DataArray.tail """ - + if not indexers_kwargs: + if indexers is None: + indexers = 5 + if not isinstance(indexers, int) and not is_dict_like(indexers): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "tail") - indexers = { + for k, v in indexers.items(): + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) + ) + elif v < 0: + raise ValueError( + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) + ) + indexers_slices = { k: slice(-val, None) if val != 0 else slice(val) for k, val in indexers.items() } - return self.isel(indexers) + return self.isel(indexers_slices) def thin( - self, indexers: Mapping[Hashable, Any] = None, **indexers_kwargs: Any + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "Dataset": """Returns a new dataset with each array indexed along every `n`th value for the specified dimension(s) Parameters ---------- - indexers : dict, optional - A dict with keys matching dimensions and integer values `n`. + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. One of indexers or indexers_kwargs must be provided. **indexers_kwargs : {dim: n, ...}, optional The keyword arguments form of ``indexers``. 
@@ -2086,11 +2130,30 @@ def thin( Dataset.tail DataArray.thin """ + if ( + not indexers_kwargs + and not isinstance(indexers, int) + and not is_dict_like(indexers) + ): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "thin") - if 0 in indexers.values(): - raise ValueError("step cannot be zero") - indexers = {k: slice(None, None, val) for k, val in indexers.items()} - return self.isel(indexers) + for k, v in indexers.items(): + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) + ) + elif v < 0: + raise ValueError( + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) + ) + elif v == 0: + raise ValueError("step cannot be zero") + indexers_slices = {k: slice(None, None, val) for k, val in indexers.items()} + return self.isel(indexers_slices) def broadcast_like( self, other: Union["Dataset", "DataArray"], exclude: Iterable[Hashable] = None diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8c01ef9a68c..78d9ace6be1 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1005,13 +1005,48 @@ def test_isel_drop(self): def test_head(self): assert_equal(self.dv.isel(x=slice(5)), self.dv.head(x=5)) assert_equal(self.dv.isel(x=slice(0)), self.dv.head(x=0)) + assert_equal( + self.dv.isel({dim: slice(6) for dim in self.dv.dims}), self.dv.head(6) + ) + assert_equal( + self.dv.isel({dim: slice(5) for dim in self.dv.dims}), self.dv.head() + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.head([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.head(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.head(-3) def test_tail(self): assert_equal(self.dv.isel(x=slice(-5, None)), self.dv.tail(x=5)) assert_equal(self.dv.isel(x=slice(0)), self.dv.tail(x=0)) + assert_equal( + self.dv.isel({dim: slice(-6, None) for dim in self.dv.dims}), + self.dv.tail(6), + ) + assert_equal( + self.dv.isel({dim: slice(-5, None) for dim in self.dv.dims}), self.dv.tail() + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.tail([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.tail(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.tail(-3) def test_thin(self): assert_equal(self.dv.isel(x=slice(None, None, 5)), self.dv.thin(x=5)) + assert_equal( + self.dv.isel({dim: slice(None, None, 6) for dim in self.dv.dims}), + self.dv.thin(6), + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.thin([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.thin(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.thin(-3) with raises_regex(ValueError, "cannot be zero"): self.dv.thin(time=0) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 814fc31d734..d8401e0bd42 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1422,6 +1422,21 @@ def test_head(self): actual = data.head(time=0) assert_equal(expected, actual) + expected = data.isel({dim: slice(6) for dim in data.dims}) + actual = data.head(6) + assert_equal(expected, actual) + + expected = data.isel({dim: slice(5) for dim in data.dims}) + actual = data.head() + assert_equal(expected, actual) + + 
with raises_regex(TypeError, "either dict-like or a single int"): + data.head([3]) + with raises_regex(TypeError, "expected integer type"): + data.head(dim2=3.1) + with raises_regex(ValueError, "expected positive int"): + data.head(time=-3) + def test_tail(self): data = create_test_data() @@ -1433,6 +1448,21 @@ def test_tail(self): actual = data.tail(dim1=0) assert_equal(expected, actual) + expected = data.isel({dim: slice(-6, None) for dim in data.dims}) + actual = data.tail(6) + assert_equal(expected, actual) + + expected = data.isel({dim: slice(-5, None) for dim in data.dims}) + actual = data.tail() + assert_equal(expected, actual) + + with raises_regex(TypeError, "either dict-like or a single int"): + data.tail([3]) + with raises_regex(TypeError, "expected integer type"): + data.tail(dim2=3.1) + with raises_regex(ValueError, "expected positive int"): + data.tail(time=-3) + def test_thin(self): data = create_test_data() @@ -1440,8 +1470,18 @@ def test_thin(self): actual = data.thin(time=5, dim2=6) assert_equal(expected, actual) + expected = data.isel({dim: slice(None, None, 6) for dim in data.dims}) + actual = data.thin(6) + assert_equal(expected, actual) + + with raises_regex(TypeError, "either dict-like or a single int"): + data.thin([3]) + with raises_regex(TypeError, "expected integer type"): + data.thin(dim2=3.1) with raises_regex(ValueError, "cannot be zero"): data.thin(time=0) + with raises_regex(ValueError, "expected positive int"): + data.thin(time=-3) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sel_fancy(self): From 1ce91051e3751a65dbbbc7c5ff3e1a2f00ea6ee5 Mon Sep 17 00:00:00 2001 From: Gerardo Rivera Date: Sun, 15 Sep 2019 15:27:30 -0500 Subject: [PATCH 28/43] Fix DataArray api doc (#3309) --- doc/api.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index fb6e037a4f2..699687441d7 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -282,9 +282,9 @@ Indexing DataArray.loc DataArray.isel DataArray.sel - Dataset.head - Dataset.tail - Dataset.thin + DataArray.head + DataArray.tail + DataArray.thin DataArray.squeeze DataArray.interp DataArray.interp_like From b65ce8666020ba3a0300154655d2e5c05884d73b Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 16 Sep 2019 00:16:15 +0200 Subject: [PATCH 29/43] Honor `keep_attrs` in DataArray.quantile (#3305) * Added `keep_attrs` argument to Variable.quantile. TestDataArray.test_quantile now checks for attributes in output. * black * updated whats new. * removed vestigial comment. Switched default Variable.quantile keep_attrs to False. --- doc/whats-new.rst | 20 +++++++++++--------- xarray/core/dataset.py | 5 ++++- xarray/core/variable.py | 17 +++++++++++++---- xarray/tests/test_dataarray.py | 4 ++-- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d81986cb948..ab4b17ff16d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,8 +26,8 @@ Breaking changes - The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated since v0.10.0. These are redundant with the ``isel`` / ``sel`` methods. - See :ref:`vectorized_indexing` for the details - By `Maximilian Roos `_ + See :ref:`vectorized_indexing` for the details + By `Maximilian Roos `_ - The ``inplace`` kwarg for public methods now raises an error, having been deprecated since v0.11.0. By `Maximilian Roos `_ @@ -52,12 +52,12 @@ Breaking changes error in a later release. (:issue:`3250`) by `Guido Imperiale `_. 
-- :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous +- :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous positional arguments were deprecated) - Reindexing with variables of a different dimension now raise an error (previously deprecated) -- :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of +- :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of :py:func:`~xarray.broadcast`) -- :py:meth:`~Variable.expand_dims` is removed (previously deprecated in favor of +- :py:meth:`~Variable.expand_dims` is removed (previously deprecated in favor of :py:meth:`~Variable.set_dims`) New functions/methods @@ -90,7 +90,7 @@ New functions/methods and `Maximilian Roos `_. - Added :py:meth:`DataArray.broadcast_like` and :py:meth:`Dataset.broadcast_like`. - By `Deepak Cherian `_ and `David Mertz + By `Deepak Cherian `_ and `David Mertz `_. - Dataset plotting API for visualizing dependencies between two `DataArray`s! @@ -131,14 +131,14 @@ Enhancements :py:meth:`DataArray.set_index`, as well are more specific error messages when the user passes invalid arguments (:issue:`3176`). By `Gregory Gundersen `_. - + - :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. By `Spencer Jones `_. Bug fixes ~~~~~~~~~ -- Improve "missing dimensions" error message for :py:func:`~xarray.apply_ufunc` - (:issue:`2078`). +- Improve "missing dimensions" error message for :py:func:`~xarray.apply_ufunc` + (:issue:`2078`). By `Rick Russotto `_. - :py:meth:`~xarray.DataArray.assign_coords` now supports dictionary arguments (:issue:`3231`). @@ -170,6 +170,8 @@ Bug fixes dask compute (:issue:`3237`). By `Ulrich Herter `_. - Plots in 2 dimensions (pcolormesh, contour) now allow to specify levels as numpy array (:issue:`3284`). By `Mathias Hauser `_. +- Fixed bug in :meth:`DataArray.quantile` failing to keep attributes when + `keep_attrs` was True (:issue:`3304`). By David Huard `_. .. _whats-new.0.12.3: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1eeb5350dfe..8a53e7ba757 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4831,7 +4831,10 @@ def quantile( # the former is often more efficient reduce_dims = None variables[name] = var.quantile( - q, dim=reduce_dims, interpolation=interpolation + q, + dim=reduce_dims, + interpolation=interpolation, + keep_attrs=keep_attrs, ) else: diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 2e9906ce5ae..b4b01f7ee49 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1592,7 +1592,7 @@ def no_conflicts(self, other): """ return self.broadcast_equals(other, equiv=duck_array_ops.array_notnull_equiv) - def quantile(self, q, dim=None, interpolation="linear"): + def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements. @@ -1615,6 +1615,10 @@ def quantile(self, q, dim=None, interpolation="linear"): * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. + keep_attrs : bool, optional + If True, the variable's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -1623,7 +1627,7 @@ def quantile(self, q, dim=None, interpolation="linear"): is a scalar. 
If multiple percentiles are given, first axis of the result corresponds to the quantile and a quantile dimension is added to the return array. The other dimensions are the - dimensions that remain after the reduction of the array. + dimensions that remain after the reduction of the array. See Also -------- @@ -1651,14 +1655,19 @@ def quantile(self, q, dim=None, interpolation="linear"): axis = None new_dims = [] - # only add the quantile dimension if q is array like + # Only add the quantile dimension if q is array-like if q.ndim != 0: new_dims = ["quantile"] + new_dims qs = np.nanpercentile( self.data, q * 100.0, axis=axis, interpolation=interpolation ) - return Variable(new_dims, qs) + + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + attrs = self._attrs if keep_attrs else None + + return Variable(new_dims, qs, attrs) def rank(self, dim, pct=False): """Ranks the data. diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 78d9ace6be1..49980c75b15 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2333,17 +2333,17 @@ def test_reduce_out(self): with pytest.raises(TypeError): orig.mean(out=np.ones(orig.shape)) - # skip due to bug in older versions of numpy.nanpercentile def test_quantile(self): for q in [0.25, [0.50], [0.25, 0.75]]: for axis, dim in zip( [None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]] ): - actual = self.dv.quantile(q, dim=dim) + actual = DataArray(self.va).quantile(q, dim=dim, keep_attrs=True) expected = np.nanpercentile( self.dv.values, np.array(q) * 100, axis=axis ) np.testing.assert_allclose(actual.values, expected) + assert actual.attrs == self.attrs def test_reduce_keep_attrs(self): # Test dropped attrs From 756c94164840e8c070bcd26681b97c31412909ae Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Sep 2019 14:49:27 +0000 Subject: [PATCH 30/43] Refactor concat to use merge for non-concatenated variables (#3239) * Add compat = 'override' and data_vars/coords='sensible' * concat tests. * Update docstring. * Begin merge, combine. * Merge non concatenated variables. * Fix tests. * Fix tests 2 * Fix test 3 * Cleanup: reduce number of times we loop over datasets. * unique_variable does minimum number of loads: fixes dask test * docstrings for compat='override' * concat compat docstring. * remove the sensible option. * reduce silly changes. * fix groupby order test. * cleanup: var names + remove one loop through datasets. * Add whats-new entry. * Add note in io.rst * fix warning. * Update netcdf multi-file dataset section in io.rst. * Update mfdataset in dask.rst. * simplify parse_datasets. * Avoid using merge_variables. unique_variable instead. * small stuff. * Update docs. * minor fix. * minor fix. * lint. * Better error message. * rename to shorter variable names. * Cleanup: fillna preserves attrs now. * Look for concat dim in data_vars also. * Update xarray/core/merge.py Co-Authored-By: Stephan Hoyer * avoid unnecessary computes. * minor cleanups. 
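
Before diving into the diff below, a minimal sketch of the behaviour this refactor enables (the datasets and variable names are hypothetical): with ``data_vars='minimal'`` only variables that contain the concat dimension are concatenated, while the remaining variables are merged, and ``compat='override'`` takes them from the first dataset without comparing values::

    import numpy as np

    import xarray as xr

    # "temp" varies along "time"; "scale" is the same in both datasets
    ds1 = xr.Dataset(
        {"temp": ("time", np.zeros(3)), "scale": 1.0}, coords={"time": [0, 1, 2]}
    )
    ds2 = xr.Dataset(
        {"temp": ("time", np.ones(3)), "scale": 1.0}, coords={"time": [3, 4, 5]}
    )

    # only variables with the "time" dimension are concatenated;
    # "scale" is merged, and compat="override" picks it from ds1 unchecked
    combined = xr.concat(
        [ds1, ds2], dim="time", data_vars="minimal", compat="override"
    )
    assert combined.dims["time"] == 6
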
---
 doc/dask.rst                 |   7 +-
 doc/io.rst                   | 244 +++++++++++++++++++++--------------
 doc/whats-new.rst            |  29 ++++-
 xarray/backends/api.py       |   3 +-
 xarray/core/combine.py       |  20 ++-
 xarray/core/concat.py        | 190 +++++++++++++++++----------
 xarray/core/dataarray.py     |   4 +-
 xarray/core/merge.py         |  62 +++++----
 xarray/tests/test_combine.py |  13 +-
 xarray/tests/test_concat.py  |  49 ++++++-
 xarray/tests/test_dask.py    |   1 -
 xarray/tests/test_merge.py   |   2 +
 12 files changed, 402 insertions(+), 222 deletions(-)

diff --git a/doc/dask.rst b/doc/dask.rst
index adf0a6bf585..19cbc11292c 100644
--- a/doc/dask.rst
+++ b/doc/dask.rst
@@ -75,13 +75,14 @@ entirely equivalent to opening a dataset using ``open_dataset`` and then
 chunking the data using the ``chunk`` method, e.g.,
 ``xr.open_dataset('example-data.nc').chunk({'time': 10})``.
 
-To open multiple files simultaneously, use :py:func:`~xarray.open_mfdataset`::
+To open multiple files simultaneously in parallel using Dask delayed,
+use :py:func:`~xarray.open_mfdataset`::
 
-    xr.open_mfdataset('my/files/*.nc')
+    xr.open_mfdataset('my/files/*.nc', parallel=True)
 
 This function will automatically concatenate and merge dataset into one in the
 simple cases that it understands (see :py:func:`~xarray.auto_combine`
-for the full disclaimer). By default, ``open_mfdataset`` will chunk each
+for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each
 netCDF file into a single Dask array; again, supply the ``chunks`` argument to
 control the size of the resulting Dask arrays. In more complex cases, you can
 open each file individually using ``open_dataset`` and merge the result, as
diff --git a/doc/io.rst b/doc/io.rst
index f7ac8c095b9..775d915188e 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -99,7 +99,9 @@ netCDF
 The recommended way to store xarray data structures is `netCDF`__, which
 is a binary file format for self-described datasets that originated
 in the geosciences. xarray is based on the netCDF data model, so netCDF files
-on disk directly correspond to :py:class:`~xarray.Dataset` objects.
+on disk directly correspond to :py:class:`~xarray.Dataset` objects (more accurately,
+a group in a netCDF file directly corresponds to a :py:class:`~xarray.Dataset` object.
+See :ref:`io.netcdf_groups` for more.)
 
 NetCDF is supported on almost all platforms, and parsers exist
 for the vast majority of scientific programming languages. Recent versions of
@@ -121,7 +123,7 @@ read/write netCDF V4 files and use the compression options described below).
 
 __ https://github.com/Unidata/netcdf4-python
 
 We can save a Dataset to disk using the
-:py:attr:`Dataset.to_netcdf ` method:
+:py:meth:`~Dataset.to_netcdf` method:
 
 .. ipython:: python
 
@@ -147,19 +149,6 @@ convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back
 when loading, ensuring that the ``DataArray`` that is loaded is always exactly
 the same as the one that was saved.
 
-NetCDF groups are not supported as part of the
-:py:class:`~xarray.Dataset` data model. Instead, groups can be loaded
-individually as Dataset objects.
-To do so, pass a ``group`` keyword argument to the
-``open_dataset`` function. The group can be specified as a path-like
-string, e.g., to access subgroup 'bar' within group 'foo' pass
-'/foo/bar' as the ``group`` argument.
-In a similar way, the ``group`` keyword argument can be given to the
-:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group
-in a netCDF file.
-When writing multiple groups in one file, pass ``mode='a'`` to ``to_netcdf`` -to ensure that each call does not delete the file. - Data is always loaded lazily from netCDF files. You can manipulate, slice and subset Dataset and DataArray objects, and no array values are loaded into memory until you try to perform some sort of actual computation. For an example of how these @@ -195,6 +184,24 @@ It is possible to append or overwrite netCDF variables using the ``mode='a'`` argument. When using this option, all variables in the dataset will be written to the original netCDF file, regardless if they exist in the original dataset. + +.. _io.netcdf_groups: + +Groups +~~~~~~ + +NetCDF groups are not supported as part of the :py:class:`~xarray.Dataset` data model. +Instead, groups can be loaded individually as Dataset objects. +To do so, pass a ``group`` keyword argument to the +:py:func:`~xarray.open_dataset` function. The group can be specified as a path-like +string, e.g., to access subgroup ``'bar'`` within group ``'foo'`` pass +``'/foo/bar'`` as the ``group`` argument. +In a similar way, the ``group`` keyword argument can be given to the +:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group +in a netCDF file. +When writing multiple groups in one file, pass ``mode='a'`` to +:py:meth:`~xarray.Dataset.to_netcdf` to ensure that each call does not delete the file. + .. _io.encoding: Reading encoded data @@ -203,7 +210,7 @@ Reading encoded data NetCDF files follow some conventions for encoding datetime arrays (as numbers with a "units" attribute) and for packing and unpacking data (as described by the "scale_factor" and "add_offset" attributes). If the argument -``decode_cf=True`` (default) is given to ``open_dataset``, xarray will attempt +``decode_cf=True`` (default) is given to :py:func:`~xarray.open_dataset`, xarray will attempt to automatically decode the values in the netCDF objects according to `CF conventions`_. Sometimes this will fail, for example, if a variable has an invalid "units" or "calendar" attribute. For these cases, you can @@ -247,6 +254,130 @@ will remove encoding information. import os os.remove('saved_on_disk.nc') + +.. _combining multiple files: + +Reading multi-file datasets +........................... + +NetCDF files are often encountered in collections, e.g., with different files +corresponding to different model runs or one file per timestamp. +xarray can straightforwardly combine such files into a single Dataset by making use of +:py:func:`~xarray.concat`, :py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and +:py:func:`~xarray.combine_by_coords`. For details on the difference between these +functions see :ref:`combining data`. + +Xarray includes support for manipulating datasets that don't fit into memory +with dask_. If you have dask installed, you can open multiple files +simultaneously in parallel using :py:func:`~xarray.open_mfdataset`:: + + xr.open_mfdataset('my/files/*.nc', parallel=True) + +This function automatically concatenates and merges multiple files into a +single xarray dataset. +It is the recommended way to open multiple files with xarray. +For more details on parallel reading, see :ref:`combining.multi`, :ref:`dask.io` and a +`blog post`_ by Stephan Hoyer. +:py:func:`~xarray.open_mfdataset` takes many kwargs that allow you to +control its behaviour (for e.g. ``parallel``, ``combine``, ``compat``, ``join``, ``concat_dim``). +See its docstring for more details. + + +.. 
note::
+
+    A common use-case involves a dataset distributed across a large number of files with
+    each file containing a large number of variables. Commonly a few of these variables
+    need to be concatenated along a dimension (say ``"time"``), while the rest are equal
+    across the datasets (ignoring floating point differences). The following command
+    with suitable modifications (such as ``parallel=True``) works well with such datasets::
+
+        xr.open_mfdataset('my/files/*.nc', concat_dim="time",
+                          data_vars='minimal', coords='minimal', compat='override')
+
+    This command concatenates variables along the ``"time"`` dimension, but only those that
+    already contain the ``"time"`` dimension (``data_vars='minimal', coords='minimal'``).
+    Variables that lack the ``"time"`` dimension are taken from the first dataset
+    (``compat='override'``).
+
+
+.. _dask: http://dask.pydata.org
+.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/
+
+Sometimes multi-file datasets are not conveniently organized for easy use of :py:func:`~xarray.open_mfdataset`.
+One can use the ``preprocess`` argument to provide a function that takes a dataset
+and returns a modified Dataset.
+:py:func:`~xarray.open_mfdataset` will call ``preprocess`` on every dataset
+(corresponding to each file) prior to combining them.
+
+
+If :py:func:`~xarray.open_mfdataset` does not meet your needs, other approaches are possible.
+The general pattern for parallel reading of multiple files
+using dask, modifying those datasets and then combining into a single ``Dataset`` is::
+
+    def modify(ds):
+        # modify ds here
+        return ds
+
+
+    # this is basically what open_mfdataset does
+    open_kwargs = dict(decode_cf=True, decode_times=False)
+    open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names]
+    tasks = [dask.delayed(modify)(task) for task in open_tasks]
+    datasets = dask.compute(tasks)  # get a list of xarray.Datasets
+    combined = xr.combine_nested(datasets)  # or some combination of concat, merge
+
+
+As an example, here's how we could approximate ``MFDataset`` from the netCDF4
+library::
+
+    from glob import glob
+    import xarray as xr
+
+    def read_netcdfs(files, dim):
+        # glob expands paths with * to a list of files, like the unix shell
+        paths = sorted(glob(files))
+        datasets = [xr.open_dataset(p) for p in paths]
+        combined = xr.concat(datasets, dim)
+        return combined
+
+    combined = read_netcdfs('/all/my/files/*.nc', dim='time')
+
+This function will work in many cases, but it's not very robust. First, it
+never closes files, which means it will fail once you need to load more than
+a few thousand files. Second, it assumes that you want all the data from each
+file and that it can all fit into memory. In many situations, you only need
+a small subset or an aggregated summary of the data from each file.
+ +Here's a slightly more sophisticated example of how to remedy these +deficiencies:: + + def read_netcdfs(files, dim, transform_func=None): + def process_one_path(path): + # use a context manager, to ensure the file gets closed after use + with xr.open_dataset(path) as ds: + # transform_func should do some sort of selection or + # aggregation + if transform_func is not None: + ds = transform_func(ds) + # load all data from the transformed dataset, to ensure we can + # use it after closing each original file + ds.load() + return ds + + paths = sorted(glob(files)) + datasets = [process_one_path(p) for p in paths] + combined = xr.concat(datasets, dim) + return combined + + # here we suppose we only care about the combined mean of each file; + # you might also use indexing operations like .sel to subset datasets + combined = read_netcdfs('/all/my/files/*.nc', dim='time', + transform_func=lambda ds: ds.mean()) + +This pattern works well and is very robust. We've used similar code to process +tens of thousands of files constituting 100s of GB of data. + + .. _io.netcdf.writing_encoded: Writing encoded data @@ -817,84 +948,3 @@ For CSV files, one might also consider `xarray_extras`_. .. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html .. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html - - -.. _combining multiple files: - - -Combining multiple files ------------------------- - -NetCDF files are often encountered in collections, e.g., with different files -corresponding to different model runs. xarray can straightforwardly combine such -files into a single Dataset by making use of :py:func:`~xarray.concat`, -:py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and -:py:func:`~xarray.combine_by_coords`. For details on the difference between these -functions see :ref:`combining data`. - -.. note:: - - Xarray includes support for manipulating datasets that don't fit into memory - with dask_. If you have dask installed, you can open multiple files - simultaneously using :py:func:`~xarray.open_mfdataset`:: - - xr.open_mfdataset('my/files/*.nc') - - This function automatically concatenates and merges multiple files into a - single xarray dataset. - It is the recommended way to open multiple files with xarray. - For more details, see :ref:`combining.multi`, :ref:`dask.io` and a - `blog post`_ by Stephan Hoyer. - -.. _dask: http://dask.pydata.org -.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ - -For example, here's how we could approximate ``MFDataset`` from the netCDF4 -library:: - - from glob import glob - import xarray as xr - - def read_netcdfs(files, dim): - # glob expands paths with * to a list of files, like the unix shell - paths = sorted(glob(files)) - datasets = [xr.open_dataset(p) for p in paths] - combined = xr.concat(dataset, dim) - return combined - - combined = read_netcdfs('/all/my/files/*.nc', dim='time') - -This function will work in many cases, but it's not very robust. First, it -never closes files, which means it will fail one you need to load more than -a few thousands file. Second, it assumes that you want all the data from each -file and that it can all fit into memory. In many situations, you only need -a small subset or an aggregated summary of the data from each file. 
- -Here's a slightly more sophisticated example of how to remedy these -deficiencies:: - - def read_netcdfs(files, dim, transform_func=None): - def process_one_path(path): - # use a context manager, to ensure the file gets closed after use - with xr.open_dataset(path) as ds: - # transform_func should do some sort of selection or - # aggregation - if transform_func is not None: - ds = transform_func(ds) - # load all data from the transformed dataset, to ensure we can - # use it after closing each original file - ds.load() - return ds - - paths = sorted(glob(files)) - datasets = [process_one_path(p) for p in paths] - combined = xr.concat(datasets, dim) - return combined - - # here we suppose we only care about the combined mean of each file; - # you might also use indexing operations like .sel to subset datasets - combined = read_netcdfs('/all/my/files/*.nc', dim='time', - transform_func=lambda ds: ds.mean()) - -This pattern works well and is very robust. We've used similar code to process -tens of thousands of files constituting 100s of GB of data. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ab4b17ff16d..492c9279e6b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -93,7 +93,7 @@ New functions/methods By `Deepak Cherian `_ and `David Mertz `_. -- Dataset plotting API for visualizing dependencies between two `DataArray`s! +- Dataset plotting API for visualizing dependencies between two DataArrays! Currently only :py:meth:`Dataset.plot.scatter` is implemented. By `Yohai Bar Sinai `_ and `Deepak Cherian `_ @@ -103,11 +103,30 @@ New functions/methods Enhancements ~~~~~~~~~~~~ -- Added ``join='override'``. This only checks that index sizes are equal among objects and skips - checking indexes for equality. By `Deepak Cherian `_. +- Multiple enhancements to :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset`. -- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. - It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian `_. + - Added ``compat='override'``. When merging, this option picks the variable from the first dataset + and skips all comparisons. + + - Added ``join='override'``. When aligning, this only checks that index sizes are equal among objects + and skips checking indexes for equality. + + - :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. + It is passed down to :py:func:`~xarray.align`. + + - :py:func:`~xarray.concat` now calls :py:func:`~xarray.merge` on variables that are not concatenated + (i.e. variables without ``concat_dim`` when ``data_vars`` or ``coords`` are ``"minimal"``). + :py:func:`~xarray.concat` passes its new ``compat`` kwarg down to :py:func:`~xarray.merge`. + (:issue:`2064`) + + Users can avoid a common bottleneck when using :py:func:`~xarray.open_mfdataset` on a large number of + files with variables that are known to be aligned and some of which need not be concatenated. + Slow equality comparisons can now be avoided, for e.g.:: + + data = xr.open_mfdataset(files, concat_dim='time', data_vars='minimal', + coords='minimal', compat='override', join='override') + + By `Deepak Cherian `_: - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if ``append_dim`` is set, as it will automatically be set to ``'a'`` internally. 
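
The existing bullet above notes that :py:meth:`~xarray.Dataset.to_zarr` no longer requires ``mode='a'`` when ``append_dim`` is given. A minimal sketch of that append workflow (assuming the zarr backend is installed and ``example.zarr`` is a fresh local store; the variable names are illustrative only)::

    import numpy as np

    import xarray as xr

    ds = xr.Dataset({"temp": ("time", np.arange(3))}, coords={"time": [0, 1, 2]})
    ds.to_zarr("example.zarr")  # initial write

    extra = xr.Dataset(
        {"temp": ("time", np.arange(3, 6))}, coords={"time": [3, 4, 5]}
    )
    # mode="a" is implied once append_dim is passed
    extra.to_zarr("example.zarr", append_dim="time")
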
diff --git a/xarray/backends/api.py b/xarray/backends/api.py index a20d3c2a306..1f0869cfc53 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -761,7 +761,7 @@ def open_mfdataset( `xarray.auto_combine` is used, but in the future this behavior will switch to use `xarray.combine_by_coords` by default. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: * 'broadcast_equals': all values must be equal when variables are @@ -772,6 +772,7 @@ def open_mfdataset( * 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + * 'override': skip comparing and pick variable from first dataset preprocess : callable, optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c24be88b19e..e35bb51e030 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -243,6 +243,7 @@ def _combine_1d( dim=concat_dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -351,7 +352,7 @@ def combine_nested( Must be the same length as the depth of the list passed to ``datasets``. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential merge conflicts: @@ -363,6 +364,7 @@ def combine_nested( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -504,7 +506,7 @@ def combine_by_coords( datasets : sequence of xarray.Dataset Dataset objects to combine. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -516,6 +518,7 @@ def combine_by_coords( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -598,6 +601,7 @@ def combine_by_coords( concat_dims=concat_dims, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -667,7 +671,7 @@ def auto_combine( component files. Set ``concat_dim=None`` explicitly to disable concatenation. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: - 'broadcast_equals': all values must be equal when variables are @@ -678,6 +682,7 @@ def auto_combine( - 'no_conflicts': only values which are not null in both datasets must be equal. 
The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' o list of str}, optional @@ -832,6 +837,7 @@ def _old_auto_combine( dim=dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -850,6 +856,7 @@ def _auto_concat( coords="different", fill_value=dtypes.NA, join="outer", + compat="no_conflicts", ): if len(datasets) == 1 and dim is None: # There is nothing more to combine, so kick out early. @@ -876,5 +883,10 @@ def _auto_concat( ) dim, = concat_dims return concat( - datasets, dim=dim, data_vars=data_vars, coords=coords, fill_value=fill_value + datasets, + dim=dim, + data_vars=data_vars, + coords=coords, + fill_value=fill_value, + compat=compat, ) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d5dfa49a8d5..e68c247d880 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -4,6 +4,7 @@ from . import dtypes, utils from .alignment import align +from .merge import unique_variable, _VALID_COMPAT from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars @@ -59,12 +60,19 @@ def concat( those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition to the 'minimal' coordinates. - compat : {'equals', 'identical'}, optional - String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. 'equals' means - that all variable values and dimensions must be the same; - 'identical' means that variable attributes and global attributes - must also be equal. + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional + String indicating how to compare non-concatenated variables of the same name for + potential conflicts. This is passed down to merge. + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + - 'override': skip comparing and pick variable from first dataset positions : None or list of integer arrays, optional List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. If not @@ -107,6 +115,12 @@ def concat( except StopIteration: raise ValueError("must supply at least one object to concatenate") + if compat not in _VALID_COMPAT: + raise ValueError( + "compat=%r invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'" + % compat + ) + if isinstance(first_obj, DataArray): f = _dataarray_concat elif isinstance(first_obj, Dataset): @@ -143,23 +157,39 @@ def _calc_concat_dim_coord(dim): return dim, coord -def _calc_concat_over(datasets, dim, data_vars, coords): +def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): """ Determine which dataset variables need to be concatenated in the result, - and which can simply be taken from the first dataset. 
""" # Return values concat_over = set() equals = {} - if dim in datasets[0]: + if dim in dim_names: + concat_over_existing_dim = True concat_over.add(dim) + else: + concat_over_existing_dim = False + + concat_dim_lengths = [] for ds in datasets: + if concat_over_existing_dim: + if dim not in ds.dims: + if dim in ds: + ds = ds.set_coords(dim) + else: + raise ValueError("%r is not present in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) + concat_dim_lengths.append(ds.dims.get(dim, 1)) def process_subset_opt(opt, subset): if isinstance(opt, str): if opt == "different": + if compat == "override": + raise ValueError( + "Cannot specify both %s='different' and compat='override'." + % subset + ) # all nonindexes that are not the same in each dataset for k in getattr(datasets[0], subset): if k not in concat_over: @@ -173,7 +203,7 @@ def process_subset_opt(opt, subset): for ds_rhs in datasets[1:]: v_rhs = ds_rhs.variables[k].compute() computed.append(v_rhs) - if not v_lhs.equals(v_rhs): + if not getattr(v_lhs, compat)(v_rhs): concat_over.add(k) equals[k] = False # computed variables are not to be re-computed @@ -209,7 +239,29 @@ def process_subset_opt(opt, subset): process_subset_opt(data_vars, "data_vars") process_subset_opt(coords, "coords") - return concat_over, equals + return concat_over, equals, concat_dim_lengths + + +# determine dimensional coordinate names and a dict mapping name to DataArray +def _parse_datasets(datasets): + + dims = set() + all_coord_names = set() + data_vars = set() # list of data_vars + dim_coords = dict() # maps dim name to variable + dims_sizes = {} # shared dimension sizes to expand variables + + for ds in datasets: + dims_sizes.update(ds.dims) + all_coord_names.update(ds.coords) + data_vars.update(ds.data_vars) + + for dim in set(ds.dims) - dims: + if dim not in dim_coords: + dim_coords[dim] = ds.coords[dim].variable + dims = dims | set(ds.dims) + + return dim_coords, dims_sizes, all_coord_names, data_vars def _dataset_concat( @@ -227,11 +279,6 @@ def _dataset_concat( """ from .dataset import Dataset - if compat not in ["equals", "identical"]: - raise ValueError( - "compat=%r invalid: must be 'equals' " "or 'identical'" % compat - ) - dim, coord = _calc_concat_dim_coord(dim) # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] @@ -239,62 +286,65 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) + dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets) + dim_names = set(dim_coords) + unlabeled_dims = dim_names - coord_names + + both_data_and_coords = coord_names & data_names + if both_data_and_coords: + raise ValueError( + "%r is a coordinate in some datasets but not others." 
% both_data_and_coords + ) + # we don't want the concat dimension in the result dataset yet + dim_coords.pop(dim, None) + dims_sizes.pop(dim, None) + + # case where concat dimension is a coordinate or data_var but not a dimension + if (dim in coord_names or dim in data_names) and dim not in dim_names: + datasets = [ds.expand_dims(dim) for ds in datasets] + + # determine which variables to concatentate + concat_over, equals, concat_dim_lengths = _calc_concat_over( + datasets, dim, dim_names, data_vars, coords, compat + ) + + # determine which variables to merge, and then merge them according to compat + variables_to_merge = (coord_names | data_names) - concat_over - dim_names + + result_vars = {} + if variables_to_merge: + to_merge = {var: [] for var in variables_to_merge} + + for ds in datasets: + absent_merge_vars = variables_to_merge - set(ds.variables) + if absent_merge_vars: + raise ValueError( + "variables %r are present in some datasets but not others. " + % absent_merge_vars + ) - def insert_result_variable(k, v): - assert isinstance(v, Variable) - if k in datasets[0].coords: - result_coord_names.add(k) - result_vars[k] = v + for var in variables_to_merge: + to_merge[var].append(ds.variables[var]) - # create the new dataset and add constant variables - result_vars = OrderedDict() - result_coord_names = set(datasets[0].coords) + for var in variables_to_merge: + result_vars[var] = unique_variable( + var, to_merge[var], compat=compat, equals=equals.get(var, None) + ) + else: + result_vars = OrderedDict() + result_vars.update(dim_coords) + + # assign attrs and encoding from first dataset result_attrs = datasets[0].attrs result_encoding = datasets[0].encoding - for k, v in datasets[0].variables.items(): - if k not in concat_over: - insert_result_variable(k, v) - - # check that global attributes and non-concatenated variables are fixed - # across all datasets + # check that global attributes are fixed across all datasets if necessary for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): - raise ValueError("dataset global attributes not equal") - for k, v in ds.variables.items(): - if k not in result_vars and k not in concat_over: - raise ValueError("encountered unexpected variable %r" % k) - elif (k in result_coord_names) != (k in ds.coords): - raise ValueError( - "%r is a coordinate in some datasets but not " "others" % k - ) - elif k in result_vars and k != dim: - # Don't use Variable.identical as it internally invokes - # Variable.equals, and we may already know the answer - if compat == "identical" and not utils.dict_equiv( - v.attrs, result_vars[k].attrs - ): - raise ValueError("variable %s not identical across datasets" % k) - - # Proceed with equals() - try: - # May be populated when using the "different" method - is_equal = equals[k] - except KeyError: - result_vars[k].load() - is_equal = v.equals(result_vars[k]) - if not is_equal: - raise ValueError("variable %s not equal across datasets" % k) + raise ValueError("Dataset global attributes not equal.") # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] - non_concat_dims = {} - for ds in datasets: - non_concat_dims.update(ds.dims) - non_concat_dims.pop(dim, None) - def ensure_common_dims(vars): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the @@ -302,25 +352,27 @@ def 
ensure_common_dims(vars): common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) if dim not in common_dims: common_dims = (dim,) + common_dims - for var, dim_len in zip(vars, dim_lengths): + for var, dim_len in zip(vars, concat_dim_lengths): if var.dims != common_dims: - common_shape = tuple( - non_concat_dims.get(d, dim_len) for d in common_dims - ) + common_shape = tuple(dims_sizes.get(d, dim_len) for d in common_dims) var = var.set_dims(common_dims, common_shape) yield var # stack up each variable to fill-out the dataset (in order) + # n.b. this loop preserves variable order, needed for groupby. for k in datasets[0].variables: if k in concat_over: vars = ensure_common_dims([ds.variables[k] for ds in datasets]) combined = concat_vars(vars, dim, positions) - insert_result_variable(k, combined) + assert isinstance(combined, Variable) + result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) + result = result.set_coords(coord_names) result.encoding = result_encoding + result = result.drop(unlabeled_dims, errors="ignore") + if coord is not None: # add concat dimension last to ensure that its in the final Dataset result[coord.name] = coord @@ -342,7 +394,7 @@ def _dataarray_concat( if data_vars != "all": raise ValueError( - "data_vars is not a valid argument when " "concatenating DataArray objects" + "data_vars is not a valid argument when concatenating DataArray objects" ) datasets = [] diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7937a352cc6..d9e98839419 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1551,8 +1551,8 @@ def set_index( obj : DataArray Another DataArray, with this data but replaced coordinates. - Example - ------- + Examples + -------- >>> arr = xr.DataArray(data=np.ones((2, 3)), ... dims=['x', 'y'], ... coords={'x': diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 225507b9204..6dba659f992 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -44,6 +44,7 @@ "broadcast_equals": 2, "minimal": 3, "no_conflicts": 4, + "override": 5, } ) @@ -70,8 +71,8 @@ class MergeError(ValueError): # TODO: move this to an xarray.exceptions module? -def unique_variable(name, variables, compat="broadcast_equals"): - # type: (Any, List[Variable], str) -> Variable +def unique_variable(name, variables, compat="broadcast_equals", equals=None): + # type: (Any, List[Variable], str, bool) -> Variable """Return the unique variable from a list of variables or raise MergeError. Parameters @@ -81,8 +82,10 @@ def unique_variable(name, variables, compat="broadcast_equals"): variables : list of xarray.Variable List of Variable objects, all of which go by the same name in different inputs. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Type of equality check to use. + equals: None or bool, + corresponding to result of compat test Returns ------- @@ -93,30 +96,38 @@ def unique_variable(name, variables, compat="broadcast_equals"): MergeError: if any of the variables are not equal. 
""" # noqa out = variables[0] - if len(variables) > 1: - combine_method = None - if compat == "minimal": - compat = "broadcast_equals" + if len(variables) == 1 or compat == "override": + return out + + combine_method = None + + if compat == "minimal": + compat = "broadcast_equals" + + if compat == "broadcast_equals": + dim_lengths = broadcast_dimension_size(variables) + out = out.set_dims(dim_lengths) + + if compat == "no_conflicts": + combine_method = "fillna" - if compat == "broadcast_equals": - dim_lengths = broadcast_dimension_size(variables) - out = out.set_dims(dim_lengths) + if equals is None: + out = out.compute() + for var in variables[1:]: + equals = getattr(out, compat)(var) + if not equals: + break - if compat == "no_conflicts": - combine_method = "fillna" + if not equals: + raise MergeError( + "conflicting values for variable %r on objects to be combined. You can skip this check by specifying compat='override'." + % (name) + ) + if combine_method: for var in variables[1:]: - if not getattr(out, compat)(var): - raise MergeError( - "conflicting values for variable %r on " - "objects to be combined:\n" - "first value: %r\nsecond value: %r" % (name, out, var) - ) - if combine_method: - # TODO: add preservation of attrs into fillna - out = getattr(out, combine_method)(var) - out.attrs = var.attrs + out = getattr(out, combine_method)(var) return out @@ -152,7 +163,7 @@ def merge_variables( priority_vars : mapping with Variable or None values, optional If provided, variables are always taken from this dict in preference to the input variable dictionaries, without checking for conflicts. - compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts', 'override'}, optional Type of equality check to use when checking for conflicts. Returns @@ -449,7 +460,7 @@ def merge_core( ---------- objs : list of mappings All values must be convertable to labeled arrays. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Compatibility checks to use when merging variables. join : {'outer', 'inner', 'left', 'right'}, optional How to combine objects with different indexes. @@ -519,7 +530,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]] Merge together all variables from these objects. If any of them are DataArray objects, they must have a name. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -531,6 +542,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes in objects. 
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index f786a851e62..1abca30d199 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -327,13 +327,13 @@ class TestCheckShapeTileIDs: def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent depths"): + with raises_regex(ValueError, "sub-lists do not have consistent depths"): _check_shape_tile_ids(combined_tile_ids) def test_check_lengths(self): ds = create_test_data(0) combined_tile_ids = {(0, 0): ds, (0, 1): ds, (0, 2): ds, (1, 0): ds, (1, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent lengths"): + with raises_regex(ValueError, "sub-lists do not have consistent lengths"): _check_shape_tile_ids(combined_tile_ids) @@ -565,11 +565,6 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({"x": [0]}) assert_identical(expected, actual) - def test_combine_nested_but_need_auto_combine(self): - objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2], "wall": [0]})] - with raises_regex(ValueError, "cannot be combined"): - combine_nested(objs, concat_dim="x") - @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_combine_nested_fill_value(self, fill_value): datasets = [ @@ -618,7 +613,7 @@ def test_combine_by_coords(self): assert_equal(actual, expected) objs = [Dataset({"x": 0}), Dataset({"x": 1})] - with raises_regex(ValueError, "Could not find any dimension " "coordinates"): + with raises_regex(ValueError, "Could not find any dimension coordinates"): combine_by_coords(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] @@ -761,7 +756,7 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with pytest.raises(KeyError): + with raises_regex(ValueError, "'y' is not present in all datasets"): auto_combine(objs) def test_auto_combine_previously_failed(self): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index ee99ca027d9..00428f70966 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -5,8 +5,7 @@ import pytest from xarray import DataArray, Dataset, Variable, concat -from xarray.core import dtypes - +from xarray.core import dtypes, merge from . 
import ( InaccessibleArray, assert_array_equal, @@ -18,6 +17,34 @@ from .test_dataset import create_test_data +def test_concat_compat(): + ds1 = Dataset( + { + "has_x_y": (("y", "x"), [[1, 2]]), + "has_x": ("x", [1, 2]), + "no_x_y": ("z", [1, 2]), + }, + coords={"x": [0, 1], "y": [0], "z": [-1, -2]}, + ) + ds2 = Dataset( + { + "has_x_y": (("y", "x"), [[3, 4]]), + "has_x": ("x", [1, 2]), + "no_x_y": (("q", "z"), [[1, 2]]), + }, + coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, + ) + + result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") + assert_equal(ds2.no_x_y, result.no_x_y.transpose()) + + for var in ["has_x", "no_x_y"]: + assert "y" not in result[var] + + with raises_regex(ValueError, "'q' is not present in all datasets"): + concat([ds1, ds2], dim="q", data_vars="all", compat="broadcast_equals") + + class TestConcatDataset: @pytest.fixture def data(self): @@ -92,7 +119,7 @@ def test_concat_coords(self): actual = concat(objs, dim="x", coords=coords) assert_identical(expected, actual) for coords in ["minimal", []]: - with raises_regex(ValueError, "not equal across"): + with raises_regex(merge.MergeError, "conflicting values"): concat(objs, dim="x", coords=coords) def test_concat_constant_index(self): @@ -103,8 +130,10 @@ def test_concat_constant_index(self): for mode in ["different", "all", ["foo"]]: actual = concat([ds1, ds2], "y", data_vars=mode) assert_identical(expected, actual) - with raises_regex(ValueError, "not equal across datasets"): - concat([ds1, ds2], "y", data_vars="minimal") + with raises_regex(merge.MergeError, "conflicting values"): + # previously dim="y", and raised error which makes no sense. + # "foo" has dimension "y" so minimal should concatenate it? + concat([ds1, ds2], "new_dim", data_vars="minimal") def test_concat_size0(self): data = create_test_data() @@ -134,6 +163,14 @@ def test_concat_errors(self): data = create_test_data() split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] + with raises_regex(ValueError, "must supply at least one"): + concat([], "dim1") + + with raises_regex(ValueError, "Cannot specify both .*='different'"): + concat( + [data, data], dim="concat_dim", data_vars="different", compat="override" + ) + with raises_regex(ValueError, "must supply at least one"): concat([], "dim1") @@ -146,7 +183,7 @@ def test_concat_errors(self): concat([data0, data1], "dim1", compat="identical") assert_identical(data, concat([data0, data1], "dim1", compat="equals")) - with raises_regex(ValueError, "encountered unexpected"): + with raises_regex(ValueError, "present in some datasets"): data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) concat([data0, data1], "dim1") diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index d105765481e..76b3ed1a8d6 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -825,7 +825,6 @@ def kernel(name): """Dask kernel to test pickling/unpickling and __repr__. Must be global to make it pickleable. 
""" - print("kernel(%s)" % name) global kernel_call_count kernel_call_count += 1 return np.ones(1, dtype=np.int64) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index ed1453ce95d..c1e6c7a5ce8 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -196,6 +196,8 @@ def test_merge_compat(self): with raises_regex(ValueError, "compat=.* invalid"): ds1.merge(ds2, compat="foobar") + assert ds1.identical(ds1.merge(ds2, compat="override")) + def test_merge_auto_align(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) From d087fc58c40be0490151cb011802a609a774aaba Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Sep 2019 14:49:31 +0000 Subject: [PATCH 31/43] Raise error if cmap is list of colors (#3310) * Raise error if cmap is list of colors * whats-new.rst --- doc/whats-new.rst | 2 ++ xarray/plot/utils.py | 6 ++---- xarray/tests/test_plot.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 492c9279e6b..567e74052d5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,6 +34,8 @@ Breaking changes - :py:func:`~xarray.concat` now requires the ``dim`` argument. Its ``indexers``, ``mode`` and ``concat_over`` kwargs have now been removed. By `Deepak Cherian `_ +- Passing a list of colors in ``cmap`` will now raise an error, having been deprecated since + v0.6.1. - Most xarray objects now define ``__slots__``. This reduces overall RAM usage by ~22% (not counting the underlying numpy buffers); on CPython 3.7/x64, a trivial DataArray has gone down from 1.9kB to 1.5kB. diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 53bbe8bacb9..f69a8af7a2f 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -737,11 +737,9 @@ def _process_cmap_cbar_kwargs( # we should not be getting a list of colors in cmap anymore # is there a better way to do this test? if isinstance(cmap, (list, tuple)): - warnings.warn( + raise ValueError( "Specifying a list of colors in cmap is deprecated. " - "Use colors keyword instead.", - DeprecationWarning, - stacklevel=3, + "Use colors keyword instead." ) cmap_kwargs = { diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index c9b041b3ba7..020a49b0114 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -1320,8 +1320,8 @@ def test_cmap_and_color_both(self): with pytest.raises(ValueError): self.plotmethod(colors="k", cmap="RdBu") - def list_of_colors_in_cmap_deprecated(self): - with pytest.raises(Exception): + def list_of_colors_in_cmap_raises_error(self): + with raises_regex(ValueError, "list of colors"): self.plotmethod(cmap=["k", "b"]) @pytest.mark.slow From 99a5adc1ff65def531ccd55ca5a653dcd5bb4b47 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Sep 2019 17:22:57 +0000 Subject: [PATCH 32/43] Deprecation: groupby, resample default dim. (#3313) * Deprecation: groupby, resample default dim. * fix whats-new * found another test to fix. 
--- doc/whats-new.rst | 3 + xarray/core/dataset.py | 6 +- xarray/core/groupby.py | 109 ++------------------------------- xarray/core/resample.py | 5 +- xarray/tests/test_dataarray.py | 12 +--- xarray/tests/test_dataset.py | 12 ---- xarray/tests/test_groupby.py | 10 +-- 7 files changed, 17 insertions(+), 140 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 567e74052d5..57da2910d6e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -54,6 +54,9 @@ Breaking changes error in a later release. (:issue:`3250`) by `Guido Imperiale `_. +- The default dimension for :py:meth:`~xarray.Dataset.groupby`, :py:meth:`~xarray.Dataset.resample`, + :py:meth:`~xarray.DataArray.groupby` and :py:meth:`~xarray.DataArray.resample` reductions is now the + grouping or resampling dimension. - :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous positional arguments were deprecated) - Reindexing with variables of a different dimension now raise an error (previously deprecated) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8a53e7ba757..693e94e22dd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3875,9 +3875,7 @@ def reduce( Dataset with this object's DataArrays replaced with new DataArrays of summarized data and the indicated dimension(s) removed. """ - if dim is ALL_DIMS: - dim = None - if dim is None: + if dim is None or dim is ALL_DIMS: dims = set(self.dims) elif isinstance(dim, str) or not isinstance(dim, Iterable): dims = {dim} @@ -4803,7 +4801,7 @@ def quantile( if isinstance(dim, str): dims = {dim} - elif dim is None: + elif dim is None or dim is ALL_DIMS: dims = set(self.dims) else: dims = set(dim) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 41de4846e81..bae3057aabe 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd -from . import dtypes, duck_array_ops, nputils, ops, utils +from . import dtypes, duck_array_ops, nputils, ops from .arithmetic import SupportsArithmetic -from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce +from .common import ImplementsArrayReduce, ImplementsDatasetReduce from .concat import concat from .options import _get_keep_attrs from .pycompat import integer_types @@ -700,19 +700,8 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, DataArray.quantile """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process - if self._obj.ndim > 1: - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) + if dim is None: + dim = self._group_dim out = self.apply( self._obj.__class__.quantile, @@ -758,20 +747,6 @@ def reduce( Array with summarized data and the indicated dimension(s) removed. """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process - if self._obj.ndim > 1: - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. 
To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) - if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) @@ -780,43 +755,6 @@ def reduce_array(ar): return self.apply(reduce_array, shortcut=shortcut) - # TODO remove the following class method and DEFAULT_DIMS after the - # deprecation cycle - @classmethod - def _reduce_method(cls, func, include_skipna, numeric_only): - if include_skipna: - - def wrapped_func( - self, - dim=DEFAULT_DIMS, - axis=None, - skipna=None, - keep_attrs=None, - **kwargs - ): - return self.reduce( - func, - dim, - axis, - keep_attrs=keep_attrs, - skipna=skipna, - allow_lazy=True, - **kwargs - ) - - else: - - def wrapped_func( # type: ignore - self, dim=DEFAULT_DIMS, axis=None, keep_attrs=None, **kwargs - ): - return self.reduce( - func, dim, axis, keep_attrs=keep_attrs, allow_lazy=True, **kwargs - ) - - return wrapped_func - - -DEFAULT_DIMS = utils.ReprObject("") ops.inject_reduce_methods(DataArrayGroupBy) ops.inject_binary_ops(DataArrayGroupBy) @@ -898,19 +836,7 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): Array with summarized data and the indicated dimension(s) removed. """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process. Do not forget to remove _reduce_method - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) - elif dim is None: + if dim is None: dim = self._group_dim if keep_attrs is None: @@ -921,31 +847,6 @@ def reduce_dataset(ds): return self.apply(reduce_dataset) - # TODO remove the following class method and DEFAULT_DIMS after the - # deprecation cycle - @classmethod - def _reduce_method(cls, func, include_skipna, numeric_only): - if include_skipna: - - def wrapped_func(self, dim=DEFAULT_DIMS, skipna=None, **kwargs): - return self.reduce( - func, - dim, - skipna=skipna, - numeric_only=numeric_only, - allow_lazy=True, - **kwargs - ) - - else: - - def wrapped_func(self, dim=DEFAULT_DIMS, **kwargs): # type: ignore - return self.reduce( - func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs - ) - - return wrapped_func - def assign(self, **kwargs): """Assign data variables by group. diff --git a/xarray/core/resample.py b/xarray/core/resample.py index de70ebb6950..1f2e5c0be43 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -1,5 +1,5 @@ from . import ops -from .groupby import DEFAULT_DIMS, DataArrayGroupBy, DatasetGroupBy +from .groupby import DataArrayGroupBy, DatasetGroupBy RESAMPLE_DIM = "__resample_dim__" @@ -307,9 +307,6 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): Array with summarized data and the indicated dimension(s) removed. 
""" - if dim == DEFAULT_DIMS: - dim = None - return super().reduce(func, dim, keep_attrs, **kwargs) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 49980c75b15..01e92bdd7be 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2499,16 +2499,6 @@ def test_groupby_sum(self): assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y")) assert_allclose(expected_sum_axis1, grouped.sum("y")) - def test_groupby_warning(self): - array = self.make_groupby_example_array() - grouped = array.groupby("y") - with pytest.warns(FutureWarning): - grouped.sum() - - @pytest.mark.skipif( - LooseVersion(xr.__version__) < LooseVersion("0.13"), - reason="not to forget the behavior change", - ) def test_groupby_sum_default(self): array = self.make_groupby_example_array() grouped = array.groupby("abc") @@ -2529,7 +2519,7 @@ def test_groupby_sum_default(self): } )["foo"] - assert_allclose(expected_sum_all, grouped.sum()) + assert_allclose(expected_sum_all, grouped.sum(dim="y")) def test_groupby_count(self): array = DataArray( diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d8401e0bd42..7d2b11d02c9 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3367,18 +3367,6 @@ def test_groupby_reduce(self): actual = data.groupby("letters").mean(ALL_DIMS) assert_allclose(expected, actual) - def test_groupby_warn(self): - data = Dataset( - { - "xy": (["x", "y"], np.random.randn(3, 4)), - "xonly": ("x", np.random.randn(3)), - "yonly": ("y", np.random.randn(4)), - "letters": ("y", ["a", "a", "b", "b"]), - } - ) - with pytest.warns(FutureWarning): - data.groupby("x").mean() - def test_groupby_math(self): def reorder_dims(x): return x.transpose("dim1", "dim2", "dim3", "time") diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 9127eb71cb7..ee17cc39064 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -134,21 +134,21 @@ def test_da_groupby_quantile(): [("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])], ) - actual_x = array.groupby("x").quantile(0) + actual_x = array.groupby("x").quantile(0, dim=xr.ALL_DIMS) expected_x = xr.DataArray([1, 4], [("x", [1, 2])]) assert_identical(expected_x, actual_x) - actual_y = array.groupby("y").quantile(0) + actual_y = array.groupby("y").quantile(0, dim=xr.ALL_DIMS) expected_y = xr.DataArray([1, 22], [("y", [0, 1])]) assert_identical(expected_y, actual_y) - actual_xx = array.groupby("x").quantile(0, dim="x") + actual_xx = array.groupby("x").quantile(0) expected_xx = xr.DataArray( [[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])] ) assert_identical(expected_xx, actual_xx) - actual_yy = array.groupby("y").quantile(0, dim="y") + actual_yy = array.groupby("y").quantile(0) expected_yy = xr.DataArray( [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]], [("x", [1, 1, 1, 2, 2]), ("y", [0, 1])], @@ -164,7 +164,7 @@ def test_da_groupby_quantile(): ) g = foo.groupby(foo.time.dt.month) - actual = g.quantile(0) + actual = g.quantile(0, dim=xr.ALL_DIMS) expected = xr.DataArray( [ 0.0, From 9fbe353d55feddb1cc42a8957171a07b23dd403f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Sep 2019 18:50:04 +0000 Subject: [PATCH 33/43] auto_combine deprecation to 0.14 (#3314) --- xarray/backends/api.py | 2 +- xarray/core/combine.py | 4 ++-- xarray/tests/test_combine.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1f0869cfc53..0d6dedac57e 100644 --- 
a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -915,7 +915,7 @@ def open_mfdataset( # Remove this after deprecation cycle from #2616 is complete basic_msg = dedent( """\ - In xarray version 0.13 the default behaviour of `open_mfdataset` + In xarray version 0.14 the default behaviour of `open_mfdataset` will change. To retain the existing behavior, pass combine='nested'. To use future default behavior, pass combine='by_coords'. See diff --git a/xarray/core/combine.py b/xarray/core/combine.py index e35bb51e030..be7fd86555c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -716,7 +716,7 @@ def auto_combine( if not from_openmfds: basic_msg = dedent( """\ - In xarray version 0.13 `auto_combine` will be deprecated. See + In xarray version 0.14 `auto_combine` will be deprecated. See http://xarray.pydata.org/en/stable/combining.html#combining-multi""" ) warnings.warn(basic_msg, FutureWarning, stacklevel=2) @@ -758,7 +758,7 @@ def auto_combine( message += dedent( """\ The datasets supplied require both concatenation and merging. From - xarray version 0.13 this will operation will require either using the + xarray version 0.14 this will operation will require either using the new `combine_nested` function (or the `combine='nested'` option to open_mfdataset), with a nested list structure such that you can combine along the dimensions {}. Alternatively if your datasets have global diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 1abca30d199..6037669ac07 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -714,7 +714,7 @@ def test_check_for_impossible_ordering(self): @pytest.mark.filterwarnings( - "ignore:In xarray version 0.13 `auto_combine` " "will be deprecated" + "ignore:In xarray version 0.14 `auto_combine` " "will be deprecated" ) @pytest.mark.filterwarnings("ignore:Also `open_mfdataset` will no longer") @pytest.mark.filterwarnings("ignore:The datasets supplied") From 5b727951b60f7fa7c096b5f0a6583aad3bff11dd Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Sep 2019 12:57:50 -0600 Subject: [PATCH 34/43] Release v0.13.0 --- doc/api.rst | 1 + doc/whats-new.rst | 46 ++++++++++++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 699687441d7..9b3d6dfaf95 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -610,6 +610,7 @@ Plotting Dataset.plot DataArray.plot + Dataset.plot.scatter plot.plot plot.contourf plot.contour diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 57da2910d6e..c40f7fc64ed 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,15 +15,20 @@ What's New .. _whats-new.0.13.0: -v0.13.0 (unreleased) --------------------- +v0.13.0 (10 July 2019) +---------------------- -This release increases the minimum required Python version from 3.5.0 to 3.5.3 -(:issue:`3089`). By `Guido Imperiale `_. +This release includes many exciting changes: wrapping of +`NEP18 `_ compliant +numpy-like arrays; new :py:meth:`~Dataset.plot.scatter` method that can scatter +two ``DataArrays`` in a ``Dataset`` against each other; support for converting pandas +DataFrames to xarray objects that wrap ``pydata/sparse``; and more! Breaking changes ~~~~~~~~~~~~~~~~ +- This release increases the minimum required Python version from 3.5.0 to 3.5.3 + (:issue:`3089`). By `Guido Imperiale `_. - The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated since v0.10.0. These are redundant with the ``isel`` / ``sel`` methods. 
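As a reminder of the replacement pattern referred to above (an illustrative sketch only; the array shape and indexer values are invented), pointwise selection is now written with ``DataArray`` indexers that share a common dimension::

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(12).reshape(3, 4), dims=("x", "y"))

    # formerly da.isel_points(x=[0, 2, 2], y=[1, 1, 3])
    picked = da.isel(
        x=xr.DataArray([0, 2, 2], dims="points"),
        y=xr.DataArray([1, 1, 3], dims="points"),
    )
    # picked has a single "points" dimension of length 3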
See :ref:`vectorized_indexing` for the details @@ -54,16 +59,16 @@ Breaking changes error in a later release. (:issue:`3250`) by `Guido Imperiale `_. -- The default dimension for :py:meth:`~xarray.Dataset.groupby`, :py:meth:`~xarray.Dataset.resample`, - :py:meth:`~xarray.DataArray.groupby` and :py:meth:`~xarray.DataArray.resample` reductions is now the +- The default dimension for :py:meth:`Dataset.groupby`, :py:meth:`Dataset.resample`, + :py:meth:`DataArray.groupby` and :py:meth:`DataArray.resample` reductions is now the grouping or resampling dimension. -- :py:meth:`~Dataset.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous +- :py:meth:`DataArray.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous positional arguments were deprecated) - Reindexing with variables of a different dimension now raise an error (previously deprecated) - :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of :py:func:`~xarray.broadcast`) -- :py:meth:`~Variable.expand_dims` is removed (previously deprecated in favor of - :py:meth:`~Variable.set_dims`) +- :py:meth:`Variable.expand_dims` is removed (previously deprecated in favor of + :py:meth:`Variable.set_dims`) New functions/methods ~~~~~~~~~~~~~~~~~~~~~ @@ -102,13 +107,15 @@ New functions/methods Currently only :py:meth:`Dataset.plot.scatter` is implemented. By `Yohai Bar Sinai `_ and `Deepak Cherian `_ -- Added `head`, `tail` and `thin` methods to `Dataset` and `DataArray`. (:issue:`319`) - By `Gerardo Rivera `_. +- Added :py:meth:`DataArray.head`, :py:meth:`DataArray.tail` and :py:meth:`DataArray.thin`; + as well as :py:meth:`Dataset.head`, :py:meth:`Dataset.tail` and :py:meth:`Dataset.thin` methods. + (:issue:`319`) By `Gerardo Rivera `_. Enhancements ~~~~~~~~~~~~ - Multiple enhancements to :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset`. + By `Deepak Cherian `_ - Added ``compat='override'``. When merging, this option picks the variable from the first dataset and skips all comparisons. @@ -131,8 +138,6 @@ Enhancements data = xr.open_mfdataset(files, concat_dim='time', data_vars='minimal', coords='minimal', compat='override', join='override') - By `Deepak Cherian `_: - - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if ``append_dim`` is set, as it will automatically be set to ``'a'`` internally. By `David Brochart `_. @@ -156,7 +161,8 @@ Enhancements when the user passes invalid arguments (:issue:`3176`). By `Gregory Gundersen `_. -- :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. By `Spencer Jones `_. +- :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. + By `Spencer Jones `_. Bug fixes ~~~~~~~~~ @@ -195,9 +201,7 @@ Bug fixes - Plots in 2 dimensions (pcolormesh, contour) now allow to specify levels as numpy array (:issue:`3284`). By `Mathias Hauser `_. - Fixed bug in :meth:`DataArray.quantile` failing to keep attributes when - `keep_attrs` was True (:issue:`3304`). By David Huard `_. - -.. _whats-new.0.12.3: + `keep_attrs` was True (:issue:`3304`). By David Huard ``_. Documentation ~~~~~~~~~~~~~ @@ -210,6 +214,8 @@ Documentation (:issue:`3227`). By `Gregory Gundersen `_. +.. _whats-new.0.12.3: + v0.12.3 (10 July 2019) ---------------------- @@ -224,14 +230,14 @@ New functions/methods as described in :ref:`reshape.stacking_different`. By `Noah Brenowitz `_. 
+Enhancements +~~~~~~~~~~~~ + - Support for renaming ``Dataset`` variables and dimensions independently with :py:meth:`~Dataset.rename_vars` and :py:meth:`~Dataset.rename_dims` (:issue:`3026`). By `Julia Kent `_. -Enhancements -~~~~~~~~~~~~ - - Add ``scales``, ``offsets``, ``units`` and ``descriptions`` attributes to :py:class:`~xarray.DataArray` returned by :py:func:`~xarray.open_rasterio`. (:issue:`3013`) From 3814f3cdbab6da968f51767c4e67d18bdf077761 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Sep 2019 13:35:24 -0600 Subject: [PATCH 35/43] Revert to dev version --- doc/whats-new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c40f7fc64ed..63e0d34523c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,11 @@ What's New import xarray as xr np.random.seed(123456) +.. _whats-new.0.13.1: + +v0.13.1 (unreleased) +-------------------- + .. _whats-new.0.13.0: v0.13.0 (10 July 2019) From 02e96618ccdc13d6fa7165278b934ab204dfeef2 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 17 Sep 2019 13:42:37 -0600 Subject: [PATCH 36/43] Fix whats-new date :/ --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 63e0d34523c..d50b2d53f92 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -20,8 +20,8 @@ v0.13.1 (unreleased) .. _whats-new.0.13.0: -v0.13.0 (10 July 2019) ----------------------- +v0.13.0 (17 Sep 2019) +--------------------- This release includes many exciting changes: wrapping of `NEP18 `_ compliant From fedc95ff760f1bb6a8325e318e548ceeae9900bf Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 17 Sep 2019 20:46:34 -0700 Subject: [PATCH 37/43] Clarify that "scatter" is a plotting method in what's new. (#3316) When I read this, I thought it was referring to scattering data somehow :). --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d50b2d53f92..42da3fa8e63 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,7 +25,7 @@ v0.13.0 (17 Sep 2019) This release includes many exciting changes: wrapping of `NEP18 `_ compliant -numpy-like arrays; new :py:meth:`~Dataset.plot.scatter` method that can scatter +numpy-like arrays; new :py:meth:`~Dataset.plot.scatter` plotting method that can scatter two ``DataArrays`` in a ``Dataset`` against each other; support for converting pandas DataFrames to xarray objects that wrap ``pydata/sparse``; and more! From fddced063b7ecbea6254dc1008bb4db15a5d9304 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Wed, 18 Sep 2019 16:53:50 +0100 Subject: [PATCH 38/43] Allow weakref (#3318) * Allow weakref * black * What's New tweak * Update doc/whats-new.rst Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 8 ++++++++ xarray/core/dataarray.py | 10 +++++++++- xarray/core/dataset.py | 1 + xarray/tests/test_dataarray.py | 11 +++++++++++ xarray/tests/test_dataset.py | 11 +++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 42da3fa8e63..eeb768224e6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -18,6 +18,14 @@ What's New v0.13.1 (unreleased) -------------------- +Bug fixes +~~~~~~~~~ +- Reintroduce support for :mod:`weakref` (broken in v0.13.0). Support has been + reinstated for :class:`DataArray` and :class:`Dataset` objects only. 
Internal xarray + objects remain unaddressable by weakref in order to save memory. + (:issue:`3317`) by `Guido Imperiale `_. + + .. _whats-new.0.13.0: v0.13.0 (17 Sep 2019) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d9e98839419..13a3a507c7b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -248,7 +248,15 @@ class DataArray(AbstractArray, DataWithCoords): Dictionary for holding arbitrary metadata. """ - __slots__ = ("_accessors", "_coords", "_file_obj", "_name", "_indexes", "_variable") + __slots__ = ( + "_accessors", + "_coords", + "_file_obj", + "_name", + "_indexes", + "_variable", + "__weakref__", + ) _groupby_cls = groupby.DataArrayGroupBy _rolling_cls = rolling.DataArrayRolling diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 693e94e22dd..3ba1bd9e3d8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -420,6 +420,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): "_file_obj", "_indexes", "_variables", + "__weakref__", ) _groupby_cls = groupby.DatasetGroupBy diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 01e92bdd7be..9ba3eecc5a0 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4672,3 +4672,14 @@ class MyArray(DataArray): pass assert str(e.value) == "MyArray must explicitly define __slots__" + + +def test_weakref(): + """Classes with __slots__ are incompatible with the weakref module unless they + explicitly state __weakref__ among their slots + """ + from weakref import ref + + a = DataArray(1) + r = ref(a) + assert r() is a diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7d2b11d02c9..f02990a1be9 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5786,3 +5786,14 @@ class MyDS(Dataset): pass assert str(e.value) == "MyDS must explicitly define __slots__" + + +def test_weakref(): + """Classes with __slots__ are incompatible with the weakref module unless they + explicitly state __weakref__ among their slots + """ + from weakref import ref + + ds = Dataset() + r = ref(ds) + assert r() is ds From df259331c5b66088f67738338e0b1b3f940e09c2 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 18 Sep 2019 11:33:15 -0700 Subject: [PATCH 39/43] Fix isel performance regression (#3319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix isel performance regression xref GH2227 Before: indexing.BooleanIndexing.time_indexing 898±0ms After indexing.BooleanIndexing.time_indexing 401±0ms * mypy fix --- asv_bench/benchmarks/indexing.py | 13 +++++++++++++ xarray/core/dataset.py | 12 ++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index cd212895d99..c4cfbbbdfdf 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -125,3 +125,16 @@ def setup(self, key): requires_dask() super().setup(key) self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + + +class BooleanIndexing: + # https://github.com/pydata/xarray/issues/2227 + def setup(self): + self.ds = xr.Dataset( + {"a": ("time", np.arange(10_000_000))}, + coords={"time": np.arange(10_000_000)}, + ) + self.time_filter = self.ds.time > 50_000 + + def time_indexing(self): + self.ds.isel(time=self.time_filter) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3ba1bd9e3d8..270ab81d264 100644 --- a/xarray/core/dataset.py +++ 
b/xarray/core/dataset.py @@ -1781,7 +1781,7 @@ def _validate_indexers( elif isinstance(v, Dataset): raise TypeError("cannot use a Dataset as an indexer") elif isinstance(v, Sequence) and len(v) == 0: - v = IndexVariable((k,), np.zeros((0,), dtype="int64")) + v = Variable((k,), np.zeros((0,), dtype="int64")) else: v = np.asarray(v) @@ -1795,16 +1795,13 @@ def _validate_indexers( if v.ndim == 0: v = Variable((), v) elif v.ndim == 1: - v = IndexVariable((k,), v) + v = Variable((k,), v) else: raise IndexError( "Unlabeled multi-dimensional array cannot be " "used for indexing: {}".format(k) ) - if v.ndim == 1: - v = v.to_index_variable() - indexers_list.append((k, v)) return indexers_list @@ -2367,7 +2364,10 @@ def interp( if kwargs is None: kwargs = {} coords = either_dict_or_kwargs(coords, coords_kwargs, "interp") - indexers = OrderedDict(self._validate_indexers(coords)) + indexers = OrderedDict( + (k, v.to_index_variable() if isinstance(v, Variable) and v.ndim == 1 else v) + for k, v in self._validate_indexers(coords) + ) obj = self if assume_sorted else self.sortby([k for k in coords]) From 4617e68bd14250a0da1448c66f25b2f44f8b60c8 Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 21 Sep 2019 03:16:10 +0200 Subject: [PATCH 40/43] fix the doc names of the return value of swap_dims (#3329) --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 13a3a507c7b..e4379bd50fb 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1453,7 +1453,7 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray": Returns ------- - renamed : Dataset + swapped : Dataset DataArray with swapped dimensions. See Also diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 270ab81d264..e01d83dd0a3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2676,7 +2676,7 @@ def swap_dims( Returns ------- - renamed : Dataset + swapped : Dataset Dataset with swapped dimensions. See Also From a5fe56a081ef59a7fffd1408bb18df2c9dfb4d1e Mon Sep 17 00:00:00 2001 From: keewis Date: Sun, 22 Sep 2019 18:30:24 +0200 Subject: [PATCH 41/43] Improve the documentation of swap_dims (#3331) * add an example to DataArray.swap_dims (and fix the documented return type) * add an example to Dataset.swap_dims * fix some errors in the swapped array's repr * remove a newline * mention changes in whatsnew --- doc/whats-new.rst | 4 ++++ xarray/core/dataarray.py | 19 ++++++++++++++++++- xarray/core/dataset.py | 23 +++++++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eeb768224e6..d8d3382675e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,10 @@ Bug fixes objects remain unaddressable by weakref in order to save memory. (:issue:`3317`) by `Guido Imperiale `_. +Documentation +~~~~~~~~~~~~~ +- Add examples for :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`. + By `Justus Magin `_. .. _whats-new.0.13.0: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e4379bd50fb..add2b1b6c01 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1453,9 +1453,26 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray": Returns ------- - swapped : Dataset + swapped : DataArray DataArray with swapped dimensions. 
+ Examples + -------- + >>> arr = xr.DataArray(data=[0, 1], dims="x", + coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + >>> arr + + array([0, 1]) + Coordinates: + * x (x) >> arr.swap_dims({"x": "y"}) + + array([0, 1]) + Coordinates: + x (y) >> ds = xr.Dataset(data_vars={"a": ("x", [5, 7]), "b": ("x", [0.1, 2.4])}, + coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + >>> ds + + Dimensions: (x: 2) + Coordinates: + * x (x) >> ds.swap_dims({"x": "y"}) + + Dimensions: (y: 2) + Coordinates: + x (y) Date: Mon, 23 Sep 2019 03:08:42 +0200 Subject: [PATCH 42/43] More doc fixes (#3333) * fix a code section being hidden * fix the code samples of DataArray.plot * fix a cross-reference --- doc/api.rst | 2 +- doc/reshaping.rst | 1 + xarray/core/dataarray.py | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index 9b3d6dfaf95..256a1dbf3af 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -8,7 +8,7 @@ This page provides an auto-generated summary of xarray's API. For more details and examples, refer to the relevant chapters in the main part of the documentation. -See also: :ref:`public api`_. +See also: :ref:`public api` Top-level functions =================== diff --git a/doc/reshaping.rst b/doc/reshaping.rst index b3abfc5afb0..51202f9be41 100644 --- a/doc/reshaping.rst +++ b/doc/reshaping.rst @@ -156,6 +156,7 @@ represented by a :py:class:`pandas.MultiIndex` object. These methods are used like this: .. ipython:: python + data = xr.Dataset( data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]), 'b': ('x', [6, 7])}, diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index add2b1b6c01..e63b6c9975f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2558,10 +2558,12 @@ def plot(self) -> _PlotMethods: >>> d = DataArray([[1, 2], [3, 4]]) For convenience just call this directly + >>> d.plot() Or use it as a namespace to use xarray.plot functions as DataArray methods + >>> d.plot.imshow() # equivalent to xarray.plot.imshow(d) """ From e1183e8d75b5a46e08442b69f0b67cd187399a07 Mon Sep 17 00:00:00 2001 From: rdturnermtl <28273671+rdturnermtl@users.noreply.github.com> Date: Mon, 23 Sep 2019 12:22:54 -0700 Subject: [PATCH 43/43] Add hypothesis support to related projects (#3335) * add hypothesis-gufunc to related projects. * keep in alphabetical order * Update doc/related-projects.rst Switch to documentation. Co-Authored-By: Deepak Cherian --- doc/related-projects.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/related-projects.rst b/doc/related-projects.rst index 58b9a7c22c9..647db5fd8e4 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -11,7 +11,7 @@ Geosciences ~~~~~~~~~~~ - `aospy `_: Automated analysis and management of gridded climate data. -- `climpred `_: Analysis of ensemble forecast models for climate prediction. +- `climpred `_: Analysis of ensemble forecast models for climate prediction. - `infinite-diff `_: xarray-based finite-differencing, focused on gridded climate/meterology data - `marc_analysis `_: Analysis package for CESM/MARC experiments and output. - `MetPy `_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data. @@ -26,7 +26,7 @@ Geosciences subclass. - `Regionmask `_: plotting and creation of masks of spatial regions - `salem `_: Adds geolocalised subsetting, masking, and plotting operations to xarray's data structures via accessors. 
-- `SatPy `_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats. +- `SatPy `_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats. - `Spyfit `_: FTIR spectroscopy of the atmosphere - `windspharm `_: Spherical harmonic wind analysis in Python. @@ -56,6 +56,7 @@ Extend xarray capabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~ - `Collocate `_: Collocate xarray trajectories in arbitrary physical dimensions - `eofs `_: EOF analysis in Python. +- `hypothesis-gufunc `_: Extension to hypothesis. Makes it easy to write unit tests with xarray objects as input. - `xarray_extras `_: Advanced algorithms for xarray objects (e.g. integrations/interpolations). - `xrft `_: Fourier transforms for xarray data. - `xr-scipy `_: A lightweight scipy wrapper for xarray.