diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 9314361e998..aa9662d44f9 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -1,4 +1,5 @@ import numpy as np + import xarray as xr diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index cd212895d99..c4cfbbbdfdf 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -125,3 +125,16 @@ def setup(self, key): requires_dask() super().setup(key) self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + + +class BooleanIndexing: + # https://github.com/pydata/xarray/issues/2227 + def setup(self): + self.ds = xr.Dataset( + {"a": ("time", np.arange(10_000_000))}, + coords={"time": np.arange(10_000_000)}, + ) + self.time_filter = self.ds.time > 50_000 + + def time_indexing(self): + self.ds.isel(time=self.time_filter) diff --git a/doc/api.rst b/doc/api.rst index 872e7786e1b..256a1dbf3af 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -8,7 +8,7 @@ This page provides an auto-generated summary of xarray's API. For more details and examples, refer to the relevant chapters in the main part of the documentation. -See also: :ref:`public api`_. +See also: :ref:`public api` Top-level functions =================== @@ -117,6 +117,9 @@ Indexing Dataset.loc Dataset.isel Dataset.sel + Dataset.head + Dataset.tail + Dataset.thin Dataset.squeeze Dataset.interp Dataset.interp_like @@ -279,6 +282,9 @@ Indexing DataArray.loc DataArray.isel DataArray.sel + DataArray.head + DataArray.tail + DataArray.thin DataArray.squeeze DataArray.interp DataArray.interp_like @@ -604,6 +610,7 @@ Plotting Dataset.plot DataArray.plot + Dataset.plot.scatter plot.plot plot.contourf plot.contour diff --git a/doc/dask.rst b/doc/dask.rst index b0ffd0c449d..19cbc11292c 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -75,13 +75,14 @@ entirely equivalent to opening a dataset using ``open_dataset`` and then chunking the data using the ``chunk`` method, e.g., ``xr.open_dataset('example-data.nc').chunk({'time': 10})``. -To open multiple files simultaneously, use :py:func:`~xarray.open_mfdataset`:: +To open multiple files simultaneously in parallel using Dask delayed, +use :py:func:`~xarray.open_mfdataset`:: - xr.open_mfdataset('my/files/*.nc') + xr.open_mfdataset('my/files/*.nc', parallel=True) This function will automatically concatenate and merge dataset into one in the simple cases that it understands (see :py:func:`~xarray.auto_combine` -for the full disclaimer). By default, ``open_mfdataset`` will chunk each +for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each netCDF file into a single Dask array; again, supply the ``chunks`` argument to control the size of the resulting Dask arrays. In more complex cases, you can open each file individually using ``open_dataset`` and merge the result, as @@ -132,6 +133,13 @@ A dataset can also be converted to a Dask DataFrame using :py:meth:`~xarray.Data Dask DataFrames do not support multi-indexes so the coordinate variables from the dataset are included as columns in the Dask DataFrame. +.. ipython:: python + :suppress: + + import os + os.remove('example-data.nc') + os.remove('manipulated-example-data.nc') + Using Dask with xarray ---------------------- @@ -373,12 +381,6 @@ one million elements (e.g., a 1000x1000 matrix). With large arrays (10+ GB), the cost of queueing up Dask operations can be noticeable, and you may need even larger chunksizes. -.. 
ipython:: python - :suppress: - - import os - os.remove('example-data.nc') - Optimization Tips ----------------- diff --git a/doc/gallery/plot_cartopy_facetgrid.py b/doc/gallery/plot_cartopy_facetgrid.py index a0afa7ad92e..af04ad6856a 100644 --- a/doc/gallery/plot_cartopy_facetgrid.py +++ b/doc/gallery/plot_cartopy_facetgrid.py @@ -41,6 +41,6 @@ ax.set_extent([-160, -30, 5, 75]) # Without this aspect attributes the maps will look chaotic and the # "extent" attribute above will be ignored - ax.set_aspect("equal", "box-forced") + ax.set_aspect("equal") plt.show() diff --git a/doc/indexing.rst b/doc/indexing.rst index 4c5b93db0b4..9ee8f1dddf8 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -236,9 +236,8 @@ The :py:meth:`~xarray.Dataset.drop` method returns a new object with the listed index labels along a dimension dropped: .. ipython:: python - :okwarning: - ds.drop(['IN', 'IL'], dim='space') + ds.drop(space=['IN', 'IL']) ``drop`` is both a ``Dataset`` and ``DataArray`` method. @@ -393,14 +392,6 @@ These methods may also be applied to ``Dataset`` objects You may find increased performance by loading your data into memory first, e.g., with :py:meth:`~xarray.Dataset.load`. -.. note:: - - Vectorized indexing is a new feature in v0.10. - In older versions of xarray, dimensions of indexers are ignored. - Dedicated methods for some advanced indexing use cases, - ``isel_points`` and ``sel_points`` are now deprecated. - See :ref:`more_advanced_indexing` for their alternative. - .. note:: If an indexer is a :py:meth:`~xarray.DataArray`, its coordinates should not diff --git a/doc/io.rst b/doc/io.rst index 4a61b59ac2a..775d915188e 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -99,7 +99,9 @@ netCDF The recommended way to store xarray data structures is `netCDF`__, which is a binary file format for self-described datasets that originated in the geosciences. xarray is based on the netCDF data model, so netCDF files -on disk directly correspond to :py:class:`~xarray.Dataset` objects. +on disk directly correspond to :py:class:`~xarray.Dataset` objects (more accurately, +a group in a netCDF file directly corresponds to a to :py:class:`~xarray.Dataset` object. +See :ref:`io.netcdf_groups` for more.) NetCDF is supported on almost all platforms, and parsers exist for the vast majority of scientific programming languages. Recent versions of @@ -121,7 +123,7 @@ read/write netCDF V4 files and use the compression options described below). __ https://github.com/Unidata/netcdf4-python We can save a Dataset to disk using the -:py:attr:`Dataset.to_netcdf ` method: +:py:meth:`~Dataset.to_netcdf` method: .. ipython:: python @@ -147,19 +149,6 @@ convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back when loading, ensuring that the ``DataArray`` that is loaded is always exactly the same as the one that was saved. -NetCDF groups are not supported as part of the -:py:class:`~xarray.Dataset` data model. Instead, groups can be loaded -individually as Dataset objects. -To do so, pass a ``group`` keyword argument to the -``open_dataset`` function. The group can be specified as a path-like -string, e.g., to access subgroup 'bar' within group 'foo' pass -'/foo/bar' as the ``group`` argument. -In a similar way, the ``group`` keyword argument can be given to the -:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group -in a netCDF file. -When writing multiple groups in one file, pass ``mode='a'`` to ``to_netcdf`` -to ensure that each call does not delete the file. 
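A minimal sketch of the group round-trip described here (and in the new ``Groups`` section added further below); the file and group names are made up::

    import xarray as xr

    ds_foo = xr.Dataset({"a": ("x", [1, 2, 3])})
    ds_bar = xr.Dataset({"b": ("y", [4.0, 5.0])})

    # the first call creates the file; later calls append with mode='a'
    ds_foo.to_netcdf("groups.nc", group="/foo")
    ds_bar.to_netcdf("groups.nc", group="/foo/bar", mode="a")

    # each group is opened as its own Dataset
    xr.open_dataset("groups.nc", group="/foo/bar")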
- Data is always loaded lazily from netCDF files. You can manipulate, slice and subset Dataset and DataArray objects, and no array values are loaded into memory until you try to perform some sort of actual computation. For an example of how these @@ -195,6 +184,24 @@ It is possible to append or overwrite netCDF variables using the ``mode='a'`` argument. When using this option, all variables in the dataset will be written to the original netCDF file, regardless if they exist in the original dataset. + +.. _io.netcdf_groups: + +Groups +~~~~~~ + +NetCDF groups are not supported as part of the :py:class:`~xarray.Dataset` data model. +Instead, groups can be loaded individually as Dataset objects. +To do so, pass a ``group`` keyword argument to the +:py:func:`~xarray.open_dataset` function. The group can be specified as a path-like +string, e.g., to access subgroup ``'bar'`` within group ``'foo'`` pass +``'/foo/bar'`` as the ``group`` argument. +In a similar way, the ``group`` keyword argument can be given to the +:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group +in a netCDF file. +When writing multiple groups in one file, pass ``mode='a'`` to +:py:meth:`~xarray.Dataset.to_netcdf` to ensure that each call does not delete the file. + .. _io.encoding: Reading encoded data @@ -203,7 +210,7 @@ Reading encoded data NetCDF files follow some conventions for encoding datetime arrays (as numbers with a "units" attribute) and for packing and unpacking data (as described by the "scale_factor" and "add_offset" attributes). If the argument -``decode_cf=True`` (default) is given to ``open_dataset``, xarray will attempt +``decode_cf=True`` (default) is given to :py:func:`~xarray.open_dataset`, xarray will attempt to automatically decode the values in the netCDF objects according to `CF conventions`_. Sometimes this will fail, for example, if a variable has an invalid "units" or "calendar" attribute. For these cases, you can @@ -247,6 +254,130 @@ will remove encoding information. import os os.remove('saved_on_disk.nc') + +.. _combining multiple files: + +Reading multi-file datasets +........................... + +NetCDF files are often encountered in collections, e.g., with different files +corresponding to different model runs or one file per timestamp. +xarray can straightforwardly combine such files into a single Dataset by making use of +:py:func:`~xarray.concat`, :py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and +:py:func:`~xarray.combine_by_coords`. For details on the difference between these +functions see :ref:`combining data`. + +Xarray includes support for manipulating datasets that don't fit into memory +with dask_. If you have dask installed, you can open multiple files +simultaneously in parallel using :py:func:`~xarray.open_mfdataset`:: + + xr.open_mfdataset('my/files/*.nc', parallel=True) + +This function automatically concatenates and merges multiple files into a +single xarray dataset. +It is the recommended way to open multiple files with xarray. +For more details on parallel reading, see :ref:`combining.multi`, :ref:`dask.io` and a +`blog post`_ by Stephan Hoyer. +:py:func:`~xarray.open_mfdataset` takes many kwargs that allow you to +control its behaviour (for e.g. ``parallel``, ``combine``, ``compat``, ``join``, ``concat_dim``). +See its docstring for more details. + + +.. note:: + + A common use-case involves a dataset distributed across a large number of files with + each file containing a large number of variables. 
Commonly a few of these variables + need to be concatenated along a dimension (say ``"time"``), while the rest are equal + across the datasets (ignoring floating point differences). The following command + with suitable modifications (such as ``parallel=True``) works well with such datasets:: + + xr.open_mfdataset('my/files/*.nc', concat_dim="time", + data_vars='minimal', coords='minimal', compat='override') + + This command concatenates variables along the ``"time"`` dimension, but only those that + already contain the ``"time"`` dimension (``data_vars='minimal', coords='minimal'``). + Variables that lack the ``"time"`` dimension are taken from the first dataset + (``compat='override'``). + + +.. _dask: http://dask.pydata.org +.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ + +Sometimes multi-file datasets are not conveniently organized for easy use of :py:func:`~xarray.open_mfdataset`. +One can use the ``preprocess`` argument to provide a function that takes a dataset +and returns a modified Dataset. +:py:func:`~xarray.open_mfdataset` will call ``preprocess`` on every dataset +(corresponding to each file) prior to combining them. + + +If :py:func:`~xarray.open_mfdataset` does not meet your needs, other approaches are possible. +The general pattern for parallel reading of multiple files +using dask, modifying those datasets and then combining into a single ``Dataset`` is:: + + def modify(ds): + # modify ds here + return ds + + + # this is basically what open_mfdataset does + open_kwargs = dict(decode_cf=True, decode_times=False) + open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names] + tasks = [dask.delayed(modify)(task) for task in open_tasks] + datasets = dask.compute(tasks) # get a list of xarray.Datasets + combined = xr.combine_nested(datasets) # or some combination of concat, merge + + +As an example, here's how we could approximate ``MFDataset`` from the netCDF4 +library:: + + from glob import glob + import xarray as xr + + def read_netcdfs(files, dim): + # glob expands paths with * to a list of files, like the unix shell + paths = sorted(glob(files)) + datasets = [xr.open_dataset(p) for p in paths] + combined = xr.concat(dataset, dim) + return combined + + combined = read_netcdfs('/all/my/files/*.nc', dim='time') + +This function will work in many cases, but it's not very robust. First, it +never closes files, which means it will fail one you need to load more than +a few thousands file. Second, it assumes that you want all the data from each +file and that it can all fit into memory. In many situations, you only need +a small subset or an aggregated summary of the data from each file. 
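For the :py:func:`~xarray.open_mfdataset` route, the ``preprocess`` argument described above covers this kind of per-file subsetting; a minimal sketch, with a hypothetical variable name ``precip``::

    import xarray as xr

    def keep_precip(ds):
        # select only what we need from each file before the datasets are combined
        return ds[["precip"]].reset_coords(drop=True)

    combined = xr.open_mfdataset("my/files/*.nc", preprocess=keep_precip)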
+ +Here's a slightly more sophisticated example of how to remedy these +deficiencies:: + + def read_netcdfs(files, dim, transform_func=None): + def process_one_path(path): + # use a context manager, to ensure the file gets closed after use + with xr.open_dataset(path) as ds: + # transform_func should do some sort of selection or + # aggregation + if transform_func is not None: + ds = transform_func(ds) + # load all data from the transformed dataset, to ensure we can + # use it after closing each original file + ds.load() + return ds + + paths = sorted(glob(files)) + datasets = [process_one_path(p) for p in paths] + combined = xr.concat(datasets, dim) + return combined + + # here we suppose we only care about the combined mean of each file; + # you might also use indexing operations like .sel to subset datasets + combined = read_netcdfs('/all/my/files/*.nc', dim='time', + transform_func=lambda ds: ds.mean()) + +This pattern works well and is very robust. We've used similar code to process +tens of thousands of files constituting 100s of GB of data. + + .. _io.netcdf.writing_encoded: Writing encoded data @@ -743,6 +874,13 @@ be done directly from zarr, as described in the .. _io.cfgrib: +.. ipython:: python + :suppress: + + import shutil + shutil.rmtree('foo.zarr') + shutil.rmtree('path/to/directory.zarr') + GRIB format via cfgrib ---------------------- @@ -810,84 +948,3 @@ For CSV files, one might also consider `xarray_extras`_. .. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html .. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html - - -.. _combining multiple files: - - -Combining multiple files ------------------------- - -NetCDF files are often encountered in collections, e.g., with different files -corresponding to different model runs. xarray can straightforwardly combine such -files into a single Dataset by making use of :py:func:`~xarray.concat`, -:py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and -:py:func:`~xarray.combine_by_coords`. For details on the difference between these -functions see :ref:`combining data`. - -.. note:: - - Xarray includes support for manipulating datasets that don't fit into memory - with dask_. If you have dask installed, you can open multiple files - simultaneously using :py:func:`~xarray.open_mfdataset`:: - - xr.open_mfdataset('my/files/*.nc') - - This function automatically concatenates and merges multiple files into a - single xarray dataset. - It is the recommended way to open multiple files with xarray. - For more details, see :ref:`combining.multi`, :ref:`dask.io` and a - `blog post`_ by Stephan Hoyer. - -.. _dask: http://dask.pydata.org -.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ - -For example, here's how we could approximate ``MFDataset`` from the netCDF4 -library:: - - from glob import glob - import xarray as xr - - def read_netcdfs(files, dim): - # glob expands paths with * to a list of files, like the unix shell - paths = sorted(glob(files)) - datasets = [xr.open_dataset(p) for p in paths] - combined = xr.concat(dataset, dim) - return combined - - combined = read_netcdfs('/all/my/files/*.nc', dim='time') - -This function will work in many cases, but it's not very robust. First, it -never closes files, which means it will fail one you need to load more than -a few thousands file. Second, it assumes that you want all the data from each -file and that it can all fit into memory. 
In many situations, you only need -a small subset or an aggregated summary of the data from each file. - -Here's a slightly more sophisticated example of how to remedy these -deficiencies:: - - def read_netcdfs(files, dim, transform_func=None): - def process_one_path(path): - # use a context manager, to ensure the file gets closed after use - with xr.open_dataset(path) as ds: - # transform_func should do some sort of selection or - # aggregation - if transform_func is not None: - ds = transform_func(ds) - # load all data from the transformed dataset, to ensure we can - # use it after closing each original file - ds.load() - return ds - - paths = sorted(glob(files)) - datasets = [process_one_path(p) for p in paths] - combined = xr.concat(datasets, dim) - return combined - - # here we suppose we only care about the combined mean of each file; - # you might also use indexing operations like .sel to subset datasets - combined = read_netcdfs('/all/my/files/*.nc', dim='time', - transform_func=lambda ds: ds.mean()) - -This pattern works well and is very robust. We've used similar code to process -tens of thousands of files constituting 100s of GB of data. diff --git a/doc/related-projects.rst b/doc/related-projects.rst index 58b9a7c22c9..647db5fd8e4 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -11,7 +11,7 @@ Geosciences ~~~~~~~~~~~ - `aospy `_: Automated analysis and management of gridded climate data. -- `climpred `_: Analysis of ensemble forecast models for climate prediction. +- `climpred `_: Analysis of ensemble forecast models for climate prediction. - `infinite-diff `_: xarray-based finite-differencing, focused on gridded climate/meterology data - `marc_analysis `_: Analysis package for CESM/MARC experiments and output. - `MetPy `_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data. @@ -26,7 +26,7 @@ Geosciences subclass. - `Regionmask `_: plotting and creation of masks of spatial regions - `salem `_: Adds geolocalised subsetting, masking, and plotting operations to xarray's data structures via accessors. -- `SatPy `_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats. +- `SatPy `_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats. - `Spyfit `_: FTIR spectroscopy of the atmosphere - `windspharm `_: Spherical harmonic wind analysis in Python. @@ -56,6 +56,7 @@ Extend xarray capabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~ - `Collocate `_: Collocate xarray trajectories in arbitrary physical dimensions - `eofs `_: EOF analysis in Python. +- `hypothesis-gufunc `_: Extension to hypothesis. Makes it easy to write unit tests with xarray objects as input. - `xarray_extras `_: Advanced algorithms for xarray objects (e.g. integrations/interpolations). - `xrft `_: Fourier transforms for xarray data. - `xr-scipy `_: A lightweight scipy wrapper for xarray. diff --git a/doc/reshaping.rst b/doc/reshaping.rst index b3abfc5afb0..51202f9be41 100644 --- a/doc/reshaping.rst +++ b/doc/reshaping.rst @@ -156,6 +156,7 @@ represented by a :py:class:`pandas.MultiIndex` object. These methods are used like this: .. 
ipython:: python + data = xr.Dataset( data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]), 'b': ('x', [6, 7])}, diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index a17ecd2f2a4..96641c2b97e 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -137,6 +137,12 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.to_netcdf('example-no-leap.nc') xr.open_dataset('example-no-leap.nc') +.. ipython:: python + :suppress: + + import os + os.remove('example-no-leap.nc') + - And resampling along the time dimension for data indexed by a :py:class:`~xarray.CFTimeIndex`: .. ipython:: python diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1fc96019c4d..39ca1c204c6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,13 +13,79 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.13.0: +.. _whats-new.0.13.1: -v0.13.0 (unreleased) +v0.13.1 (unreleased) -------------------- -This release increases the minimum required Python version from 3.5.0 to 3.5.3 -(:issue:`3089`). By `Guido Imperiale `_. +Bug fixes +~~~~~~~~~ +- Reintroduce support for :mod:`weakref` (broken in v0.13.0). Support has been + reinstated for :class:`DataArray` and :class:`Dataset` objects only. Internal xarray + objects remain unaddressable by weakref in order to save memory. + (:issue:`3317`) by `Guido Imperiale `_. + +Documentation +~~~~~~~~~~~~~ +- Add examples for :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`. + By `Justus Magin `_. + +.. _whats-new.0.13.0: + +v0.13.0 (17 Sep 2019) +--------------------- + +This release includes many exciting changes: wrapping of +`NEP18 `_ compliant +numpy-like arrays; new :py:meth:`~Dataset.plot.scatter` plotting method that can scatter +two ``DataArrays`` in a ``Dataset`` against each other; support for converting pandas +DataFrames to xarray objects that wrap ``pydata/sparse``; and more! + +Breaking changes +~~~~~~~~~~~~~~~~ + +- This release increases the minimum required Python version from 3.5.0 to 3.5.3 + (:issue:`3089`). By `Guido Imperiale `_. +- The ``isel_points`` and ``sel_points`` methods are removed, having been deprecated + since v0.10.0. These are redundant with the ``isel`` / ``sel`` methods. + See :ref:`vectorized_indexing` for the details + By `Maximilian Roos `_ +- The ``inplace`` kwarg for public methods now raises an error, having been deprecated + since v0.11.0. + By `Maximilian Roos `_ +- :py:func:`~xarray.concat` now requires the ``dim`` argument. Its ``indexers``, ``mode`` + and ``concat_over`` kwargs have now been removed. + By `Deepak Cherian `_ +- Passing a list of colors in ``cmap`` will now raise an error, having been deprecated since + v0.6.1. +- Most xarray objects now define ``__slots__``. This reduces overall RAM usage by ~22% + (not counting the underlying numpy buffers); on CPython 3.7/x64, a trivial DataArray + has gone down from 1.9kB to 1.5kB. + + Caveats: + + - Pickle streams produced by older versions of xarray can't be loaded using this + release, and vice versa. + - Any user code that was accessing the ``__dict__`` attribute of + xarray objects will break. The best practice to attach custom metadata to xarray + objects is to use the ``attrs`` dictionary. + - Any user code that defines custom subclasses of xarray classes must now explicitly + define ``__slots__`` itself. Subclasses that don't add any attributes must state so + by defining ``__slots__ = ()`` right after the class header. 
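For example, a subclass that adds no attributes of its own would now be written as (class name hypothetical)::

    import xarray as xr

    class MyDataArray(xr.DataArray):
        __slots__ = ()  # no additional instance attributes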
+ Omitting ``__slots__`` will now cause a ``FutureWarning`` to be logged, and will raise an + error in a later release. + + (:issue:`3250`) by `Guido Imperiale `_. +- The default dimension for :py:meth:`Dataset.groupby`, :py:meth:`Dataset.resample`, + :py:meth:`DataArray.groupby` and :py:meth:`DataArray.resample` reductions is now the + grouping or resampling dimension. +- :py:meth:`DataArray.to_dataset` requires ``name`` to be passed as a kwarg (previously ambiguous + positional arguments were deprecated) +- Reindexing with variables of a different dimension now raise an error (previously deprecated) +- :py:func:`~xarray.broadcast_array` is removed (previously deprecated in favor of + :py:func:`~xarray.broadcast`) +- :py:meth:`Variable.expand_dims` is removed (previously deprecated in favor of + :py:meth:`Variable.set_dims`) New functions/methods ~~~~~~~~~~~~~~~~~~~~~ @@ -28,10 +94,16 @@ New functions/methods `NEP18 `_ compliant numpy-like library (important: read notes about NUMPY_EXPERIMENTAL_ARRAY_FUNCTION in the above link). Added explicit test coverage for - `sparse `_. (:issue:`3117`, :issue:`3202`) - By `Nezar Abdennur `_ + `sparse `_. (:issue:`3117`, :issue:`3202`). + This requires `sparse>=0.8.0`. By `Nezar Abdennur `_ and `Guido Imperiale `_. +- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now + support ``sparse=True`` for converting pandas objects into xarray objects + wrapping sparse arrays. This is particularly useful with sparsely populated + hierarchical indexes. (:issue:`3206`) + By `Stephan Hoyer `_. + - The xarray package is now discoverable by mypy (although typing hints coverage is not complete yet). mypy type checking is now enforced by CI. Libraries that depend on xarray and use mypy can now remove from their setup.cfg the lines:: @@ -45,13 +117,17 @@ New functions/methods and `Maximilian Roos `_. - Added :py:meth:`DataArray.broadcast_like` and :py:meth:`Dataset.broadcast_like`. - By `Deepak Cherian `_ and `David Mertz + By `Deepak Cherian `_ and `David Mertz `_. -- Dataset plotting API for visualizing dependencies between two `DataArray`s! +- Dataset plotting API for visualizing dependencies between two DataArrays! Currently only :py:meth:`Dataset.plot.scatter` is implemented. By `Yohai Bar Sinai `_ and `Deepak Cherian `_ +- Added :py:meth:`DataArray.head`, :py:meth:`DataArray.tail` and :py:meth:`DataArray.thin`; + as well as :py:meth:`Dataset.head`, :py:meth:`Dataset.tail` and :py:meth:`Dataset.thin` methods. + (:issue:`319`) By `Gerardo Rivera `_. + Enhancements ~~~~~~~~~~~~ @@ -61,19 +137,46 @@ Enhancements By `Robert Hetland ` - Added ``join='override'``. This only checks that index sizes are equal among objects and skips checking indexes for equality. By `Deepak Cherian `_. +- Multiple enhancements to :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset`. + By `Deepak Cherian `_ + + - Added ``compat='override'``. When merging, this option picks the variable from the first dataset + and skips all comparisons. + + - Added ``join='override'``. When aligning, this only checks that index sizes are equal among objects + and skips checking indexes for equality. + + - :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. + It is passed down to :py:func:`~xarray.align`. + + - :py:func:`~xarray.concat` now calls :py:func:`~xarray.merge` on variables that are not concatenated + (i.e. variables without ``concat_dim`` when ``data_vars`` or ``coords`` are ``"minimal"``). 
+ :py:func:`~xarray.concat` passes its new ``compat`` kwarg down to :py:func:`~xarray.merge`. + (:issue:`2064`) -- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. - It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian `_. + Users can avoid a common bottleneck when using :py:func:`~xarray.open_mfdataset` on a large number of + files with variables that are known to be aligned and some of which need not be concatenated. + Slow equality comparisons can now be avoided, for e.g.:: + + data = xr.open_mfdataset(files, concat_dim='time', data_vars='minimal', + coords='minimal', compat='override', join='override') - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if ``append_dim`` is set, as it will automatically be set to ``'a'`` internally. By `David Brochart `_. + +- Added the ability to initialize an empty or full DataArray + with a single value. (:issue:`277`) + By `Gerardo Rivera `_. + - :py:func:`~xarray.Dataset.to_netcdf()` now supports the ``invalid_netcdf`` kwarg when used with ``engine="h5netcdf"``. It is passed to :py:func:`h5netcdf.File`. By `Ulrich Herter `_. - :py:meth:`~xarray.Dataset.drop` now supports keyword arguments; dropping index - labels by specifying both ``dim`` and ``labels`` is deprecated (:issue:`2910`). + labels by using both ``dim`` and ``labels`` or using a + :py:class:`~xarray.core.coordinates.DataArrayCoordinates` object are + deprecated (:issue:`2910`). By `Gregory Gundersen `_. - Added examples of :py:meth:`Dataset.set_index` and @@ -81,9 +184,15 @@ Enhancements when the user passes invalid arguments (:issue:`3176`). By `Gregory Gundersen `_. +- :py:func:`filter_by_attrs` now filters the coordinates as well as the variables. + By `Spencer Jones `_. + Bug fixes ~~~~~~~~~ +- Improve "missing dimensions" error message for :py:func:`~xarray.apply_ufunc` + (:issue:`2078`). + By `Rick Russotto `_. - :py:meth:`~xarray.DataArray.assign_coords` now supports dictionary arguments (:issue:`3231`). By `Gregory Gundersen `_. @@ -110,8 +219,12 @@ Bug fixes - Fix error that arises when using open_mfdataset on a series of netcdf files having differing values for a variable attribute of type list. (:issue:`3034`) By `Hasan Ahmad `_. - -.. _whats-new.0.12.3: +- Prevent :py:meth:`~xarray.DataArray.argmax` and :py:meth:`~xarray.DataArray.argmin` from calling + dask compute (:issue:`3237`). By `Ulrich Herter `_. +- Plots in 2 dimensions (pcolormesh, contour) now allow to specify levels as numpy + array (:issue:`3284`). By `Mathias Hauser `_. +- Fixed bug in :meth:`DataArray.quantile` failing to keep attributes when + `keep_attrs` was True (:issue:`3304`). By David Huard ``_. Documentation ~~~~~~~~~~~~~ @@ -120,6 +233,12 @@ Documentation or pushing new commits. By `Gregory Gundersen `_. +- Fixed documentation to clean up unwanted files created in ``ipython`` examples + (:issue:`3227`). + By `Gregory Gundersen `_. + +.. _whats-new.0.12.3: + v0.12.3 (10 July 2019) ---------------------- @@ -134,14 +253,14 @@ New functions/methods as described in :ref:`reshape.stacking_different`. By `Noah Brenowitz `_. +Enhancements +~~~~~~~~~~~~ + - Support for renaming ``Dataset`` variables and dimensions independently with :py:meth:`~Dataset.rename_vars` and :py:meth:`~Dataset.rename_dims` (:issue:`3026`). By `Julia Kent `_. -Enhancements -~~~~~~~~~~~~ - - Add ``scales``, ``offsets``, ``units`` and ``descriptions`` attributes to :py:class:`~xarray.DataArray` returned by :py:func:`~xarray.open_rasterio`. 
(:issue:`3013`) diff --git a/doc/why-xarray.rst b/doc/why-xarray.rst index d0a6c591b29..25d558d99d5 100644 --- a/doc/why-xarray.rst +++ b/doc/why-xarray.rst @@ -62,9 +62,8 @@ The power of the dataset over a plain dictionary is that, in addition to pulling out arrays by name, it is possible to select or combine data along a dimension across all arrays simultaneously. Like a :py:class:`~pandas.DataFrame`, datasets facilitate array operations with -heterogeneous data -- the difference is that the arrays in a dataset can not -only have different data types, but can also have different numbers of -dimensions. +heterogeneous data -- the difference is that the arrays in a dataset can have +not only different data types, but also different numbers of dimensions. This data model is borrowed from the netCDF_ file format, which also provides xarray with a natural and portable serialization format. NetCDF is very popular diff --git a/setup.cfg b/setup.cfg index 6cb58d2b9a2..114f71f4a9f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,8 +25,6 @@ ignore= E731 # line break before binary operator W503 - # Unused imports; TODO: Allow typing to work without triggering errors - F401 exclude= doc diff --git a/xarray/__init__.py b/xarray/__init__.py index a3df034f7c7..cdca708e28c 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -6,7 +6,7 @@ __version__ = get_versions()["version"] del get_versions -from .core.alignment import align, broadcast, broadcast_arrays +from .core.alignment import align, broadcast from .core.common import full_like, zeros_like, ones_like from .core.concat import concat from .core.combine import combine_by_coords, combine_nested, auto_combine diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 887af0023fb..0d6dedac57e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -6,6 +6,7 @@ from pathlib import Path from textwrap import dedent from typing import ( + TYPE_CHECKING, Callable, Dict, Hashable, @@ -13,21 +14,19 @@ Mapping, Tuple, Union, - TYPE_CHECKING, ) import numpy as np -from .. import Dataset, DataArray, backends, conventions, coding +from .. import DataArray, Dataset, auto_combine, backends, coding, conventions from ..core import indexing -from .. import auto_combine from ..core.combine import ( - combine_by_coords, - _nested_combine, _infer_concat_order_from_positions, + _nested_combine, + combine_by_coords, ) from ..core.utils import close_on_error, is_grib_path, is_remote_uri -from .common import ArrayWriter, AbstractDataStore +from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler if TYPE_CHECKING: @@ -695,6 +694,8 @@ def open_dataarray( class _MultiFileCloser: + __slots__ = ("file_objs",) + def __init__(self, file_objs): self.file_objs = file_objs @@ -760,7 +761,7 @@ def open_mfdataset( `xarray.auto_combine` is used, but in the future this behavior will switch to use `xarray.combine_by_coords` by default. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: * 'broadcast_equals': all values must be equal when variables are @@ -771,6 +772,7 @@ def open_mfdataset( * 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. 
+ * 'override': skip comparing and pick variable from first dataset preprocess : callable, optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in @@ -913,7 +915,7 @@ def open_mfdataset( # Remove this after deprecation cycle from #2616 is complete basic_msg = dedent( """\ - In xarray version 0.13 the default behaviour of `open_mfdataset` + In xarray version 0.14 the default behaviour of `open_mfdataset` will change. To retain the existing behavior, pass combine='nested'. To use future default behavior, pass combine='by_coords'. See diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 7ee11052192..455b77907f9 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -68,12 +68,16 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): + __slots__ = () + def __array__(self, dtype=None): key = indexing.BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) class AbstractDataStore(Mapping): + __slots__ = () + def __iter__(self): return iter(self.variables) @@ -165,6 +169,8 @@ def __exit__(self, exception_type, exception_value, traceback): class ArrayWriter: + __slots__ = ("sources", "targets", "regions", "lock") + def __init__(self, lock=None): self.sources = [] self.targets = [] @@ -205,6 +211,8 @@ def sync(self, compute=True): class AbstractWritableDataStore(AbstractDataStore): + __slots__ = () + def encode(self, variables, attributes): """ Encode the variables and attributes in this store @@ -371,6 +379,8 @@ def set_dimensions(self, variables, unlimited_dims=None): class WritableCFDataStore(AbstractWritableDataStore): + __slots__ = () + def encode(self, variables, attributes): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index edc28c7b0ff..0c5fe9087d2 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -5,7 +5,7 @@ from .. import Variable from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error +from ..core.utils import FrozenOrderedDict from .common import WritableCFDataStore from .file_manager import CachingFileManager from .locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 9866a2fe344..813942c2f32 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -10,7 +10,7 @@ from .. 
import Variable, coding from ..coding.variables import pop_to from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error, is_remote_uri +from ..core.utils import FrozenOrderedDict, is_remote_uri from .common import ( BackendArray, WritableCFDataStore, @@ -30,6 +30,8 @@ class BaseNetCDF4Array(BackendArray): + __slots__ = ("datastore", "dtype", "shape", "variable_name") + def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -52,8 +54,13 @@ def __setitem__(self, key, value): if self.datastore.autoclose: self.datastore.close(needs_lock=False) + def get_array(self, needs_lock=True): + raise NotImplementedError("Virtual Method") + class NetCDF4ArrayWrapper(BaseNetCDF4Array): + __slots__ = () + def get_array(self, needs_lock=True): ds = self.datastore._acquire(needs_lock) variable = ds.variables[self.variable_name] @@ -294,6 +301,17 @@ class NetCDF4DataStore(WritableCFDataStore): This store supports NetCDF3, NetCDF4 and OpenDAP datasets. """ + __slots__ = ( + "autoclose", + "format", + "is_remote", + "lock", + "_filename", + "_group", + "_manager", + "_mode", + ) + def __init__( self, manager, group=None, mode=None, lock=NETCDF4_PYTHON_LOCK, autoclose=False ): diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 1d832d4f671..316f13470b7 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -322,11 +322,14 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc attrs["units"] = riods.units # Parse extra metadata from tags, if supported - parsers = {"ENVI": _parse_envi} + parsers = {"ENVI": _parse_envi, "GTiff": lambda m: m} driver = riods.driver if driver in parsers: - meta = parsers[driver](riods.tags(ns=driver)) + if driver == "GTiff": + meta = parsers[driver](riods.tags()) + else: + meta = parsers[driver](riods.tags(ns=driver)) for k, v in meta.items(): # Add values as coordinates if they match the band count, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 31997d258c8..9a115de55ef 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -29,6 +29,8 @@ def _encode_zarr_attr_value(value): class ZarrArrayWrapper(BackendArray): + __slots__ = ("datastore", "dtype", "shape", "variable_name") + def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -231,6 +233,15 @@ class ZarrStore(AbstractWritableDataStore): """Store for reading and writing data via zarr """ + __slots__ = ( + "append_dim", + "ds", + "_consolidate_on_close", + "_group", + "_read_only", + "_synchronizer", + ) + @classmethod def open_group( cls, diff --git a/xarray/conventions.py b/xarray/conventions.py index c15e5c40e73..1e40d254e96 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -31,6 +31,8 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): dtype('int16') """ + __slots__ = ("array",) + def __init__(self, array): self.array = indexing.as_indexable(array) @@ -60,6 +62,8 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): dtype('bool') """ + __slots__ = ("array",) + def __init__(self, array): self.array = indexing.as_indexable(array) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 03a6d37b01e..8838e71e6ca 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -75,6 +75,8 @@ class StringAccessor: """ + __slots__ = ("_obj",) + def __init__(self, obj): self._obj = obj diff --git 
a/xarray/core/alignment.py b/xarray/core/alignment.py index bb44f48fb9b..d63718500bc 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -1,9 +1,8 @@ import functools import operator -import warnings from collections import OrderedDict, defaultdict from contextlib import suppress -from typing import Any, Dict, Hashable, Mapping, Optional, Tuple, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Hashable, Mapping, Optional, Tuple, Union import numpy as np import pandas as pd @@ -14,8 +13,8 @@ from .variable import IndexVariable, Variable if TYPE_CHECKING: - from .dataarray import DataArray - from .dataset import Dataset + from .dataarray import DataArray # noqa: F401 + from .dataset import Dataset # noqa: F401 def _get_joiner(join): @@ -387,14 +386,9 @@ def reindex_variables( for dim, indexer in indexers.items(): if isinstance(indexer, DataArray) and indexer.dims != (dim,): - warnings.warn( + raise ValueError( "Indexer has dimensions {:s} that are different " - "from that to be indexed along {:s}. " - "This will behave differently in the future.".format( - str(indexer.dims), dim - ), - FutureWarning, - stacklevel=3, + "from that to be indexed along {:s}".format(str(indexer.dims), dim) ) target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim]) @@ -592,14 +586,3 @@ def broadcast(*args, exclude=None): result.append(_broadcast_helper(arg, exclude, dims_map, common_coords)) return tuple(result) - - -def broadcast_arrays(*args): - import warnings - - warnings.warn( - "xarray.broadcast_arrays is deprecated: use " "xarray.broadcast instead", - DeprecationWarning, - stacklevel=2, - ) - return broadcast(*args) diff --git a/xarray/core/arithmetic.py b/xarray/core/arithmetic.py index 5e8c8758ef5..137db034c95 100644 --- a/xarray/core/arithmetic.py +++ b/xarray/core/arithmetic.py @@ -14,6 +14,8 @@ class SupportsArithmetic: Used by Dataset, DataArray, Variable and GroupBy. """ + __slots__ = () + # TODO: implement special methods for arithmetic here rather than injecting # them in xarray/core/ops.py. Ideally, do so by inheriting from # numpy.lib.mixins.NDArrayOperatorsMixin. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 3aae12c3b66..be7fd86555c 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -5,10 +5,10 @@ import pandas as pd +from . import dtypes +from .concat import concat from .dataarray import DataArray from .dataset import Dataset -from .concat import concat -from . import dtypes from .merge import merge @@ -243,6 +243,7 @@ def _combine_1d( dim=concat_dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -351,7 +352,7 @@ def combine_nested( Must be the same length as the depth of the list passed to ``datasets``. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential merge conflicts: @@ -363,6 +364,7 @@ def combine_nested( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -504,7 +506,7 @@ def combine_by_coords( datasets : sequence of xarray.Dataset Dataset objects to combine. 
compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -516,6 +518,7 @@ def combine_by_coords( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -598,6 +601,7 @@ def combine_by_coords( concat_dims=concat_dims, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -667,7 +671,7 @@ def auto_combine( component files. Set ``concat_dim=None`` explicitly to disable concatenation. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: - 'broadcast_equals': all values must be equal when variables are @@ -678,6 +682,7 @@ def auto_combine( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' o list of str}, optional @@ -711,7 +716,7 @@ def auto_combine( if not from_openmfds: basic_msg = dedent( """\ - In xarray version 0.13 `auto_combine` will be deprecated. See + In xarray version 0.14 `auto_combine` will be deprecated. See http://xarray.pydata.org/en/stable/combining.html#combining-multi""" ) warnings.warn(basic_msg, FutureWarning, stacklevel=2) @@ -753,7 +758,7 @@ def auto_combine( message += dedent( """\ The datasets supplied require both concatenation and merging. From - xarray version 0.13 this will operation will require either using the + xarray version 0.14 this will operation will require either using the new `combine_nested` function (or the `combine='nested'` option to open_mfdataset), with a nested list structure such that you can combine along the dimensions {}. Alternatively if your datasets have global @@ -832,6 +837,7 @@ def _old_auto_combine( dim=dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -850,6 +856,7 @@ def _auto_concat( coords="different", fill_value=dtypes.NA, join="outer", + compat="no_conflicts", ): if len(datasets) == 1 and dim is None: # There is nothing more to combine, so kick out early. 
@@ -876,5 +883,10 @@ def _auto_concat( ) dim, = concat_dims return concat( - datasets, dim=dim, data_vars=data_vars, coords=coords, fill_value=fill_value + datasets, + dim=dim, + data_vars=data_vars, + coords=coords, + fill_value=fill_value, + compat=compat, ) diff --git a/xarray/core/common.py b/xarray/core/common.py index 2e834492521..ab9e7616ce1 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1,3 +1,4 @@ +import warnings from collections import OrderedDict from contextlib import suppress from textwrap import dedent @@ -35,6 +36,8 @@ class ImplementsArrayReduce: + __slots__ = () + @classmethod def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): if include_skipna: @@ -72,6 +75,8 @@ def wrapped_func(self, dim=None, axis=None, **kwargs): # type: ignore class ImplementsDatasetReduce: + __slots__ = () + @classmethod def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): if include_skipna: @@ -110,6 +115,8 @@ class AbstractArray(ImplementsArrayReduce): """Shared base class for DataArray and Variable. """ + __slots__ = () + def __bool__(self: Any) -> bool: return bool(self.values) @@ -180,7 +187,25 @@ class AttrAccessMixin: """Mixin class that allows getting keys with attribute access """ - _initialized = False + __slots__ = () + + def __init_subclass__(cls): + """Verify that all subclasses explicitly define ``__slots__``. If they don't, + raise error in the core xarray module and a FutureWarning in third-party + extensions. + This check is only triggered in Python 3.6+. + """ + if not hasattr(object.__new__(cls), "__dict__"): + cls.__setattr__ = cls._setattr_slots + elif cls.__module__.startswith("xarray."): + raise AttributeError("%s must explicitly define __slots__" % cls.__name__) + else: + cls.__setattr__ = cls._setattr_dict + warnings.warn( + "xarray subclass %s should explicitly define __slots__" % cls.__name__, + FutureWarning, + stacklevel=2, + ) @property def _attr_sources(self) -> List[Mapping[Hashable, Any]]: @@ -195,7 +220,7 @@ def _item_sources(self) -> List[Mapping[Hashable, Any]]: return [] def __getattr__(self, name: str) -> Any: - if name != "__setstate__": + if name not in {"__dict__", "__setstate__"}: # this avoids an infinite loop when pickle looks for the # __setstate__ attribute before the xarray object is initialized for source in self._attr_sources: @@ -205,20 +230,52 @@ def __getattr__(self, name: str) -> Any: "%r object has no attribute %r" % (type(self).__name__, name) ) - def __setattr__(self, name: str, value: Any) -> None: - if self._initialized: - try: - # Allow setting instance variables if they already exist - # (e.g., _attrs). We use __getattribute__ instead of hasattr - # to avoid key lookups with attribute-style access. - self.__getattribute__(name) - except AttributeError: - raise AttributeError( - "cannot set attribute %r on a %r object. Use __setitem__ " - "style assignment (e.g., `ds['name'] = ...`) instead to " - "assign variables." % (name, type(self).__name__) - ) + # This complicated three-method design boosts overall performance of simple + # operations - particularly DataArray methods that perform a _to_temp_dataset() + # round-trip - by a whopping 8% compared to a single method that checks + # hasattr(self, "__dict__") at runtime before every single assignment (like + # _setattr_py35 does). All of this is just temporary until the FutureWarning can be + # changed into a hard crash. 
+ def _setattr_dict(self, name: str, value: Any) -> None: + """Deprecated third party subclass (see ``__init_subclass__`` above) + """ object.__setattr__(self, name, value) + if name in self.__dict__: + # Custom, non-slotted attr, or improperly assigned variable? + warnings.warn( + "Setting attribute %r on a %r object. Explicitly define __slots__ " + "to suppress this warning for legitimate custom attributes and " + "raise an error when attempting variables assignments." + % (name, type(self).__name__), + FutureWarning, + stacklevel=2, + ) + + def _setattr_slots(self, name: str, value: Any) -> None: + """Objects with ``__slots__`` raise AttributeError if you try setting an + undeclared attribute. This is desirable, but the error message could use some + improvement. + """ + try: + object.__setattr__(self, name, value) + except AttributeError as e: + # Don't accidentally shadow custom AttributeErrors, e.g. + # DataArray.dims.setter + if str(e) != "%r object has no attribute %r" % (type(self).__name__, name): + raise + raise AttributeError( + "cannot set attribute %r on a %r object. Use __setitem__ style" + "assignment (e.g., `ds['name'] = ...`) instead of assigning variables." + % (name, type(self).__name__) + ) from e + + def _setattr_py35(self, name: str, value: Any) -> None: + if hasattr(self, "__dict__"): + return self._setattr_dict(name, value) + return self._setattr_slots(name, value) + + # Overridden in Python >=3.6 by __init_subclass__ + __setattr__ = _setattr_py35 def __dir__(self) -> List[str]: """Provide method name lookup and completion. Only provide 'public' @@ -283,6 +340,8 @@ def get_squeeze_dims( class DataWithCoords(SupportsArithmetic, AttrAccessMixin): """Shared base class for Dataset and DataArray.""" + __slots__ = () + _rolling_exp_cls = RollingExp def squeeze( diff --git a/xarray/core/computation.py b/xarray/core/computation.py index cb3a0d5db7d..424ab5be87a 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -7,6 +7,7 @@ from collections import Counter, OrderedDict from distutils.version import LooseVersion from typing import ( + TYPE_CHECKING, AbstractSet, Any, Callable, @@ -17,7 +18,6 @@ Sequence, Tuple, Union, - TYPE_CHECKING, ) import numpy as np @@ -51,6 +51,14 @@ class _UFuncSignature: Core dimension names on each output variable. 
""" + __slots__ = ( + "input_core_dims", + "output_core_dims", + "_all_input_core_dims", + "_all_output_core_dims", + "_all_core_dims", + ) + def __init__(self, input_core_dims, output_core_dims=((),)): self.input_core_dims = tuple(tuple(a) for a in input_core_dims) self.output_core_dims = tuple(tuple(a) for a in output_core_dims) @@ -502,9 +510,10 @@ def broadcast_compat_data(variable, broadcast_dims, core_dims): missing_core_dims = [d for d in core_dims if d not in set_old_dims] if missing_core_dims: raise ValueError( - "operand to apply_ufunc has required core dimensions %r, but " - "some of these are missing on the input variable: %r" - % (list(core_dims), missing_core_dims) + "operand to apply_ufunc has required core dimensions {}, but " + "some of these dimensions are absent on an input variable: {}".format( + list(core_dims), missing_core_dims + ) ) set_new_dims = set(new_dims) @@ -648,7 +657,6 @@ def func(*arrays): def _apply_blockwise( func, args, input_dims, output_dims, signature, output_dtypes, output_sizes=None ): - import dask.array as da from .dask_array_compat import blockwise if signature.num_outputs > 1: diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 014b615f2a7..e68c247d880 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,24 +1,21 @@ -import warnings from collections import OrderedDict import pandas as pd -from . import utils, dtypes +from . import dtypes, utils from .alignment import align +from .merge import unique_variable, _VALID_COMPAT from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars def concat( objs, - dim=None, + dim, data_vars="all", coords="different", compat="equals", positions=None, - indexers=None, - mode=None, - concat_over=None, fill_value=dtypes.NA, join="outer", ): @@ -63,12 +60,19 @@ def concat( those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition to the 'minimal' coordinates. - compat : {'equals', 'identical'}, optional - String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. 'equals' means - that all variable values and dimensions must be the same; - 'identical' means that variable attributes and global attributes - must also be equal. + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional + String indicating how to compare non-concatenated variables of the same name for + potential conflicts. This is passed down to merge. + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + - 'override': skip comparing and pick variable from first dataset positions : None or list of integer arrays, optional List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. 
If not @@ -111,36 +115,10 @@ def concat( except StopIteration: raise ValueError("must supply at least one object to concatenate") - if dim is None: - warnings.warn( - "the `dim` argument to `concat` will be required " - "in a future version of xarray; for now, setting it to " - "the old default of 'concat_dim'", - FutureWarning, - stacklevel=2, - ) - dim = "concat_dims" - - if indexers is not None: # pragma: no cover - warnings.warn( - "indexers has been renamed to positions; the alias " - "will be removed in a future version of xarray", - FutureWarning, - stacklevel=2, - ) - positions = indexers - - if mode is not None: - raise ValueError( - "`mode` is no longer a valid argument to " - "xarray.concat; it has been split into the " - "`data_vars` and `coords` arguments" - ) - if concat_over is not None: + if compat not in _VALID_COMPAT: raise ValueError( - "`concat_over` is no longer a valid argument to " - "xarray.concat; it has been split into the " - "`data_vars` and `coords` arguments" + "compat=%r invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'" + % compat ) if isinstance(first_obj, DataArray): @@ -179,23 +157,39 @@ def _calc_concat_dim_coord(dim): return dim, coord -def _calc_concat_over(datasets, dim, data_vars, coords): +def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): """ Determine which dataset variables need to be concatenated in the result, - and which can simply be taken from the first dataset. """ # Return values concat_over = set() equals = {} - if dim in datasets[0]: + if dim in dim_names: + concat_over_existing_dim = True concat_over.add(dim) + else: + concat_over_existing_dim = False + + concat_dim_lengths = [] for ds in datasets: + if concat_over_existing_dim: + if dim not in ds.dims: + if dim in ds: + ds = ds.set_coords(dim) + else: + raise ValueError("%r is not present in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) + concat_dim_lengths.append(ds.dims.get(dim, 1)) def process_subset_opt(opt, subset): if isinstance(opt, str): if opt == "different": + if compat == "override": + raise ValueError( + "Cannot specify both %s='different' and compat='override'." 
+ % subset + ) # all nonindexes that are not the same in each dataset for k in getattr(datasets[0], subset): if k not in concat_over: @@ -209,7 +203,7 @@ def process_subset_opt(opt, subset): for ds_rhs in datasets[1:]: v_rhs = ds_rhs.variables[k].compute() computed.append(v_rhs) - if not v_lhs.equals(v_rhs): + if not getattr(v_lhs, compat)(v_rhs): concat_over.add(k) equals[k] = False # computed variables are not to be re-computed @@ -245,7 +239,29 @@ def process_subset_opt(opt, subset): process_subset_opt(data_vars, "data_vars") process_subset_opt(coords, "coords") - return concat_over, equals + return concat_over, equals, concat_dim_lengths + + +# determine dimensional coordinate names and a dict mapping name to DataArray +def _parse_datasets(datasets): + + dims = set() + all_coord_names = set() + data_vars = set() # list of data_vars + dim_coords = dict() # maps dim name to variable + dims_sizes = {} # shared dimension sizes to expand variables + + for ds in datasets: + dims_sizes.update(ds.dims) + all_coord_names.update(ds.coords) + data_vars.update(ds.data_vars) + + for dim in set(ds.dims) - dims: + if dim not in dim_coords: + dim_coords[dim] = ds.coords[dim].variable + dims = dims | set(ds.dims) + + return dim_coords, dims_sizes, all_coord_names, data_vars def _dataset_concat( @@ -263,11 +279,6 @@ def _dataset_concat( """ from .dataset import Dataset - if compat not in ["equals", "identical"]: - raise ValueError( - "compat=%r invalid: must be 'equals' " "or 'identical'" % compat - ) - dim, coord = _calc_concat_dim_coord(dim) # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] @@ -275,62 +286,65 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) + dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets) + dim_names = set(dim_coords) + unlabeled_dims = dim_names - coord_names - def insert_result_variable(k, v): - assert isinstance(v, Variable) - if k in datasets[0].coords: - result_coord_names.add(k) - result_vars[k] = v + both_data_and_coords = coord_names & data_names + if both_data_and_coords: + raise ValueError( + "%r is a coordinate in some datasets but not others." % both_data_and_coords + ) + # we don't want the concat dimension in the result dataset yet + dim_coords.pop(dim, None) + dims_sizes.pop(dim, None) + + # case where concat dimension is a coordinate or data_var but not a dimension + if (dim in coord_names or dim in data_names) and dim not in dim_names: + datasets = [ds.expand_dims(dim) for ds in datasets] + + # determine which variables to concatentate + concat_over, equals, concat_dim_lengths = _calc_concat_over( + datasets, dim, dim_names, data_vars, coords, compat + ) + + # determine which variables to merge, and then merge them according to compat + variables_to_merge = (coord_names | data_names) - concat_over - dim_names + + result_vars = {} + if variables_to_merge: + to_merge = {var: [] for var in variables_to_merge} + + for ds in datasets: + absent_merge_vars = variables_to_merge - set(ds.variables) + if absent_merge_vars: + raise ValueError( + "variables %r are present in some datasets but not others. 
" + % absent_merge_vars + ) - # create the new dataset and add constant variables - result_vars = OrderedDict() - result_coord_names = set(datasets[0].coords) + for var in variables_to_merge: + to_merge[var].append(ds.variables[var]) + + for var in variables_to_merge: + result_vars[var] = unique_variable( + var, to_merge[var], compat=compat, equals=equals.get(var, None) + ) + else: + result_vars = OrderedDict() + result_vars.update(dim_coords) + + # assign attrs and encoding from first dataset result_attrs = datasets[0].attrs result_encoding = datasets[0].encoding - for k, v in datasets[0].variables.items(): - if k not in concat_over: - insert_result_variable(k, v) - - # check that global attributes and non-concatenated variables are fixed - # across all datasets + # check that global attributes are fixed across all datasets if necessary for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): - raise ValueError("dataset global attributes not equal") - for k, v in ds.variables.items(): - if k not in result_vars and k not in concat_over: - raise ValueError("encountered unexpected variable %r" % k) - elif (k in result_coord_names) != (k in ds.coords): - raise ValueError( - "%r is a coordinate in some datasets but not " "others" % k - ) - elif k in result_vars and k != dim: - # Don't use Variable.identical as it internally invokes - # Variable.equals, and we may already know the answer - if compat == "identical" and not utils.dict_equiv( - v.attrs, result_vars[k].attrs - ): - raise ValueError("variable %s not identical across datasets" % k) - - # Proceed with equals() - try: - # May be populated when using the "different" method - is_equal = equals[k] - except KeyError: - result_vars[k].load() - is_equal = v.equals(result_vars[k]) - if not is_equal: - raise ValueError("variable %s not equal across datasets" % k) + raise ValueError("Dataset global attributes not equal.") # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] - non_concat_dims = {} - for ds in datasets: - non_concat_dims.update(ds.dims) - non_concat_dims.pop(dim, None) - def ensure_common_dims(vars): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the @@ -338,25 +352,27 @@ def ensure_common_dims(vars): common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) if dim not in common_dims: common_dims = (dim,) + common_dims - for var, dim_len in zip(vars, dim_lengths): + for var, dim_len in zip(vars, concat_dim_lengths): if var.dims != common_dims: - common_shape = tuple( - non_concat_dims.get(d, dim_len) for d in common_dims - ) + common_shape = tuple(dims_sizes.get(d, dim_len) for d in common_dims) var = var.set_dims(common_dims, common_shape) yield var # stack up each variable to fill-out the dataset (in order) + # n.b. this loop preserves variable order, needed for groupby. 
for k in datasets[0].variables: if k in concat_over: vars = ensure_common_dims([ds.variables[k] for ds in datasets]) combined = concat_vars(vars, dim, positions) - insert_result_variable(k, combined) + assert isinstance(combined, Variable) + result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) + result = result.set_coords(coord_names) result.encoding = result_encoding + result = result.drop(unlabeled_dims, errors="ignore") + if coord is not None: # add concat dimension last to ensure that its in the final Dataset result[coord.name] = coord @@ -378,7 +394,7 @@ def _dataarray_concat( if data_vars != "all": raise ValueError( - "data_vars is not a valid argument when " "concatenating DataArray objects" + "data_vars is not a valid argument when concatenating DataArray objects" ) datasets = [] diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 562d30dd6c7..ddea5739fff 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -4,12 +4,12 @@ TYPE_CHECKING, Any, Hashable, - Mapping, Iterator, - Union, + Mapping, + Sequence, Set, Tuple, - Sequence, + Union, cast, ) @@ -35,7 +35,7 @@ class AbstractCoordinates(Mapping[Hashable, "DataArray"]): - _data = None # type: Union["DataArray", "Dataset"] + __slots__ = () def __getitem__(self, key: Hashable) -> "DataArray": raise NotImplementedError() @@ -53,7 +53,7 @@ def dims(self) -> Union[Mapping[Hashable, int], Tuple[Hashable, ...]]: @property def indexes(self) -> Indexes: - return self._data.indexes + return self._data.indexes # type: ignore @property def variables(self): @@ -108,9 +108,9 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: raise ValueError("no valid index for a 0-dimensional object") elif len(ordered_dims) == 1: (dim,) = ordered_dims - return self._data.get_index(dim) + return self._data.get_index(dim) # type: ignore else: - indexes = [self._data.get_index(k) for k in ordered_dims] + indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore names = list(ordered_dims) return pd.MultiIndex.from_product(indexes, names=names) @@ -187,7 +187,7 @@ class DatasetCoordinates(AbstractCoordinates): objects. """ - _data = None # type: Dataset + __slots__ = ("_data",) def __init__(self, dataset: "Dataset"): self._data = dataset @@ -258,7 +258,7 @@ class DataArrayCoordinates(AbstractCoordinates): dimensions and the values given by corresponding DataArray objects. """ - _data = None # type: DataArray + __slots__ = ("_data",) def __init__(self, dataarray: "DataArray"): self._data = dataarray @@ -314,6 +314,8 @@ class LevelCoordinatesSource(Mapping[Hashable, Any]): by any public methods. 
""" + __slots__ = ("_data",) + def __init__(self, data_object: "Union[DataArray, Dataset]"): self._data = data_object diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py index 5d4ff849b57..fe2cdc5c553 100644 --- a/xarray/core/dask_array_compat.py +++ b/xarray/core/dask_array_compat.py @@ -4,7 +4,6 @@ import numpy as np from dask import __version__ as dask_version - try: blockwise = da.blockwise except AttributeError: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 52c11429e2b..e63b6c9975f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4,6 +4,7 @@ from collections import OrderedDict from numbers import Number from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -17,7 +18,6 @@ Union, cast, overload, - TYPE_CHECKING, ) import numpy as np @@ -38,9 +38,9 @@ from .accessor_dt import DatetimeAccessor from .accessor_str import StringAccessor from .alignment import ( - align, _broadcast_helper, _get_broadcast_dims_map_common_coords, + align, reindex_like_indexers, ) from .common import AbstractArray, DataWithCoords @@ -54,7 +54,7 @@ from .formatting import format_item from .indexes import Indexes, default_indexes from .options import OPTIONS -from .utils import _check_inplace, either_dict_or_kwargs, ReprObject +from .utils import ReprObject, _check_inplace, either_dict_or_kwargs from .variable import ( IndexVariable, Variable, @@ -113,6 +113,11 @@ def _infer_coords_and_dims( coord = as_variable(coord, name=dims[n]).to_index_variable() dims[n] = coord.name dims = tuple(dims) + elif len(dims) != len(shape): + raise ValueError( + "different number of dimensions on data " + "and dims: %s vs %s" % (len(shape), len(dims)) + ) else: for d in dims: if not isinstance(d, str): @@ -158,7 +163,27 @@ def _infer_coords_and_dims( return new_coords, dims +def _check_data_shape(data, coords, dims): + if data is dtypes.NA: + data = np.nan + if coords is not None and utils.is_scalar(data, include_0d=False): + if utils.is_dict_like(coords): + if dims is None: + return data + else: + data_shape = tuple( + as_variable(coords[k], k).size if k in coords.keys() else 1 + for k in dims + ) + else: + data_shape = tuple(as_variable(coord, "foo").size for coord in coords) + data = np.full(data_shape, data) + return data + + class _LocIndexer: + __slots__ = ("data_array",) + def __init__(self, data_array: "DataArray"): self.data_array = data_array @@ -223,6 +248,16 @@ class DataArray(AbstractArray, DataWithCoords): Dictionary for holding arbitrary metadata. """ + __slots__ = ( + "_accessors", + "_coords", + "_file_obj", + "_name", + "_indexes", + "_variable", + "__weakref__", + ) + _groupby_cls = groupby.DataArrayGroupBy _rolling_cls = rolling.DataArrayRolling _coarsen_cls = rolling.DataArrayCoarsen @@ -234,7 +269,7 @@ class DataArray(AbstractArray, DataWithCoords): def __init__( self, - data: Any, + data: Any = dtypes.NA, coords: Union[Sequence[Tuple], Mapping[Hashable, Any], None] = None, dims: Union[Hashable, Sequence[Hashable], None] = None, name: Hashable = None, @@ -288,7 +323,7 @@ def __init__( if encoding is not None: warnings.warn( "The `encoding` argument to `DataArray` is deprecated, and . " - "will be removed in 0.13. " + "will be removed in 0.14. 
" "Instead, specify the encoding when writing to disk or " "set the `encoding` attribute directly.", FutureWarning, @@ -323,6 +358,7 @@ def __init__( if encoding is None: encoding = getattr(data, "encoding", None) + data = _check_data_shape(data, coords, dims) data = as_compatible_data(data) coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, encoding, fastpath=True) @@ -332,6 +368,7 @@ def __init__( assert isinstance(coords, OrderedDict) self._coords = coords # type: OrderedDict[Any, Variable] self._name = name # type: Optional[Hashable] + self._accessors = None # type: Optional[Dict[str, Any]] # TODO(shoyer): document this argument, once it becomes part of the # public interface. @@ -339,8 +376,6 @@ def __init__( self._file_obj = None - self._initialized = True # type: bool - def _replace( self, variable: Variable = None, @@ -444,7 +479,7 @@ def _to_dataset_whole( dataset = Dataset._from_vars_and_coord_names(variables, coord_names) return dataset - def to_dataset(self, dim: Hashable = None, name: Hashable = None) -> Dataset: + def to_dataset(self, dim: Hashable = None, *, name: Hashable = None) -> Dataset: """Convert a DataArray to a Dataset. Parameters @@ -462,15 +497,9 @@ def to_dataset(self, dim: Hashable = None, name: Hashable = None) -> Dataset: dataset : Dataset """ if dim is not None and dim not in self.dims: - warnings.warn( - "the order of the arguments on DataArray.to_dataset " - "has changed; you now need to supply ``name`` as " - "a keyword argument", - FutureWarning, - stacklevel=2, + raise TypeError( + "{} is not a dim. If supplying a ``name``, pass as a kwarg.".format(dim) ) - name = dim - dim = None if dim is not None: if name is not None: @@ -700,34 +729,21 @@ def reset_coords( drop : bool, optional If True, remove coordinates instead of converting them into variables. - inplace : bool, optional - If True, modify this object in place. Otherwise, create a new - object. Returns ------- - Dataset, or DataArray if ``drop == True``, or None if - ``inplace == True`` + Dataset, or DataArray if ``drop == True`` """ - inplace = _check_inplace(inplace) - if inplace and not drop: - raise ValueError( - "cannot reset coordinates in-place on a " - "DataArray without ``drop == True``" - ) + _check_inplace(inplace) if names is None: names = set(self.coords) - set(self.dims) dataset = self.coords.to_dataset().reset_coords(names, drop) if drop: - if inplace: - self._coords = dataset._variables - return None - else: - return self._replace(coords=dataset._variables) + return self._replace(coords=dataset._variables) else: if self.name is None: raise ValueError( - "cannot reset_coords with drop=False " "on an unnamed DataArrray" + "cannot reset_coords with drop=False on an unnamed DataArrray" ) dataset[self.name] = self.variable return dataset @@ -1026,30 +1042,55 @@ def sel( ) return self._from_temp_dataset(ds) - def isel_points(self, dim="points", **indexers) -> "DataArray": - """Return a new DataArray whose data is given by pointwise integer - indexing along the specified dimension(s). + def head( + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any + ) -> "DataArray": + """Return a new DataArray whose data is given by the the first `n` + values along the specified dimension(s). 
Default `n` = 5 See Also -------- - Dataset.isel_points + Dataset.head + DataArray.tail + DataArray.thin """ - ds = self._to_temp_dataset().isel_points(dim=dim, **indexers) + ds = self._to_temp_dataset().head(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) - def sel_points( - self, dim="points", method=None, tolerance=None, **indexers + def tail( + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any ) -> "DataArray": - """Return a new DataArray whose dataset is given by pointwise selection - of index labels along the specified dimension(s). + """Return a new DataArray whose data is given by the the last `n` + values along the specified dimension(s). Default `n` = 5 See Also -------- - Dataset.sel_points + Dataset.tail + DataArray.head + DataArray.thin """ - ds = self._to_temp_dataset().sel_points( - dim=dim, method=method, tolerance=tolerance, **indexers - ) + ds = self._to_temp_dataset().tail(indexers, **indexers_kwargs) + return self._from_temp_dataset(ds) + + def thin( + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any + ) -> "DataArray": + """Return a new DataArray whose data is given by each `n` value + along the specified dimension(s). Default `n` = 5 + + See Also + -------- + Dataset.thin + DataArray.head + DataArray.tail + """ + ds = self._to_temp_dataset().thin(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) def broadcast_like( @@ -1412,9 +1453,26 @@ def swap_dims(self, dims_dict: Mapping[Hashable, Hashable]) -> "DataArray": Returns ------- - renamed : Dataset + swapped : DataArray DataArray with swapped dimensions. + Examples + -------- + >>> arr = xr.DataArray(data=[0, 1], dims="x", + coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + >>> arr + + array([0, 1]) + Coordinates: + * x (x) >> arr.swap_dims({"x": "y"}) + + array([0, 1]) + Coordinates: + x (y) >> arr = xr.DataArray(data=np.ones((2, 3)), ... dims=['x', 'y'], ... coords={'x': @@ -1552,14 +1604,10 @@ def set_index( -------- DataArray.reset_index """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) indexes = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") coords, _ = merge_indexes(indexes, self._coords, set(), append=append) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def reset_index( self, @@ -1577,36 +1625,29 @@ def reset_index( drop : bool, optional If True, remove the specified indexes and/or multi-index levels instead of extracting them as new coordinates (default: False). - inplace : bool, optional - If True, modify the dataarray in-place. Otherwise, return a new - DataArray object. Returns ------- obj : DataArray Another dataarray, with this dataarray's data but replaced - coordinates. If ``inplace == True``, return None. + coordinates. See Also -------- DataArray.set_index """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) coords, _ = split_indexes( dims_or_levels, self._coords, set(), self._level_coords, drop=drop ) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def reorder_levels( self, dim_order: Mapping[Hashable, Sequence[int]] = None, inplace: bool = None, **dim_order_kwargs: Sequence[int] - ) -> Optional["DataArray"]: + ) -> "DataArray": """Rearrange index levels using input order. 
Parameters @@ -1615,9 +1656,6 @@ def reorder_levels( Mapping from names matching dimensions and values given by lists representing new level orders. Every given dimension must have a multi-index. - inplace : bool, optional - If True, modify the dataarray in-place. Otherwise, return a new - DataArray object. **dim_order_kwargs: optional The keyword arguments form of ``dim_order``. One of dim_order or dim_order_kwargs must be provided. @@ -1626,9 +1664,9 @@ def reorder_levels( ------- obj : DataArray Another dataarray, with this dataarray's data but replaced - coordinates. If ``inplace == True``, return None. + coordinates. """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) dim_order = either_dict_or_kwargs(dim_order, dim_order_kwargs, "reorder_levels") replace_coords = {} for dim, order in dim_order.items(): @@ -1639,11 +1677,7 @@ def reorder_levels( replace_coords[dim] = IndexVariable(coord.dims, index.reorder_levels(order)) coords = self._coords.copy() coords.update(replace_coords) - if inplace: - self._coords = coords - return None - else: - return self._replace(coords=coords) + return self._replace(coords=coords) def stack( self, @@ -2319,19 +2353,27 @@ def from_dict(cls, d: dict) -> "DataArray": return obj @classmethod - def from_series(cls, series: pd.Series) -> "DataArray": + def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray": """Convert a pandas.Series into an xarray.DataArray. If the series's index is a MultiIndex, it will be expanded into a tensor product of one-dimensional coordinates (filling in missing values with NaN). Thus this operation should be the inverse of the `to_series` method. + + If sparse=True, creates a sparse array instead of a dense NumPy array. + Requires the pydata/sparse package. + + See also + -------- + xarray.Dataset.from_dataframe """ - # TODO: add a 'name' parameter - name = series.name - df = pd.DataFrame({name: series}) - ds = Dataset.from_dataframe(df) - return ds[name] + temp_name = "__temporary_name" + df = pd.DataFrame({temp_name: series}) + ds = Dataset.from_dataframe(df, sparse=sparse) + result = cast(DataArray, ds[temp_name]) + result.name = series.name + return result def to_cdms2(self) -> "cdms2_Variable": """Convert this array into a cdms2.Variable @@ -2516,10 +2558,12 @@ def plot(self) -> _PlotMethods: >>> d = DataArray([[1, 2], [3, 4]]) For convenience just call this directly + >>> d.plot() Or use it as a namespace to use xarray.plot functions as DataArray methods + >>> d.plot.imshow() # equivalent to xarray.plot.imshow(d) """ @@ -2746,7 +2790,7 @@ def dot( """ if isinstance(other, Dataset): raise NotImplementedError( - "dot products are not yet supported " "with Dataset objects." + "dot products are not yet supported with Dataset objects." ) if not isinstance(other, DataArray): raise TypeError("dot only operates on DataArrays.") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 076b97e8623..ea087ce3ce1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7,6 +7,7 @@ from numbers import Number from pathlib import Path from typing import ( + TYPE_CHECKING, Any, Callable, DefaultDict, @@ -24,28 +25,28 @@ Union, cast, overload, - TYPE_CHECKING, ) import numpy as np import pandas as pd + import xarray as xr from ..coding.cftimeindex import _parse_array_of_cftime_strings +from ..plot.dataset_plot import _Dataset_PlotMethods from . 
import ( alignment, dtypes, duck_array_ops, formatting, groupby, - indexing, ops, pdcompat, resample, rolling, utils, ) -from .alignment import align, _broadcast_helper, _get_broadcast_dims_map_common_coords +from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align from .common import ( ALL_DIMS, DataWithCoords, @@ -54,7 +55,6 @@ ) from .coordinates import ( DatasetCoordinates, - DataArrayCoordinates, LevelCoordinatesSource, assert_coordinate_consistent, remap_label_indexers, @@ -77,9 +77,10 @@ either_dict_or_kwargs, hashable, maybe_wrap_array, + is_dict_like, + is_list_like, ) from .variable import IndexVariable, Variable, as_variable, broadcast_variables -from ..plot.dataset_plot import _Dataset_PlotMethods if TYPE_CHECKING: from ..backends import AbstractDataStore, ZarrStore @@ -344,6 +345,8 @@ def as_dataset(obj: Any) -> "Dataset": class DataVariables(Mapping[Hashable, "DataArray"]): + __slots__ = ("_dataset",) + def __init__(self, dataset: "Dataset"): self._dataset = dataset @@ -383,6 +386,8 @@ def _ipython_key_completions_(self): class _LocIndexer: + __slots__ = ("dataset",) + def __init__(self, dataset: "Dataset"): self.dataset = dataset @@ -406,6 +411,18 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords): coordinates used for label based indexing. """ + __slots__ = ( + "_accessors", + "_attrs", + "_coord_names", + "_dims", + "_encoding", + "_file_obj", + "_indexes", + "_variables", + "__weakref__", + ) + _groupby_cls = groupby.DatasetGroupBy _rolling_cls = rolling.DatasetRolling _coarsen_cls = rolling.DatasetCoarsen @@ -473,7 +490,7 @@ def __init__( if compat is not None: warnings.warn( "The `compat` argument to Dataset is deprecated and will be " - "removed in 0.13." + "removed in 0.14." "Instead, use `merge` to control how variables are combined", FutureWarning, stacklevel=2, @@ -484,6 +501,7 @@ def __init__( self._variables = OrderedDict() # type: OrderedDict[Any, Variable] self._coord_names = set() # type: Set[Hashable] self._dims = {} # type: Dict[Any, int] + self._accessors = None # type: Optional[Dict[str, Any]] self._attrs = None # type: Optional[OrderedDict] self._file_obj = None if data_vars is None: @@ -499,7 +517,6 @@ def __init__( self._attrs = OrderedDict(attrs) self._encoding = None # type: Optional[Dict] - self._initialized = True def _set_init_vars_and_dims(self, data_vars, coords, compat): """Set the initial value of Dataset variables and dimensions @@ -838,7 +855,7 @@ def _construct_direct( obj._attrs = attrs obj._file_obj = file_obj obj._encoding = encoding - obj._initialized = True + obj._accessors = None return obj __default = object() @@ -1213,12 +1230,13 @@ def loc(self) -> _LocIndexer: """ return _LocIndexer(self) - def __getitem__(self, key: object) -> "Union[DataArray, Dataset]": + def __getitem__(self, key: Any) -> "Union[DataArray, Dataset]": """Access variables or coordinates this dataset as a :py:class:`~xarray.DataArray`. Indexing with a list of names will return a new ``Dataset`` object. """ + # TODO(shoyer): type this properly: https://github.com/python/mypy/issues/7328 if utils.is_dict_like(key): return self.isel(**cast(Mapping, key)) @@ -1353,9 +1371,6 @@ def set_coords( ---------- names : hashable or iterable of hashables Name(s) of variables in this dataset to convert into coordinates. - inplace : bool, optional - If True, modify this dataset inplace. Otherwise, create a new - object. Returns ------- @@ -1369,13 +1384,13 @@ def set_coords( # DataFrame.set_index? # nb. 
check in self._variables, not self.data_vars to insure that the # operation is idempotent - inplace = _check_inplace(inplace) + _check_inplace(inplace) if isinstance(names, str) or not isinstance(names, Iterable): names = [names] else: names = list(names) self._assert_all_in_dataset(names) - obj = self if inplace else self.copy() + obj = self.copy() obj._coord_names.update(names) return obj @@ -1395,15 +1410,12 @@ def reset_coords( drop : bool, optional If True, remove coordinates instead of converting them into variables. - inplace : bool, optional - If True, modify this dataset inplace. Otherwise, create a new - object. Returns ------- Dataset """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) if names is None: names = self._coord_names - set(self.dims) else: @@ -1417,7 +1429,7 @@ def reset_coords( raise ValueError( "cannot remove index coordinates with reset_coords: %s" % bad_coords ) - obj = self if inplace else self.copy() + obj = self.copy() obj._coord_names.difference_update(names) if drop: for name in names: @@ -1769,7 +1781,7 @@ def _validate_indexers( elif isinstance(v, Dataset): raise TypeError("cannot use a Dataset as an indexer") elif isinstance(v, Sequence) and len(v) == 0: - v = IndexVariable((k,), np.zeros((0,), dtype="int64")) + v = Variable((k,), np.zeros((0,), dtype="int64")) else: v = np.asarray(v) @@ -1783,16 +1795,13 @@ def _validate_indexers( if v.ndim == 0: v = Variable((), v) elif v.ndim == 1: - v = IndexVariable((k,), v) + v = Variable((k,), v) else: raise IndexError( "Unlabeled multi-dimensional array cannot be " "used for indexing: {}".format(k) ) - if v.ndim == 1: - v = v.to_index_variable() - indexers_list.append((k, v)) return indexers_list @@ -1997,213 +2006,152 @@ def sel( result = self.isel(indexers=pos_indexers, drop=drop) return result._overwrite_indexes(new_indexes) - def isel_points(self, dim: Any = "points", **indexers: Any) -> "Dataset": - """Returns a new dataset with each array indexed pointwise along the - specified dimension(s). - - This method selects pointwise values from each array and is akin to - the NumPy indexing behavior of `arr[[0, 1], [0, 1]]`, except this - method does not require knowing the order of each array's dimensions. + def head( + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with the first `n` values of each array + for the specified dimension(s). Parameters ---------- - dim : hashable or DataArray or pandas.Index or other list-like object, - optional - Name of the dimension to concatenate along. If dim is provided as a - hashable, it must be a new dimension name, in which case it is added - along axis=0. If dim is provided as a DataArray or Index or - list-like object, its name, which must not be present in the - dataset, is used as the dimension to concatenate along and the - values are added as a coordinate. - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by array-like objects. All indexers must be the same length and - 1 dimensional. + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. 
- Returns - ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - array and dimension is indexed by the appropriate indexers. With - pointwise indexing, the new Dataset will always be a copy of the - original. See Also -------- - Dataset.sel - Dataset.isel - Dataset.sel_points - DataArray.isel_points - """ # noqa - warnings.warn( - "Dataset.isel_points is deprecated: use Dataset.isel()" "instead.", - DeprecationWarning, - stacklevel=2, - ) - - indexer_dims = set(indexers) - - def take(variable, slices): - # Note: remove helper function when once when numpy - # supports vindex https://github.com/numpy/numpy/pull/6075 - if hasattr(variable.data, "vindex"): - # Special case for dask backed arrays to use vectorised list - # indexing - sel = variable.data.vindex[slices] - else: - # Otherwise assume backend is numpy array with 'fancy' indexing - sel = variable.data[slices] - return sel - - def relevant_keys(mapping): - return [ - k for k, v in mapping.items() if any(d in indexer_dims for d in v.dims) - ] - - coords = relevant_keys(self.coords) - indexers = {k: np.asarray(v) for k, v in indexers.items()} - non_indexed_dims = set(self.dims) - indexer_dims - non_indexed_coords = set(self.coords) - set(coords) - - # All the indexers should be iterables - # Check that indexers are valid dims, integers, and 1D + Dataset.tail + Dataset.thin + DataArray.head + """ + if not indexers_kwargs: + if indexers is None: + indexers = 5 + if not isinstance(indexers, int) and not is_dict_like(indexers): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "head") for k, v in indexers.items(): - if k not in self.dims: - raise ValueError("dimension %s does not exist" % k) - if v.dtype.kind != "i": # type: ignore - raise TypeError("Indexers must be integers") - if v.ndim != 1: # type: ignore - raise ValueError("Indexers must be 1 dimensional") - - # all the indexers should have the same length - lengths = {len(v) for k, v in indexers.items()} - if len(lengths) > 1: - raise ValueError("All indexers must be the same length") - - # Existing dimensions are not valid choices for the dim argument - if isinstance(dim, str): - if dim in self.dims: - # dim is an invalid string - raise ValueError( - "Existing dimension names are not valid " - "choices for the dim argument in sel_points" + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) ) - - elif hasattr(dim, "dims"): - # dim is a DataArray or Coordinate - if dim.name in self.dims: - # dim already exists + elif v < 0: raise ValueError( - "Existing dimensions are not valid choices " - "for the dim argument in sel_points" + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) ) + indexers_slices = {k: slice(val) for k, val in indexers.items()} + return self.isel(indexers_slices) - # Set the new dim_name, and optionally the new dim coordinate - # dim is either an array-like or a string - if not utils.is_scalar(dim): - # dim is array like get name or assign 'points', get as variable - dim_name = "points" if not hasattr(dim, "name") else dim.name - dim_coord = as_variable(dim, name=dim_name) - else: - # dim is a string - dim_name = dim - dim_coord = None # type: ignore - - reordered = self.transpose(*list(indexer_dims), *list(non_indexed_dims)) - - variables = OrderedDict() # 
type: ignore - - for name, var in reordered.variables.items(): - if name in indexers or any(d in indexer_dims for d in var.dims): - # slice if var is an indexer or depends on an indexed dim - slc = [indexers.get(k, slice(None)) for k in var.dims] + def tail( + self, + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with the last `n` values of each array + for the specified dimension(s). - var_dims = [dim_name] + [d for d in var.dims if d in non_indexed_dims] - selection = take(var, tuple(slc)) - var_subset = type(var)(var_dims, selection, var.attrs) - variables[name] = var_subset - else: - # If not indexed just add it back to variables or coordinates - variables[name] = var + Parameters + ---------- + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. - coord_names = (set(coords) & set(variables)) | non_indexed_coords - dset = self._replace_vars_and_dims(variables, coord_names=coord_names) - # Add the dim coord to the new dset. Must be done after creation - # because_replace_vars_and_dims can only access existing coords, - # not add new ones - if dim_coord is not None: - dset.coords[dim_name] = dim_coord - return dset + See Also + -------- + Dataset.head + Dataset.thin + DataArray.tail + """ + if not indexers_kwargs: + if indexers is None: + indexers = 5 + if not isinstance(indexers, int) and not is_dict_like(indexers): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "tail") + for k, v in indexers.items(): + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) + ) + elif v < 0: + raise ValueError( + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) + ) + indexers_slices = { + k: slice(-val, None) if val != 0 else slice(val) + for k, val in indexers.items() + } + return self.isel(indexers_slices) - def sel_points( + def thin( self, - dim: Any = "points", - method: str = None, - tolerance: Number = None, - **indexers: Any - ): - """Returns a new dataset with each array indexed pointwise by tick - labels along the specified dimension(s). - - In contrast to `Dataset.isel_points`, indexers for this method should - use labels instead of integers. - - In contrast to `Dataset.sel`, this method selects points along the - diagonal of multi-dimensional arrays, not the intersection. + indexers: Union[Mapping[Hashable, int], int] = None, + **indexers_kwargs: Any + ) -> "Dataset": + """Returns a new dataset with each array indexed along every `n`th + value for the specified dimension(s) Parameters ---------- - dim : hashable or DataArray or pandas.Index or other list-like object, - optional - Name of the dimension to concatenate along. If dim is provided as a - hashable, it must be a new dimension name, in which case it is added - along axis=0. If dim is provided as a DataArray or Index or - list-like object, its name, which must not be present in the - dataset, is used as the dimension to concatenate along and the - values are added as a coordinate. 
- method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional - Method to use for inexact matches (requires pandas>=0.16): - - * None (default): only exact matches - * pad / ffill: propagate last valid index value forward - * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations must - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by array-like objects. All indexers must be the same length and - 1 dimensional. + indexers : dict or int, default: 5 + A dict with keys matching dimensions and integer values `n` + or a single integer `n` applied over all dimensions. + One of indexers or indexers_kwargs must be provided. + **indexers_kwargs : {dim: n, ...}, optional + The keyword arguments form of ``indexers``. + One of indexers or indexers_kwargs must be provided. - Returns - ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - array and dimension is indexed by the appropriate indexers. With - pointwise indexing, the new Dataset will always be a copy of the - original. See Also -------- - Dataset.sel - Dataset.isel - Dataset.isel_points - DataArray.sel_points - """ # noqa - warnings.warn( - "Dataset.sel_points is deprecated: use Dataset.sel()" "instead.", - DeprecationWarning, - stacklevel=2, - ) - - pos_indexers, _ = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance - ) - return self.isel_points(dim=dim, **pos_indexers) + Dataset.head + Dataset.tail + DataArray.thin + """ + if ( + not indexers_kwargs + and not isinstance(indexers, int) + and not is_dict_like(indexers) + ): + raise TypeError("indexers must be either dict-like or a single integer") + if isinstance(indexers, int): + indexers = {dim: indexers for dim in self.dims} + indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "thin") + for k, v in indexers.items(): + if not isinstance(v, int): + raise TypeError( + "expected integer type indexer for " + "dimension %r, found %r" % (k, type(v)) + ) + elif v < 0: + raise ValueError( + "expected positive integer as indexer " + "for dimension %r, found %s" % (k, v) + ) + elif v == 0: + raise ValueError("step cannot be zero") + indexers_slices = {k: slice(None, None, val) for k, val in indexers.items()} + return self.isel(indexers_slices) def broadcast_like( self, other: Union["Dataset", "DataArray"], exclude: Iterable[Hashable] = None @@ -2416,7 +2364,10 @@ def interp( if kwargs is None: kwargs = {} coords = either_dict_or_kwargs(coords, coords_kwargs, "interp") - indexers = OrderedDict(self._validate_indexers(coords)) + indexers = OrderedDict( + (k, v.to_index_variable() if isinstance(v, Variable) and v.ndim == 1 else v) + for k, v in self._validate_indexers(coords) + ) obj = self if assume_sorted else self.sortby([k for k in coords]) @@ -2604,9 +2555,6 @@ def rename( name_dict : dict-like, optional Dictionary whose keys are current variable or dimension names and whose values are the desired names. - inplace : bool, optional - If True, rename variables and dimensions in-place. Otherwise, - return a new dataset object. **names, optional Keyword form of ``name_dict``. One of name_dict or names must be provided. 
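For illustration, a minimal sketch of calling the new ``head``, ``tail`` and ``thin`` methods added above (the example dataset is invented; an integer applies to every dimension, while a dict or keyword arguments select specific dimensions)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"a": (("x", "y"), np.arange(100).reshape(10, 10))},
        coords={"x": np.arange(10), "y": np.arange(10)},
    )

    ds.head(3)          # first 3 values along every dimension
    ds.tail(x=3)        # last 3 values along "x" only
    ds.thin({"x": 2})   # every 2nd value along "x"
    ds.a.head(x=5)      # the DataArray methods defer to the Dataset ones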
@@ -2623,7 +2571,7 @@ def rename( Dataset.rename_dims DataArray.rename """ - inplace = _check_inplace(inplace) + _check_inplace(inplace) name_dict = either_dict_or_kwargs(name_dict, names, "rename") for k in name_dict.keys(): if k not in self and k not in self.dims: @@ -2635,9 +2583,7 @@ def rename( variables, coord_names, dims, indexes = self._rename_all( name_dict=name_dict, dims_dict=name_dict ) - return self._replace( - variables, coord_names, dims=dims, indexes=indexes, inplace=inplace - ) + return self._replace(variables, coord_names, dims=dims, indexes=indexes) def rename_dims( self, dims_dict: Mapping[Hashable, Hashable] = None, **dims: Hashable @@ -2727,15 +2673,35 @@ def swap_dims( Dictionary whose keys are current dimension names and whose values are new names. Each value must already be a variable in the dataset. - inplace : bool, optional - If True, swap dimensions in-place. Otherwise, return a new dataset - object. Returns ------- - renamed : Dataset + swapped : Dataset Dataset with swapped dimensions. + Examples + -------- + >>> ds = xr.Dataset(data_vars={"a": ("x", [5, 7]), "b": ("x", [0.1, 2.4])}, + coords={"x": ["a", "b"], "y": ("x", [0, 1])}) + >>> ds + + Dimensions: (x: 2) + Coordinates: + * x (x) >> ds.swap_dims({"x": "y"}) + + Dimensions: (y: 2) + Coordinates: + x (y) "Dataset": - mapping {var name: (dimension name, array-like)} - mapping {var name: (tuple of dimension names, array-like)} - inplace : bool, optional - If True, merge the other dataset into this dataset in-place. - Otherwise, return a new dataset object. Returns ------- @@ -3398,12 +3346,10 @@ def update(self, other: "DatasetLike", inplace: bool = None) -> "Dataset": If any dimensions would have inconsistent sizes in the updated dataset. """ - inplace = _check_inplace(inplace, default=True) + _check_inplace(inplace) variables, coord_names, dims = dataset_update_method(self, other) - return self._replace_vars_and_dims( - variables, coord_names, dims, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names, dims, inplace=True) def merge( self, @@ -3425,9 +3371,6 @@ def merge( ---------- other : Dataset or castable to Dataset Dataset or variables to merge with this dataset. - inplace : bool, optional - If True, merge the other dataset into this dataset in-place. - Otherwise, return a new dataset object. overwrite_vars : Hashable or iterable of Hashable, optional If provided, update variables of these name(s) without checking for conflicts in this dataset. @@ -3464,7 +3407,7 @@ def merge( MergeError If any variables conflict (see ``compat``). 
""" - inplace = _check_inplace(inplace) + _check_inplace(inplace) variables, coord_names, dims = dataset_merge_method( self, other, @@ -3474,9 +3417,7 @@ def merge( fill_value=fill_value, ) - return self._replace_vars_and_dims( - variables, coord_names, dims, inplace=inplace - ) + return self._replace_vars_and_dims(variables, coord_names, dims) def _assert_all_in_dataset( self, names: Iterable[Hashable], virtual_okay: bool = False @@ -3554,9 +3495,23 @@ def drop( # noqa: F811 if errors not in ["raise", "ignore"]: raise ValueError('errors must be either "raise" or "ignore"') - labels_are_coords = isinstance(labels, DataArrayCoordinates) - if labels_kwargs or (utils.is_dict_like(labels) and not labels_are_coords): - labels_kwargs = utils.either_dict_or_kwargs(labels, labels_kwargs, "drop") + if is_dict_like(labels) and not isinstance(labels, dict): + warnings.warn( + "dropping coordinates using key values of dict-like labels is " + "deprecated; use drop_vars or a list of coordinates.", + FutureWarning, + stacklevel=2, + ) + if dim is not None and is_list_like(labels): + warnings.warn( + "dropping dimensions using list-like labels is deprecated; use " + "dict-like arguments.", + DeprecationWarning, + stacklevel=2, + ) + + if labels_kwargs or isinstance(labels, dict): + labels_kwargs = either_dict_or_kwargs(labels, labels_kwargs, "drop") if dim is not None: raise ValueError("cannot specify dim and dict-like arguments.") ds = self @@ -3570,13 +3525,6 @@ def drop( # noqa: F811 labels = set(labels) return self._drop_vars(labels, errors=errors) else: - if utils.is_list_like(labels): - warnings.warn( - "dropping dimensions using list-like labels is deprecated; " - "use dict-like arguments.", - DeprecationWarning, - stacklevel=2, - ) return self._drop_labels(labels, dim, errors=errors) def _drop_labels(self, labels=None, dim=None, errors="raise"): @@ -3951,9 +3899,7 @@ def reduce( Dataset with this object's DataArrays replaced with new DataArrays of summarized data and the indicated dimension(s) removed. """ - if dim is ALL_DIMS: - dim = None - if dim is None: + if dim is None or dim is ALL_DIMS: dims = set(self.dims) elif isinstance(dim, str) or not isinstance(dim, Iterable): dims = {dim} @@ -4155,8 +4101,61 @@ def to_dataframe(self): """ return self._to_dataframe(self.dims) + def _set_sparse_data_from_dataframe( + self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...] + ) -> None: + from sparse import COO + + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + try: + codes = idx.codes + except AttributeError: + # deprecated since pandas 0.24 + codes = idx.labels + coords = np.stack([np.asarray(code) for code in codes], axis=0) + is_sorted = idx.is_lexsorted + else: + coords = np.arange(idx.size).reshape(1, -1) + is_sorted = True + + for name, series in dataframe.items(): + # Cast to a NumPy array first, in case the Series is a pandas + # Extension array (which doesn't have a valid NumPy dtype) + values = np.asarray(series) + + # In virtually all real use cases, the sparse array will now have + # missing values and needs a fill_value. For consistency, don't + # special case the rare exceptions (e.g., dtype=int without a + # MultiIndex). 
+ dtype, fill_value = dtypes.maybe_promote(values.dtype) + values = np.asarray(values, dtype=dtype) + + data = COO( + coords, + values, + shape, + has_duplicates=False, + sorted=is_sorted, + fill_value=fill_value, + ) + self[name] = (dims, data) + + def _set_numpy_data_from_dataframe( + self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...] + ) -> None: + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + # expand the DataFrame to include the product of all levels + full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) + dataframe = dataframe.reindex(full_idx) + + for name, series in dataframe.items(): + data = np.asarray(series).reshape(shape) + self[name] = (dims, data) + @classmethod - def from_dataframe(cls, dataframe): + def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset": """Convert a pandas.DataFrame into an xarray.Dataset Each column will be converted into an independent variable in the @@ -4165,7 +4164,24 @@ def from_dataframe(cls, dataframe): values with NaN). This method will produce a Dataset very similar to that on which the 'to_dataframe' method was called, except with possibly redundant dimensions (since all dataset variables will have - the same dimensionality). + the same dimensionality) + + Parameters + ---------- + dataframe : pandas.DataFrame + DataFrame from which to copy data and indices. + sparse : bool + If true, create a sparse arrays instead of dense numpy arrays. This + can potentially save a large amount of memory if the DataFrame has + a MultiIndex. Requires the sparse package (sparse.pydata.org). + + Returns + ------- + New Dataset. + + See also + -------- + xarray.DataArray.from_series """ # TODO: Add an option to remove dimensions along which the variables # are constant, to enable consistent serialization to/from a dataframe, @@ -4178,25 +4194,23 @@ def from_dataframe(cls, dataframe): obj = cls() if isinstance(idx, pd.MultiIndex): - # it's a multi-index - # expand the DataFrame to include the product of all levels - full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) - dataframe = dataframe.reindex(full_idx) - dims = [ + dims = tuple( name if name is not None else "level_%i" % n for n, name in enumerate(idx.names) - ] + ) for dim, lev in zip(dims, idx.levels): obj[dim] = (dim, lev) - shape = [lev.size for lev in idx.levels] + shape = tuple(lev.size for lev in idx.levels) else: - dims = (idx.name if idx.name is not None else "index",) - obj[dims[0]] = (dims, idx) - shape = -1 + index_name = idx.name if idx.name is not None else "index" + dims = (index_name,) + obj[index_name] = (dims, idx) + shape = (idx.size,) - for name, series in dataframe.items(): - data = np.asarray(series).reshape(shape) - obj[name] = (dims, data) + if sparse: + obj._set_sparse_data_from_dataframe(dataframe, dims, shape) + else: + obj._set_numpy_data_from_dataframe(dataframe, dims, shape) return obj def to_dask_dataframe(self, dim_order=None, set_index=False): @@ -4811,7 +4825,7 @@ def quantile( if isinstance(dim, str): dims = {dim} - elif dim is None: + elif dim is None or dim is ALL_DIMS: dims = set(self.dims) else: dims = set(dim) @@ -4839,7 +4853,10 @@ def quantile( # the former is often more efficient reduce_dims = None variables[name] = var.quantile( - q, dim=reduce_dims, interpolation=interpolation + q, + dim=reduce_dims, + interpolation=interpolation, + keep_attrs=keep_attrs, ) else: @@ -5139,7 +5156,7 @@ def filter_by_attrs(self, **kwargs): """ # noqa selection = [] - for var_name, variable in 
self.data_vars.items(): + for var_name, variable in self.variables.items(): has_value_flag = False for attr_name, pattern in kwargs.items(): attr_value = variable.attrs.get(attr_name) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 3d7e7cc64bc..fcd0400566f 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,7 +13,7 @@ from . import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast -from .pycompat import dask_array_type, sparse_array_type +from .pycompat import dask_array_type try: import dask.array as dask_array diff --git a/xarray/core/extensions.py b/xarray/core/extensions.py index 302a7fb2ec6..f473eaa497d 100644 --- a/xarray/core/extensions.py +++ b/xarray/core/extensions.py @@ -19,6 +19,14 @@ def __get__(self, obj, cls): if obj is None: # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor + + try: + return obj._accessors[self._name] + except TypeError: + obj._accessors = {} + except KeyError: + pass + try: accessor_obj = self._accessor(obj) except AttributeError: @@ -26,11 +34,8 @@ def __get__(self, obj, cls): # raised when initializing the accessor, so we need to raise as # something else (GH933): raise RuntimeError("error initializing %r accessor." % self._name) - # Replace the property with the accessor object. Inspired by: - # http://www.pydanny.com/cached-property.html - # We need to use object.__setattr__ because we overwrite __setattr__ on - # AttrAccessMixin. - object.__setattr__(obj, self._name, accessor_obj) + + obj._accessors[self._name] = accessor_obj return accessor_obj diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 51664fb3e32..c6b2537c958 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -96,7 +96,7 @@ def last_item(array): return [] indexer = (slice(-1, None),) * array.ndim - return np.ravel(array[indexer]).tolist() + return np.ravel(np.asarray(array[indexer])).tolist() def format_timestamp(t): diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 3ed3491b582..bae3057aabe 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -5,18 +5,18 @@ import numpy as np import pandas as pd -from . import dtypes, duck_array_ops, nputils, ops, utils +from . import dtypes, duck_array_ops, nputils, ops from .arithmetic import SupportsArithmetic +from .common import ImplementsArrayReduce, ImplementsDatasetReduce from .concat import concat -from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce from .options import _get_keep_attrs from .pycompat import integer_types from .utils import ( + either_dict_or_kwargs, hashable, maybe_wrap_array, peek_at, safe_cast_to_index, - either_dict_or_kwargs, ) from .variable import IndexVariable, Variable, as_variable @@ -139,13 +139,24 @@ class _DummyGroup: Should not be user visible. 
""" + __slots__ = ("name", "coords", "size") + def __init__(self, obj, name, coords): self.name = name self.coords = coords - self.dims = (name,) - self.ndim = 1 self.size = obj.sizes[name] - self.values = range(self.size) + + @property + def dims(self): + return (self.name,) + + @property + def ndim(self): + return 1 + + @property + def values(self): + return range(self.size) def _ensure_1d(group, obj): @@ -216,6 +227,19 @@ class GroupBy(SupportsArithmetic): DataArray.groupby """ + __slots__ = ( + "_full_index", + "_inserted_dims", + "_group", + "_group_dim", + "_group_indices", + "_groups", + "_obj", + "_restore_coord_dims", + "_stacked_dim", + "_unique_coord", + ) + def __init__( self, obj, @@ -676,19 +700,8 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, DataArray.quantile """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process - if self._obj.ndim > 1: - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) + if dim is None: + dim = self._group_dim out = self.apply( self._obj.__class__.quantile, @@ -734,20 +747,6 @@ def reduce( Array with summarized data and the indicated dimension(s) removed. """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process - if self._obj.ndim > 1: - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) - if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) @@ -756,43 +755,6 @@ def reduce_array(ar): return self.apply(reduce_array, shortcut=shortcut) - # TODO remove the following class method and DEFAULT_DIMS after the - # deprecation cycle - @classmethod - def _reduce_method(cls, func, include_skipna, numeric_only): - if include_skipna: - - def wrapped_func( - self, - dim=DEFAULT_DIMS, - axis=None, - skipna=None, - keep_attrs=None, - **kwargs - ): - return self.reduce( - func, - dim, - axis, - keep_attrs=keep_attrs, - skipna=skipna, - allow_lazy=True, - **kwargs - ) - - else: - - def wrapped_func( # type: ignore - self, dim=DEFAULT_DIMS, axis=None, keep_attrs=None, **kwargs - ): - return self.reduce( - func, dim, axis, keep_attrs=keep_attrs, allow_lazy=True, **kwargs - ) - - return wrapped_func - - -DEFAULT_DIMS = utils.ReprObject("") ops.inject_reduce_methods(DataArrayGroupBy) ops.inject_binary_ops(DataArrayGroupBy) @@ -874,19 +836,7 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): Array with summarized data and the indicated dimension(s) removed. """ - if dim == DEFAULT_DIMS: - dim = ALL_DIMS - # TODO change this to dim = self._group_dim after - # the deprecation process. Do not forget to remove _reduce_method - warnings.warn( - "Default reduction dimension will be changed to the " - "grouped dimension in a future version of xarray. 
To " - "silence this warning, pass dim=xarray.ALL_DIMS " - "explicitly.", - FutureWarning, - stacklevel=2, - ) - elif dim is None: + if dim is None: dim = self._group_dim if keep_attrs is None: @@ -897,31 +847,6 @@ def reduce_dataset(ds): return self.apply(reduce_dataset) - # TODO remove the following class method and DEFAULT_DIMS after the - # deprecation cycle - @classmethod - def _reduce_method(cls, func, include_skipna, numeric_only): - if include_skipna: - - def wrapped_func(self, dim=DEFAULT_DIMS, skipna=None, **kwargs): - return self.reduce( - func, - dim, - skipna=skipna, - numeric_only=numeric_only, - allow_lazy=True, - **kwargs - ) - - else: - - def wrapped_func(self, dim=DEFAULT_DIMS, **kwargs): # type: ignore - return self.reduce( - func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs - ) - - return wrapped_func - def assign(self, **kwargs): """Assign data variables by group. diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 5917f7c7a2d..94188fabc92 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -11,6 +11,8 @@ class Indexes(collections.abc.Mapping): """Immutable proxy for Dataset or DataArrary indexes.""" + __slots__ = ("_indexes",) + def __init__(self, indexes): """Not for public consumption. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c5c3cadf7a2..c6a8f6f35e4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1,16 +1,17 @@ +import enum import functools import operator from collections import defaultdict from contextlib import suppress from datetime import timedelta -from typing import Any, Tuple, Sequence, Union +from typing import Any, Callable, Sequence, Tuple, Union import numpy as np import pandas as pd from . import duck_array_ops, nputils, utils from .npcompat import DTypeLike -from .pycompat import dask_array_type, integer_types +from .pycompat import dask_array_type, integer_types, sparse_array_type from .utils import is_dict_like, maybe_cast_to_coords_dtype @@ -327,6 +328,8 @@ class ExplicitIndexer: sub-classes BasicIndexer, OuterIndexer or VectorizedIndexer. """ + __slots__ = ("_key",) + def __init__(self, key): if type(self) is ExplicitIndexer: # noqa raise TypeError("cannot instantiate base ExplicitIndexer objects") @@ -359,6 +362,8 @@ class BasicIndexer(ExplicitIndexer): indexed with an integer are dropped from the result. """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -389,6 +394,8 @@ class OuterIndexer(ExplicitIndexer): indexing works like MATLAB/Fortran. """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -432,6 +439,8 @@ class VectorizedIndexer(ExplicitIndexer): https://github.com/numpy/numpy/pull/6256 """ + __slots__ = () + def __init__(self, key): if not isinstance(key, tuple): raise TypeError("key must be a tuple: {!r}".format(key)) @@ -468,10 +477,15 @@ def __init__(self, key): class ExplicitlyIndexed: - """Mixin to mark support for Indexer subclasses in indexing.""" + """Mixin to mark support for Indexer subclasses in indexing. 
+ """ + + __slots__ = () class ExplicitlyIndexedNDArrayMixin(utils.NDArrayMixin, ExplicitlyIndexed): + __slots__ = () + def __array__(self, dtype=None): key = BasicIndexer((slice(None),) * self.ndim) return np.asarray(self[key], dtype=dtype) @@ -480,6 +494,8 @@ def __array__(self, dtype=None): class ImplicitToExplicitIndexingAdapter(utils.NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" + __slots__ = ("array", "indexer_cls") + def __init__(self, array, indexer_cls=BasicIndexer): self.array = as_indexable(array) self.indexer_cls = indexer_cls @@ -502,6 +518,8 @@ class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make basic and outer indexing lazy. """ + __slots__ = ("array", "key") + def __init__(self, array, key=None): """ Parameters @@ -577,6 +595,8 @@ class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make vectorized indexing lazy. """ + __slots__ = ("array", "key") + def __init__(self, array, key): """ Parameters @@ -631,6 +651,8 @@ def _wrap_numpy_scalars(array): class CopyOnWriteArray(ExplicitlyIndexedNDArrayMixin): + __slots__ = ("array", "_copied") + def __init__(self, array): self.array = as_indexable(array) self._copied = False @@ -655,6 +677,8 @@ def __setitem__(self, key, value): class MemoryCachedArray(ExplicitlyIndexedNDArrayMixin): + __slots__ = ("array",) + def __init__(self, array): self.array = _wrap_numpy_scalars(as_indexable(array)) @@ -783,18 +807,24 @@ def _combine_indexers(old_key, shape, new_key): ) -class IndexingSupport: # could inherit from enum.Enum on Python 3 +@enum.unique +class IndexingSupport(enum.Enum): # for backends that support only basic indexer - BASIC = "BASIC" + BASIC = 0 # for backends that support basic / outer indexer - OUTER = "OUTER" + OUTER = 1 # for backends that support outer indexer including at most 1 vector. - OUTER_1VECTOR = "OUTER_1VECTOR" + OUTER_1VECTOR = 2 # for backends that support full vectorized indexer. - VECTORIZED = "VECTORIZED" + VECTORIZED = 3 -def explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method): +def explicit_indexing_adapter( + key: ExplicitIndexer, + shape: Tuple[int, ...], + indexing_support: IndexingSupport, + raw_indexing_method: Callable, +) -> Any: """Support explicit indexing by delegating to a raw indexing method. 
Outer and/or vectorized indexers are supported by indexing a second time @@ -824,7 +854,9 @@ def explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method) return result -def decompose_indexer(indexer, shape, indexing_support): +def decompose_indexer( + indexer: ExplicitIndexer, shape: Tuple[int, ...], indexing_support: IndexingSupport +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: if isinstance(indexer, VectorizedIndexer): return _decompose_vectorized_indexer(indexer, shape, indexing_support) if isinstance(indexer, (BasicIndexer, OuterIndexer)): @@ -848,7 +880,11 @@ def _decompose_slice(key, size): return slice(start, stop, -step), slice(None, None, -1) -def _decompose_vectorized_indexer(indexer, shape, indexing_support): +def _decompose_vectorized_indexer( + indexer: VectorizedIndexer, + shape: Tuple[int, ...], + indexing_support: IndexingSupport, +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: """ Decompose vectorized indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -884,45 +920,49 @@ def _decompose_vectorized_indexer(indexer, shape, indexing_support): if indexing_support is IndexingSupport.VECTORIZED: return indexer, BasicIndexer(()) - backend_indexer = [] - np_indexer = [] + backend_indexer_elems = [] + np_indexer_elems = [] # convert negative indices - indexer = [ + indexer_elems = [ np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k for k, s in zip(indexer.tuple, shape) ] - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, slice): # If it is a slice, then we will slice it as-is # (but make its step positive) in the backend, # and then use all of it (slice(None)) for the in-memory portion. bk_slice, np_slice = _decompose_slice(k, s) - backend_indexer.append(bk_slice) - np_indexer.append(np_slice) + backend_indexer_elems.append(bk_slice) + np_indexer_elems.append(np_slice) else: # If it is a (multidimensional) np.ndarray, just pickup the used # keys without duplication and store them as a 1d-np.ndarray. oind, vind = np.unique(k, return_inverse=True) - backend_indexer.append(oind) - np_indexer.append(vind.reshape(*k.shape)) + backend_indexer_elems.append(oind) + np_indexer_elems.append(vind.reshape(*k.shape)) - backend_indexer = OuterIndexer(tuple(backend_indexer)) - np_indexer = VectorizedIndexer(tuple(np_indexer)) + backend_indexer = OuterIndexer(tuple(backend_indexer_elems)) + np_indexer = VectorizedIndexer(tuple(np_indexer_elems)) if indexing_support is IndexingSupport.OUTER: return backend_indexer, np_indexer # If the backend does not support outer indexing, # backend_indexer (OuterIndexer) is also decomposed. 
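A minimal sketch (not part of the patch) of what the decomposition described above does for a backend that only supports basic indexing; it uses the internal xarray.core.indexing names typed in this hunk, and the example array shape and index values are invented for illustration:

import numpy as np
from xarray.core.indexing import IndexingSupport, OuterIndexer, decompose_indexer

# An outer indexer picking rows 0, 2, 2 and every column of a (5, 4) array.
indexer = OuterIndexer((np.array([0, 2, 2]), slice(None)))

# For a backend limited to basic indexing, the first (backend) indexer is made
# of slices, and the second one is applied in memory to the loaded block to
# reproduce the requested selection.
backend_indexer, np_indexer = decompose_indexer(
    indexer, shape=(5, 4), indexing_support=IndexingSupport.BASIC
)
print(backend_indexer)  # slices covering rows 0-2 and all columns
print(np_indexer)       # duplicates/reorders rows within the loaded block
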
- backend_indexer, np_indexer1 = _decompose_outer_indexer( + backend_indexer1, np_indexer1 = _decompose_outer_indexer( backend_indexer, shape, indexing_support ) np_indexer = _combine_indexers(np_indexer1, shape, np_indexer) - return backend_indexer, np_indexer + return backend_indexer1, np_indexer -def _decompose_outer_indexer(indexer, shape, indexing_support): +def _decompose_outer_indexer( + indexer: Union[BasicIndexer, OuterIndexer], + shape: Tuple[int, ...], + indexing_support: IndexingSupport, +) -> Tuple[ExplicitIndexer, ExplicitIndexer]: """ Decompose outer indexer to the successive two indexers, where the first indexer will be used to index backend arrays, while the second one @@ -930,7 +970,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): Parameters ---------- - indexer: VectorizedIndexer + indexer: OuterIndexer or BasicIndexer indexing_support: One of the entries of IndexingSupport Returns @@ -968,7 +1008,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): pos_indexer.append(k + s) else: pos_indexer.append(k) - indexer = pos_indexer + indexer_elems = pos_indexer if indexing_support is IndexingSupport.OUTER_1VECTOR: # some backends such as h5py supports only 1 vector in indexers @@ -977,11 +1017,11 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): (np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) if isinstance(k, np.ndarray) else 0 - for k in indexer + for k in indexer_elems ] array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None - for i, (k, s) in enumerate(zip(indexer, shape)): + for i, (k, s) in enumerate(zip(indexer_elems, shape)): if isinstance(k, np.ndarray) and i != array_index: # np.ndarray key is converted to slice that covers the entire # entries of this key. @@ -1002,7 +1042,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): return (OuterIndexer(tuple(backend_indexer)), OuterIndexer(tuple(np_indexer))) if indexing_support == IndexingSupport.OUTER: - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, slice): # slice: convert positive step slice for backend bk_slice, np_slice = _decompose_slice(k, s) @@ -1024,7 +1064,7 @@ def _decompose_outer_indexer(indexer, shape, indexing_support): # basic indexer assert indexing_support == IndexingSupport.BASIC - for k, s in zip(indexer, shape): + for k, s in zip(indexer_elems, shape): if isinstance(k, np.ndarray): # np.ndarray key is converted to slice that covers the entire # entries of this key. @@ -1076,19 +1116,30 @@ def _logical_any(args): return functools.reduce(operator.or_, args) -def _masked_result_drop_slice(key, chunks_hint=None): +def _masked_result_drop_slice(key, data=None): + key = (k for k in key if not isinstance(k, slice)) - if chunks_hint is not None: - key = [ - _dask_array_with_chunks_hint(k, chunks_hint) - if isinstance(k, np.ndarray) - else k - for k in key - ] - return _logical_any(k == -1 for k in key) + chunks_hint = getattr(data, "chunks", None) + + new_keys = [] + for k in key: + if isinstance(k, np.ndarray): + if isinstance(data, dask_array_type): + new_keys.append(_dask_array_with_chunks_hint(k, chunks_hint)) + elif isinstance(data, sparse_array_type): + import sparse + + new_keys.append(sparse.COO.from_numpy(k)) + else: + new_keys.append(k) + else: + new_keys.append(k) + + mask = _logical_any(k == -1 for k in new_keys) + return mask -def create_mask(indexer, shape, chunks_hint=None): +def create_mask(indexer, shape, data=None): """Create a mask for indexing with a fill-value. 
Parameters @@ -1098,25 +1149,24 @@ def create_mask(indexer, shape, chunks_hint=None): the result that should be masked. shape : tuple Shape of the array being indexed. - chunks_hint : tuple, optional - Optional tuple indicating desired chunks for the result. If provided, - used as a hint for chunks on the resulting dask. Must have a hint for - each dimension on the result array. + data : optional + Data for which mask is being created. If data is a dask arrays, its chunks + are used as a hint for chunks on the resulting mask. If data is a sparse + array, the returned mask is also a sparse array. Returns ------- - mask : bool, np.ndarray or dask.array.Array with dtype=bool - Dask array if chunks_hint is provided, otherwise a NumPy array. Has the - same shape as the indexing result. + mask : bool, np.ndarray, SparseArray or dask.array.Array with dtype=bool + Same type as data. Has the same shape as the indexing result. """ if isinstance(indexer, OuterIndexer): key = _outer_to_vectorized_indexer(indexer, shape).tuple assert not any(isinstance(k, slice) for k in key) - mask = _masked_result_drop_slice(key, chunks_hint) + mask = _masked_result_drop_slice(key, data) elif isinstance(indexer, VectorizedIndexer): key = indexer.tuple - base_mask = _masked_result_drop_slice(key, chunks_hint) + base_mask = _masked_result_drop_slice(key, data) slice_shape = tuple( np.arange(*k.indices(size)).size for k, size in zip(key, shape) @@ -1189,6 +1239,8 @@ def posify_mask_indexer(indexer): class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a NumPy array to use explicit indexing.""" + __slots__ = ("array",) + def __init__(self, array): # In NumpyIndexingAdapter we only allow to store bare np.ndarray if not isinstance(array, np.ndarray): @@ -1239,6 +1291,8 @@ def __setitem__(self, key, value): class NdArrayLikeIndexingAdapter(NumpyIndexingAdapter): + __slots__ = ("array",) + def __init__(self, array): if not hasattr(array, "__array_function__"): raise TypeError( @@ -1251,6 +1305,8 @@ def __init__(self, array): class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a dask array to support explicit indexing.""" + __slots__ = ("array",) + def __init__(self, array): """ This adapter is created in Variable.__getitem__ in Variable._broadcast_indexes. @@ -1292,6 +1348,8 @@ class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing. """ + __slots__ = ("array", "_dtype") + def __init__(self, array: Any, dtype: DTypeLike = None): self.array = utils.safe_cast_to_index(array) if dtype is None: diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 882667dbaaa..6dba659f992 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -1,5 +1,6 @@ from collections import OrderedDict from typing import ( + TYPE_CHECKING, Any, Dict, Hashable, @@ -11,7 +12,6 @@ Set, Tuple, Union, - TYPE_CHECKING, ) import pandas as pd @@ -44,6 +44,7 @@ "broadcast_equals": 2, "minimal": 3, "no_conflicts": 4, + "override": 5, } ) @@ -70,8 +71,8 @@ class MergeError(ValueError): # TODO: move this to an xarray.exceptions module? -def unique_variable(name, variables, compat="broadcast_equals"): - # type: (Any, List[Variable], str) -> Variable +def unique_variable(name, variables, compat="broadcast_equals", equals=None): + # type: (Any, List[Variable], str, bool) -> Variable """Return the unique variable from a list of variables or raise MergeError. 
Parameters @@ -81,8 +82,10 @@ def unique_variable(name, variables, compat="broadcast_equals"): variables : list of xarray.Variable List of Variable objects, all of which go by the same name in different inputs. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Type of equality check to use. + equals: None or bool, + corresponding to result of compat test Returns ------- @@ -93,30 +96,38 @@ def unique_variable(name, variables, compat="broadcast_equals"): MergeError: if any of the variables are not equal. """ # noqa out = variables[0] - if len(variables) > 1: - combine_method = None - if compat == "minimal": - compat = "broadcast_equals" + if len(variables) == 1 or compat == "override": + return out + + combine_method = None + + if compat == "minimal": + compat = "broadcast_equals" + + if compat == "broadcast_equals": + dim_lengths = broadcast_dimension_size(variables) + out = out.set_dims(dim_lengths) - if compat == "broadcast_equals": - dim_lengths = broadcast_dimension_size(variables) - out = out.set_dims(dim_lengths) + if compat == "no_conflicts": + combine_method = "fillna" - if compat == "no_conflicts": - combine_method = "fillna" + if equals is None: + out = out.compute() + for var in variables[1:]: + equals = getattr(out, compat)(var) + if not equals: + break + + if not equals: + raise MergeError( + "conflicting values for variable %r on objects to be combined. You can skip this check by specifying compat='override'." + % (name) + ) + if combine_method: for var in variables[1:]: - if not getattr(out, compat)(var): - raise MergeError( - "conflicting values for variable %r on " - "objects to be combined:\n" - "first value: %r\nsecond value: %r" % (name, out, var) - ) - if combine_method: - # TODO: add preservation of attrs into fillna - out = getattr(out, combine_method)(var) - out.attrs = var.attrs + out = getattr(out, combine_method)(var) return out @@ -152,7 +163,7 @@ def merge_variables( priority_vars : mapping with Variable or None values, optional If provided, variables are always taken from this dict in preference to the input variable dictionaries, without checking for conflicts. - compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts', 'override'}, optional Type of equality check to use when checking for conflicts. Returns @@ -449,7 +460,7 @@ def merge_core( ---------- objs : list of mappings All values must be convertable to labeled arrays. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Compatibility checks to use when merging variables. join : {'outer', 'inner', 'left', 'right'}, optional How to combine objects with different indexes. @@ -519,7 +530,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]] Merge together all variables from these objects. If any of them are DataArray objects, they must have a name. 
- compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -531,6 +542,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes in objects. diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index 9ba4eae29ae..17240faf007 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -88,38 +88,21 @@ def nanmax(a, axis=None, out=None): def nanargmin(a, axis=None): - fill_value = dtypes.get_pos_infinity(a.dtype) if a.dtype.kind == "O": + fill_value = dtypes.get_pos_infinity(a.dtype) return _nan_argminmax_object("argmin", fill_value, a, axis=axis) - a, mask = _replace_nan(a, fill_value) - if isinstance(a, dask_array_type): - res = dask_array.argmin(a, axis=axis) - else: - res = np.argmin(a, axis=axis) - if mask is not None: - mask = mask.all(axis=axis) - if mask.any(): - raise ValueError("All-NaN slice encountered") - return res + module = dask_array if isinstance(a, dask_array_type) else nputils + return module.nanargmin(a, axis=axis) def nanargmax(a, axis=None): - fill_value = dtypes.get_neg_infinity(a.dtype) if a.dtype.kind == "O": + fill_value = dtypes.get_neg_infinity(a.dtype) return _nan_argminmax_object("argmax", fill_value, a, axis=axis) - a, mask = _replace_nan(a, fill_value) - if isinstance(a, dask_array_type): - res = dask_array.argmax(a, axis=axis) - else: - res = np.argmax(a, axis=axis) - - if mask is not None: - mask = mask.all(axis=axis) - if mask.any(): - raise ValueError("All-NaN slice encountered") - return res + module = dask_array if isinstance(a, dask_array_type) else nputils + return module.nanargmax(a, axis=axis) def nansum(a, axis=None, dtype=None, out=None, min_count=None): diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index ecaadae726e..22c14d9ff40 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -29,10 +29,10 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
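A small usage sketch of the compat='override' option documented for merge above; the dataset contents are invented for illustration:

import xarray as xr

ds1 = xr.Dataset({"a": ("x", [1, 2, 3])})
ds2 = xr.Dataset({"a": ("x", [10, 20, 30]), "b": ("x", [4, 5, 6])})

# The default compat="no_conflicts" would raise a MergeError because the
# values of "a" conflict; compat="override" skips the check and keeps the
# variable from the first dataset.
merged = xr.merge([ds1, ds2], compat="override")
print(merged["a"].values)  # expected to be [1, 2, 3], taken from ds1
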
import builtins +import operator from distutils.version import LooseVersion from typing import Union -import operator import numpy as np try: diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index a9971e7125a..df36c98f94c 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -209,6 +209,7 @@ def f(values, axis=None, **kwargs): if ( _USE_BOTTLENECK + and isinstance(values, np.ndarray) and bn_func is not None and not isinstance(axis, tuple) and values.dtype.kind in "uifc" @@ -236,3 +237,5 @@ def f(values, axis=None, **kwargs): nanprod = _create_bottleneck_method("nanprod") nancumsum = _create_bottleneck_method("nancumsum") nancumprod = _create_bottleneck_method("nancumprod") +nanargmin = _create_bottleneck_method("nanargmin") +nanargmax = _create_bottleneck_method("nanargmax") diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 654a43b505e..91998482e3e 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -38,10 +38,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from distutils.version import LooseVersion + import numpy as np import pandas as pd - # allow ourselves to type checks for Panel even after it's removed if LooseVersion(pd.__version__) < "0.25.0": Panel = pd.Panel diff --git a/xarray/core/resample.py b/xarray/core/resample.py index de70ebb6950..1f2e5c0be43 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -1,5 +1,5 @@ from . import ops -from .groupby import DEFAULT_DIMS, DataArrayGroupBy, DatasetGroupBy +from .groupby import DataArrayGroupBy, DatasetGroupBy RESAMPLE_DIM = "__resample_dim__" @@ -307,9 +307,6 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): Array with summarized data and the indicated dimension(s) removed. """ - if dim == DEFAULT_DIMS: - dim = None - return super().reduce(func, dim, keep_attrs, **kwargs) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 592cae9007e..a812e7472ca 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -43,7 +43,8 @@ class Rolling: DataArray.rolling """ - _attributes = ["window", "min_periods", "center", "dim"] + __slots__ = ("obj", "window", "min_periods", "center", "dim") + _attributes = ("window", "min_periods", "center", "dim") def __init__(self, obj, windows, min_periods=None, center=False): """ @@ -93,17 +94,17 @@ def __init__(self, obj, windows, min_periods=None, center=False): # attributes self.window = window + if min_periods is not None and min_periods <= 0: + raise ValueError("min_periods must be greater than zero or None") self.min_periods = min_periods - if min_periods is None: - self._min_periods = window - else: - if min_periods <= 0: - raise ValueError("min_periods must be greater than zero or None") - self._min_periods = min_periods self.center = center self.dim = dim + @property + def _min_periods(self): + return self.min_periods if self.min_periods is not None else self.window + def __repr__(self): """provide a nice str repr of our rolling object""" @@ -152,6 +153,8 @@ def count(self): class DataArrayRolling(Rolling): + __slots__ = ("window_labels",) + def __init__(self, obj, windows, min_periods=None, center=False): """ Moving window object for DataArray. @@ -381,6 +384,8 @@ def _numpy_or_bottleneck_reduce( class DatasetRolling(Rolling): + __slots__ = ("rollings",) + def __init__(self, obj, windows, min_periods=None, center=False): """ Moving window object for Dataset. 
@@ -516,7 +521,8 @@ class Coarsen: DataArray.coarsen """ - _attributes = ["windows", "side", "trim_excess"] + __slots__ = ("obj", "boundary", "coord_func", "windows", "side", "trim_excess") + _attributes = ("windows", "side", "trim_excess") def __init__(self, obj, windows, boundary, side, coord_func): """ @@ -569,6 +575,8 @@ def __repr__(self): class DataArrayCoarsen(Coarsen): + __slots__ = () + @classmethod def _reduce_method(cls, func): """ @@ -599,6 +607,8 @@ def wrapped_func(self, **kwargs): class DatasetCoarsen(Coarsen): + __slots__ = () + @classmethod def _reduce_method(cls, func): """ diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ba478686d61..0d730edeaeb 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -29,27 +29,18 @@ import numpy as np import pandas as pd -from .pycompat import dask_array_type - - K = TypeVar("K") V = TypeVar("V") T = TypeVar("T") -def _check_inplace(inplace: Optional[bool], default: bool = False) -> bool: - if inplace is None: - inplace = default - else: - warnings.warn( - "The inplace argument has been deprecated and will be " - "removed in a future version of xarray.", - FutureWarning, - stacklevel=3, +def _check_inplace(inplace: Optional[bool]) -> None: + if inplace is not None: + raise TypeError( + "The `inplace` argument has been removed from xarray. " + "You can achieve an identical effect with python's standard assignment." ) - return inplace - def alias_message(old_name: str, new_name: str) -> str: return "%s has been deprecated. Use %s instead." % (old_name, new_name) @@ -276,16 +267,20 @@ def either_dict_or_kwargs( return cast(Mapping[Hashable, T], kw_kwargs) -def is_scalar(value: Any) -> bool: +def is_scalar(value: Any, include_0d: bool = True) -> bool: """Whether to treat a value as a scalar. Any non-iterable, string, or 0-D array """ + from .variable import NON_NUMPY_SUPPORTED_ARRAY_TYPES + + if include_0d: + include_0d = getattr(value, "ndim", None) == 0 return ( - getattr(value, "ndim", None) == 0 + include_0d or isinstance(value, (str, bytes)) or not ( - isinstance(value, (Iterable,) + dask_array_type) + isinstance(value, (Iterable,) + NON_NUMPY_SUPPORTED_ARRAY_TYPES) or hasattr(value, "__array_function__") ) ) @@ -381,7 +376,7 @@ class Frozen(Mapping[K, V]): saved under the `mapping` attribute. """ - __slots__ = ["mapping"] + __slots__ = ("mapping",) def __init__(self, mapping: Mapping[K, V]): self.mapping = mapping @@ -412,7 +407,7 @@ class SortedKeysDict(MutableMapping[K, V]): mapping. """ - __slots__ = ["mapping"] + __slots__ = ("mapping",) def __init__(self, mapping: MutableMapping[K, V] = None): self.mapping = {} if mapping is None else mapping @@ -446,6 +441,8 @@ class OrderedSet(MutableSet[T]): elements, like an OrderedDict. """ + __slots__ = ("_ordered_dict",) + def __init__(self, values: AbstractSet[T] = None): self._ordered_dict = OrderedDict() # type: MutableMapping[T, None] if values is not None: @@ -486,6 +483,8 @@ class NdimSizeLenMixin: one that also defines ``ndim``, ``size`` and ``__len__``. """ + __slots__ = () + @property def ndim(self: Any) -> int: return len(self.shape) @@ -510,6 +509,8 @@ class NDArrayMixin(NdimSizeLenMixin): `dtype`, `shape` and `__getitem__`. """ + __slots__ = () + @property def dtype(self: Any) -> np.dtype: return self.array.dtype @@ -623,6 +624,8 @@ class HiddenKeyDict(MutableMapping[K, V]): """Acts like a normal dictionary, but hides certain keys. """ + __slots__ = ("_data", "_hidden_keys") + # ``__init__`` method required to create instance from class. 
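Since the reworked _check_inplace above now raises a TypeError for any non-None inplace argument, the replacement pattern is plain reassignment; a minimal sketch with invented data:

import xarray as xr

ds = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"c": 42})

# Previously: ds.reset_coords(inplace=True)  -- this now raises TypeError.
# The supported spelling is ordinary assignment of the returned object:
ds = ds.reset_coords()
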
def __init__(self, data: MutableMapping[K, V], hidden_keys: Iterable[K]): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 4c095f3a062..b4b01f7ee49 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -3,7 +3,7 @@ from collections import OrderedDict, defaultdict from datetime import timedelta from distutils.version import LooseVersion -from typing import Any, Hashable, Mapping, MutableMapping, Union +from typing import Any, Hashable, Mapping, Union import numpy as np import pandas as pd @@ -18,9 +18,9 @@ VectorizedIndexer, as_indexable, ) +from .npcompat import IS_NEP18_ACTIVE from .options import _get_keep_attrs from .pycompat import dask_array_type, integer_types -from .npcompat import IS_NEP18_ACTIVE from .utils import ( OrderedSet, decode_numpy_dict_values, @@ -267,6 +267,8 @@ class Variable( they can use more complete metadata in context of coordinate labels. """ + __slots__ = ("_dims", "_data", "_attrs", "_encoding") + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): """ Parameters @@ -710,8 +712,7 @@ def _getitem_with_mask(self, key, fill_value=dtypes.NA): actual_indexer = indexer data = as_indexable(self._data)[actual_indexer] - chunks_hint = getattr(data, "chunks", None) - mask = indexing.create_mask(indexer, self.shape, chunks_hint) + mask = indexing.create_mask(indexer, self.shape, data) data = duck_array_ops.where(mask, fill_value, data) else: # array cannot be indexed along dimensions of size 0, so just @@ -1225,16 +1226,6 @@ def transpose(self, *dims) -> "Variable": def T(self) -> "Variable": return self.transpose() - def expand_dims(self, *args): - import warnings - - warnings.warn( - "Variable.expand_dims is deprecated: use " "Variable.set_dims instead", - DeprecationWarning, - stacklevel=2, - ) - return self.expand_dims(*args) - def set_dims(self, dims, shape=None): """Return a new variable with given set of dimensions. This method might be used to attach new dimension(s) to variable. @@ -1601,7 +1592,7 @@ def no_conflicts(self, other): """ return self.broadcast_equals(other, equiv=duck_array_ops.array_notnull_equiv) - def quantile(self, q, dim=None, interpolation="linear"): + def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): """Compute the qth quantile of the data along the specified dimension. Returns the qth quantiles(s) of the array elements. @@ -1624,6 +1615,10 @@ def quantile(self, q, dim=None, interpolation="linear"): * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. + keep_attrs : bool, optional + If True, the variable's attributes (`attrs`) will be copied from + the original object to the new one. If False (default), the new + object will be returned without attributes. Returns ------- @@ -1632,7 +1627,7 @@ def quantile(self, q, dim=None, interpolation="linear"): is a scalar. If multiple percentiles are given, first axis of the result corresponds to the quantile and a quantile dimension is added to the return array. The other dimensions are the - dimensions that remain after the reduction of the array. + dimensions that remain after the reduction of the array. 
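The keep_attrs parameter documented above behaves as in this short sketch; the attribute name and values are arbitrary examples:

import xarray as xr

da = xr.DataArray([1.0, 2.0, 3.0, 4.0], dims="x", attrs={"units": "m"})

print(da.quantile(0.5).attrs)                   # {} -- attrs dropped by default
print(da.quantile(0.5, keep_attrs=True).attrs)  # expected: {'units': 'm'}
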
See Also -------- @@ -1660,14 +1655,19 @@ def quantile(self, q, dim=None, interpolation="linear"): axis = None new_dims = [] - # only add the quantile dimension if q is array like + # Only add the quantile dimension if q is array-like if q.ndim != 0: new_dims = ["quantile"] + new_dims qs = np.nanpercentile( self.data, q * 100.0, axis=axis, interpolation=interpolation ) - return Variable(new_dims, qs) + + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + attrs = self._attrs if keep_attrs else None + + return Variable(new_dims, qs, attrs) def rank(self, dim, pct=False): """Ranks the data. @@ -1697,18 +1697,24 @@ def rank(self, dim, pct=False): """ import bottleneck as bn - if isinstance(self.data, dask_array_type): + data = self.data + + if isinstance(data, dask_array_type): raise TypeError( "rank does not work for arrays stored as dask " "arrays. Load the data via .compute() or .load() " "prior to calling this method." ) + elif not isinstance(data, np.ndarray): + raise TypeError( + "rank is not implemented for {} objects.".format(type(data)) + ) axis = self.get_axis_num(dim) func = bn.nanrankdata if self.dtype.kind == "f" else bn.rankdata - ranked = func(self.data, axis=axis) + ranked = func(data, axis=axis) if pct: - count = np.sum(~np.isnan(self.data), axis=axis, keepdims=True) + count = np.sum(~np.isnan(data), axis=axis, keepdims=True) ranked /= count return Variable(self.dims, ranked) @@ -1931,6 +1937,8 @@ class IndexVariable(Variable): unless another name is given. """ + __slots__ = () + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): super().__init__(dims, data, attrs, encoding, fastpath) if self.ndim != 1: diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index 79f94077c8f..ec51ff26c07 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -67,7 +67,6 @@ class FacetGrid: Contains dictionaries mapping coordinate names to values. None is used as a sentinel value for axes which should remain empty, ie. sometimes the bottom right grid - """ def __init__( diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index 14f03d42fe7..8ca62ef58f1 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -452,6 +452,8 @@ class _PlotMethods: For example, DataArray.plot.imshow """ + __slots__ = ("_da",) + def __init__(self, darray): self._da = darray diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 2d50734f519..f69a8af7a2f 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -731,17 +731,15 @@ def _process_cmap_cbar_kwargs( # colors is only valid when levels is supplied or the plot is of type # contour or contourf - if colors and (("contour" not in func.__name__) and (not levels)): + if colors and (("contour" not in func.__name__) and (levels is None)): raise ValueError("Can only specify colors with contour or levels") # we should not be getting a list of colors in cmap anymore # is there a better way to do this test? if isinstance(cmap, (list, tuple)): - warnings.warn( + raise ValueError( "Specifying a list of colors in cmap is deprecated. " - "Use colors keyword instead.", - DeprecationWarning, - stacklevel=3, + "Use colors keyword instead." 
) cmap_kwargs = { diff --git a/xarray/testing.py b/xarray/testing.py index 3c92eef04c6..9fa58b64001 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -5,12 +5,11 @@ import numpy as np import pandas as pd -from xarray.core import duck_array_ops -from xarray.core import formatting +from xarray.core import duck_array_ops, formatting from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset -from xarray.core.variable import IndexVariable, Variable from xarray.core.indexes import default_indexes +from xarray.core.variable import IndexVariable, Variable def _decode_string_data(data): @@ -198,8 +197,6 @@ def _assert_dataarray_invariants(da: DataArray): if da._indexes is not None: _assert_indexes_invariants_checks(da._indexes, da._coords, da.dims) - assert da._initialized is True - def _assert_dataset_invariants(ds: Dataset): assert isinstance(ds._variables, OrderedDict), type(ds._variables) @@ -236,7 +233,6 @@ def _assert_dataset_invariants(ds: Dataset): assert isinstance(ds._encoding, (type(None), dict)) assert isinstance(ds._attrs, (type(None), OrderedDict)) - assert ds._initialized is True def _assert_internal_invariants(xarray_obj: Union[DataArray, Dataset, Variable],): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 044ba75e87f..ab1d2714b9d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -1,4 +1,5 @@ import importlib +import platform import re import warnings from contextlib import contextmanager @@ -32,7 +33,6 @@ except ImportError: pass -import platform arm_xfail = pytest.mark.xfail( platform.machine() == "aarch64" or "arm" in platform.machine(), @@ -84,6 +84,7 @@ def LooseVersion(vstring): has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") +has_sparse, requires_sparse = _importorskip("sparse") # some special cases has_h5netcdf07, requires_h5netcdf07 = _importorskip("h5netcdf", minversion="0.7") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dd102f8e2e1..f6254b32f4f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -38,6 +38,7 @@ from xarray.tests import mock from . 
import ( + arm_xfail, assert_allclose, assert_array_equal, assert_equal, @@ -61,14 +62,13 @@ requires_scipy, requires_scipy_or_netCDF4, requires_zarr, - arm_xfail, ) from .test_coding_times import ( _ALL_CALENDARS, _NON_STANDARD_CALENDARS, _STANDARD_CALENDARS, ) -from .test_dataset import create_test_data, create_append_test_data +from .test_dataset import create_append_test_data, create_test_data try: import netCDF4 as nc4 @@ -2163,6 +2163,7 @@ def test_encoding_unlimited_dims(self): @requires_h5netcdf @requires_netCDF4 +@pytest.mark.filterwarnings("ignore:use make_scale(name) instead") class TestH5NetCDFData(NetCDF4Base): engine = "h5netcdf" @@ -2173,16 +2174,25 @@ def create_store(self): @pytest.mark.filterwarnings("ignore:complex dtypes are supported by h5py") @pytest.mark.parametrize( - "invalid_netcdf, warns, num_warns", + "invalid_netcdf, warntype, num_warns", [(None, FutureWarning, 1), (False, FutureWarning, 1), (True, None, 0)], ) - def test_complex(self, invalid_netcdf, warns, num_warns): + def test_complex(self, invalid_netcdf, warntype, num_warns): expected = Dataset({"x": ("y", np.ones(5) + 1j * np.ones(5))}) save_kwargs = {"invalid_netcdf": invalid_netcdf} - with pytest.warns(warns) as record: + with pytest.warns(warntype) as record: with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: assert_equal(expected, actual) - assert len(record) == num_warns + + recorded_num_warns = 0 + if warntype: + for warning in record: + if issubclass(warning.category, warntype) and ( + "complex dtypes" in str(warning.message) + ): + recorded_num_warns += 1 + + assert recorded_num_warns == num_warns def test_cross_engine_read_write_netcdf4(self): # Drop dim3, because its labels include strings. These appear to be @@ -2451,6 +2461,7 @@ def skip_if_not_engine(engine): @requires_dask +@pytest.mark.filterwarnings("ignore:use make_scale(name) instead") def test_open_mfdataset_manyfiles( readengine, nfiles, parallel, chunks, file_cache_maxsize ): @@ -3923,6 +3934,12 @@ def test_ENVI_tags(self): assert isinstance(rioda.attrs["map_info"], str) assert isinstance(rioda.attrs["samples"], str) + def test_geotiff_tags(self): + # Create a geotiff file with some tags + with create_tmp_geotiff() as (tmp_file, _): + with xr.open_rasterio(tmp_file) as rioda: + assert isinstance(rioda.attrs["AREA_OR_POINT"], str) + def test_no_mftime(self): # rasterio can accept "filename" urguments that are actually urls, # including paths to remote files. diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index ab5ed20d531..615a7e00172 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -19,13 +19,13 @@ from xarray.testing import assert_equal from . 
import ( + arm_xfail, assert_array_equal, has_cftime, has_cftime_or_netCDF4, has_dask, requires_cftime, requires_cftime_or_netCDF4, - arm_xfail, ) try: diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index e3801d02bc8..6037669ac07 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -1,23 +1,29 @@ from collections import OrderedDict -from itertools import product from datetime import datetime +from itertools import product import numpy as np import pytest -from xarray import DataArray, Dataset, concat, combine_by_coords, combine_nested -from xarray import auto_combine +from xarray import ( + DataArray, + Dataset, + auto_combine, + combine_by_coords, + combine_nested, + concat, +) from xarray.core import dtypes from xarray.core.combine import ( - _new_tile_id, _check_shape_tile_ids, _combine_all_along_first_dim, _combine_nd, - _infer_concat_order_from_positions, _infer_concat_order_from_coords, + _infer_concat_order_from_positions, + _new_tile_id, ) -from . import assert_identical, assert_equal, raises_regex +from . import assert_equal, assert_identical, raises_regex from .test_dataset import create_test_data @@ -321,13 +327,13 @@ class TestCheckShapeTileIDs: def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent depths"): + with raises_regex(ValueError, "sub-lists do not have consistent depths"): _check_shape_tile_ids(combined_tile_ids) def test_check_lengths(self): ds = create_test_data(0) combined_tile_ids = {(0, 0): ds, (0, 1): ds, (0, 2): ds, (1, 0): ds, (1, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent lengths"): + with raises_regex(ValueError, "sub-lists do not have consistent lengths"): _check_shape_tile_ids(combined_tile_ids) @@ -559,11 +565,6 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({"x": [0]}) assert_identical(expected, actual) - def test_combine_nested_but_need_auto_combine(self): - objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2], "wall": [0]})] - with raises_regex(ValueError, "cannot be combined"): - combine_nested(objs, concat_dim="x") - @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_combine_nested_fill_value(self, fill_value): datasets = [ @@ -612,7 +613,7 @@ def test_combine_by_coords(self): assert_equal(actual, expected) objs = [Dataset({"x": 0}), Dataset({"x": 1})] - with raises_regex(ValueError, "Could not find any dimension " "coordinates"): + with raises_regex(ValueError, "Could not find any dimension coordinates"): combine_by_coords(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] @@ -713,7 +714,7 @@ def test_check_for_impossible_ordering(self): @pytest.mark.filterwarnings( - "ignore:In xarray version 0.13 `auto_combine` " "will be deprecated" + "ignore:In xarray version 0.14 `auto_combine` " "will be deprecated" ) @pytest.mark.filterwarnings("ignore:Also `open_mfdataset` will no longer") @pytest.mark.filterwarnings("ignore:The datasets supplied") @@ -755,7 +756,7 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with pytest.raises(KeyError): + with raises_regex(ValueError, "'y' is not present in all datasets"): auto_combine(objs) def test_auto_combine_previously_failed(self): diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 4adcc0d5c49..00428f70966 100644 --- a/xarray/tests/test_concat.py +++ 
b/xarray/tests/test_concat.py @@ -5,7 +5,7 @@ import pytest from xarray import DataArray, Dataset, Variable, concat -from xarray.core import dtypes +from xarray.core import dtypes, merge from . import ( InaccessibleArray, assert_array_equal, @@ -17,6 +17,34 @@ from .test_dataset import create_test_data +def test_concat_compat(): + ds1 = Dataset( + { + "has_x_y": (("y", "x"), [[1, 2]]), + "has_x": ("x", [1, 2]), + "no_x_y": ("z", [1, 2]), + }, + coords={"x": [0, 1], "y": [0], "z": [-1, -2]}, + ) + ds2 = Dataset( + { + "has_x_y": (("y", "x"), [[3, 4]]), + "has_x": ("x", [1, 2]), + "no_x_y": (("q", "z"), [[1, 2]]), + }, + coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, + ) + + result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") + assert_equal(ds2.no_x_y, result.no_x_y.transpose()) + + for var in ["has_x", "no_x_y"]: + assert "y" not in result[var] + + with raises_regex(ValueError, "'q' is not present in all datasets"): + concat([ds1, ds2], dim="q", data_vars="all", compat="broadcast_equals") + + class TestConcatDataset: @pytest.fixture def data(self): @@ -91,7 +119,7 @@ def test_concat_coords(self): actual = concat(objs, dim="x", coords=coords) assert_identical(expected, actual) for coords in ["minimal", []]: - with raises_regex(ValueError, "not equal across"): + with raises_regex(merge.MergeError, "conflicting values"): concat(objs, dim="x", coords=coords) def test_concat_constant_index(self): @@ -102,8 +130,10 @@ def test_concat_constant_index(self): for mode in ["different", "all", ["foo"]]: actual = concat([ds1, ds2], "y", data_vars=mode) assert_identical(expected, actual) - with raises_regex(ValueError, "not equal across datasets"): - concat([ds1, ds2], "y", data_vars="minimal") + with raises_regex(merge.MergeError, "conflicting values"): + # previously dim="y", and raised error which makes no sense. + # "foo" has dimension "y" so minimal should concatenate it? 
+ concat([ds1, ds2], "new_dim", data_vars="minimal") def test_concat_size0(self): data = create_test_data() @@ -133,6 +163,14 @@ def test_concat_errors(self): data = create_test_data() split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] + with raises_regex(ValueError, "must supply at least one"): + concat([], "dim1") + + with raises_regex(ValueError, "Cannot specify both .*='different'"): + concat( + [data, data], dim="concat_dim", data_vars="different", compat="override" + ) + with raises_regex(ValueError, "must supply at least one"): concat([], "dim1") @@ -145,7 +183,7 @@ def test_concat_errors(self): concat([data0, data1], "dim1", compat="identical") assert_identical(data, concat([data0, data1], "dim1", compat="equals")) - with raises_regex(ValueError, "encountered unexpected"): + with raises_regex(ValueError, "present in some datasets"): data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) concat([data0, data1], "dim1") @@ -162,11 +200,6 @@ def test_concat_errors(self): with raises_regex(ValueError, "coordinate in some datasets but not others"): concat([Dataset({"x": 0}), Dataset({}, {"x": 1})], dim="z") - with raises_regex(ValueError, "no longer a valid"): - concat([data, data], "new_dim", mode="different") - with raises_regex(ValueError, "no longer a valid"): - concat([data, data], "new_dim", concat_over="different") - def test_concat_join_kwarg(self): ds1 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [0], "y": [0]}) ds2 = Dataset({"a": (("x", "y"), [[0]])}, coords={"x": [1], "y": [0.0001]}) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 36c1d845f8e..5d80abb4661 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -278,6 +278,26 @@ def test_decode_cf_with_dask(self): ) assert_identical(decoded, conventions.decode_cf(original).compute()) + @requires_dask + def test_decode_dask_times(self): + original = Dataset.from_dict( + { + "coords": {}, + "dims": {"time": 5}, + "data_vars": { + "average_T1": { + "dims": ("time",), + "attrs": {"units": "days since 1958-01-01 00:00:00"}, + "data": [87659.0, 88024.0, 88389.0, 88754.0, 89119.0], + } + }, + } + ) + assert_identical( + conventions.decode_cf(original.chunk()), + conventions.decode_cf(original).chunk(), + ) + class CFEncodedInMemoryStore(WritableCFDataStore, InMemoryDataStore): def encode_variable(self, var): diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index e3fc6f65e0f..76b3ed1a8d6 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -27,14 +27,49 @@ dd = pytest.importorskip("dask.dataframe") +class CountingScheduler: + """ Simple dask scheduler counting the number of computes. + + Reference: https://stackoverflow.com/questions/53289286/ """ + + def __init__(self, max_computes=0): + self.total_computes = 0 + self.max_computes = max_computes + + def __call__(self, dsk, keys, **kwargs): + self.total_computes += 1 + if self.total_computes > self.max_computes: + raise RuntimeError( + "Too many computes. Total: %d > max: %d." + % (self.total_computes, self.max_computes) + ) + return dask.get(dsk, keys, **kwargs) + + +def _set_dask_scheduler(scheduler=dask.get): + """ Backwards compatible way of setting scheduler. 
""" + if LooseVersion(dask.__version__) >= LooseVersion("0.18.0"): + return dask.config.set(scheduler=scheduler) + return dask.set_options(get=scheduler) + + +def raise_if_dask_computes(max_computes=0): + scheduler = CountingScheduler(max_computes) + return _set_dask_scheduler(scheduler) + + +def test_raise_if_dask_computes(): + data = da.from_array(np.random.RandomState(0).randn(4, 6), chunks=(2, 2)) + with raises_regex(RuntimeError, "Too many computes"): + with raise_if_dask_computes(): + data.compute() + + class DaskTestCase: def assertLazyAnd(self, expected, actual, test): - - with ( - dask.config.set(scheduler="single-threaded") - if LooseVersion(dask.__version__) >= LooseVersion("0.18.0") - else dask.set_options(get=dask.get) - ): + with _set_dask_scheduler(dask.get): + # dask.get is the syncronous scheduler, which get's set also by + # dask.config.set(scheduler="syncronous") in current versions. test(actual, expected) if isinstance(actual, Dataset): @@ -174,7 +209,12 @@ def test_reduce(self): v = self.lazy_var self.assertLazyAndAllClose(u.mean(), v.mean()) self.assertLazyAndAllClose(u.std(), v.std()) - self.assertLazyAndAllClose(u.argmax(dim="x"), v.argmax(dim="x")) + with raise_if_dask_computes(): + actual = v.argmax(dim="x") + self.assertLazyAndAllClose(u.argmax(dim="x"), actual) + with raise_if_dask_computes(): + actual = v.argmin(dim="x") + self.assertLazyAndAllClose(u.argmin(dim="x"), actual) self.assertLazyAndAllClose((u > 1).any(), (v > 1).any()) self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x")) with raises_regex(NotImplementedError, "dask"): @@ -785,7 +825,6 @@ def kernel(name): """Dask kernel to test pickling/unpickling and __repr__. Must be global to make it pickleable. """ - print("kernel(%s)" % name) global kernel_call_count kernel_call_count += 1 return np.ones(1, dtype=np.int64) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 506c437c2bf..9ba3eecc5a0 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -24,12 +24,12 @@ assert_identical, raises_regex, requires_bottleneck, - requires_cftime, requires_dask, requires_iris, requires_np113, requires_numbagg, requires_scipy, + requires_sparse, source_ndarray, ) @@ -1002,63 +1002,53 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_isel_points(self): - shape = (10, 5, 6) - np_array = np.random.random(shape) - da = DataArray( - np_array, dims=["time", "y", "x"], coords={"time": np.arange(0, 100, 10)} - ) - y = [1, 3] - x = [3, 0] - - expected = da.values[:, y, x] - - actual = da.isel_points(y=y, x=x, dim="test_coord") - assert actual.coords["test_coord"].shape == (len(y),) - assert list(actual.coords) == ["time"] - assert actual.dims == ("test_coord", "time") - - actual = da.isel_points(y=y, x=x) - assert "points" in actual.dims - # Note that because xarray always concatenates along the first - # dimension, We must transpose the result to match the numpy style of - # concatenation. 
- np.testing.assert_equal(actual.T, expected) - - # a few corner cases - da.isel_points(time=[1, 2], x=[2, 2], y=[3, 4]) - np.testing.assert_allclose( - da.isel_points(time=[1], x=[2], y=[4]).values.squeeze(), - np_array[1, 4, 2].squeeze(), - ) - da.isel_points(time=[1, 2]) - y = [-1, 0] - x = [-2, 2] - expected = da.values[:, y, x] - actual = da.isel_points(x=x, y=y).values - np.testing.assert_equal(actual.T, expected) - - # test that the order of the indexers doesn't matter - assert_identical(da.isel_points(y=y, x=x), da.isel_points(x=x, y=y)) - - # make sure we're raising errors in the right places - with raises_regex(ValueError, "All indexers must be the same length"): - da.isel_points(y=[1, 2], x=[1, 2, 3]) - with raises_regex(ValueError, "dimension bad_key does not exist"): - da.isel_points(bad_key=[1, 2]) - with raises_regex(TypeError, "Indexers must be integers"): - da.isel_points(y=[1.5, 2.2]) - with raises_regex(TypeError, "Indexers must be integers"): - da.isel_points(x=[1, 2, 3], y=slice(3)) - with raises_regex(ValueError, "Indexers must be 1 dimensional"): - da.isel_points(y=1, x=2) - with raises_regex(ValueError, "Existing dimension names are not"): - da.isel_points(y=[1, 2], x=[1, 2], dim="x") - - # using non string dims - actual = da.isel_points(y=[1, 2], x=[1, 2], dim=["A", "B"]) - assert "points" in actual.coords + def test_head(self): + assert_equal(self.dv.isel(x=slice(5)), self.dv.head(x=5)) + assert_equal(self.dv.isel(x=slice(0)), self.dv.head(x=0)) + assert_equal( + self.dv.isel({dim: slice(6) for dim in self.dv.dims}), self.dv.head(6) + ) + assert_equal( + self.dv.isel({dim: slice(5) for dim in self.dv.dims}), self.dv.head() + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.head([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.head(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.head(-3) + + def test_tail(self): + assert_equal(self.dv.isel(x=slice(-5, None)), self.dv.tail(x=5)) + assert_equal(self.dv.isel(x=slice(0)), self.dv.tail(x=0)) + assert_equal( + self.dv.isel({dim: slice(-6, None) for dim in self.dv.dims}), + self.dv.tail(6), + ) + assert_equal( + self.dv.isel({dim: slice(-5, None) for dim in self.dv.dims}), self.dv.tail() + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.tail([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.tail(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.tail(-3) + + def test_thin(self): + assert_equal(self.dv.isel(x=slice(None, None, 5)), self.dv.thin(x=5)) + assert_equal( + self.dv.isel({dim: slice(None, None, 6) for dim in self.dv.dims}), + self.dv.thin(6), + ) + with raises_regex(TypeError, "either dict-like or a single int"): + self.dv.thin([3]) + with raises_regex(TypeError, "expected integer type"): + self.dv.thin(x=3.1) + with raises_regex(ValueError, "expected positive int"): + self.dv.thin(-3) + with raises_regex(ValueError, "cannot be zero"): + self.dv.thin(time=0) def test_loc(self): self.ds["x"] = ("x", np.array(list("abcdefghij"))) @@ -1350,9 +1340,8 @@ def test_reset_coords(self): ) assert_identical(actual, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): - with raises_regex(ValueError, "cannot reset coord"): - data = data.reset_coords(inplace=True) + with pytest.raises(TypeError): + data = data.reset_coords(inplace=True) with raises_regex(ValueError, "cannot be found"): data.reset_coords("foo", drop=True) with 
raises_regex(ValueError, "cannot be found"): @@ -1454,13 +1443,11 @@ def test_reindex_like_no_index(self): with raises_regex(ValueError, "different size for unlabeled"): foo.reindex_like(bar) - @pytest.mark.filterwarnings("ignore:Indexer has dimensions") def test_reindex_regressions(self): - # regression test for #279 - expected = DataArray(np.random.randn(5), coords=[("time", range(5))]) + da = DataArray(np.random.randn(5), coords=[("time", range(5))]) time2 = DataArray(np.arange(5), dims="time2") - actual = expected.reindex(time=time2) - assert_identical(actual, expected) + with pytest.raises(ValueError): + da.reindex(time=time2) # regression test for #736, reindex can not change complex nums dtype x = np.array([1, 2, 3], dtype=np.complex) @@ -1506,6 +1493,32 @@ def test_rename(self): renamed_kwargs = self.dv.x.rename(x="z").rename("z") assert_identical(renamed, renamed_kwargs) + def test_init_value(self): + expected = DataArray( + np.full((3, 4), 3), dims=["x", "y"], coords=[range(3), range(4)] + ) + actual = DataArray(3, dims=["x", "y"], coords=[range(3), range(4)]) + assert_identical(expected, actual) + + expected = DataArray( + np.full((1, 10, 2), 0), + dims=["w", "x", "y"], + coords={"x": np.arange(10), "y": ["north", "south"]}, + ) + actual = DataArray(0, dims=expected.dims, coords=expected.coords) + assert_identical(expected, actual) + + expected = DataArray( + np.full((10, 2), np.nan), coords=[("x", np.arange(10)), ("y", ["a", "b"])] + ) + actual = DataArray(coords=[("x", np.arange(10)), ("y", ["a", "b"])]) + assert_identical(expected, actual) + + with raises_regex(ValueError, "different number of dim"): + DataArray(np.array(1), coords={"x": np.arange(10)}, dims=["x"]) + with raises_regex(ValueError, "does not match the 0 dim"): + DataArray(np.array(1), coords=[("x", np.arange(10))]) + def test_swap_dims(self): array = DataArray(np.random.randn(3), {"y": ("x", list("abc"))}, "x") expected = DataArray(array.values, {"y": list("abc")}, dims="y") @@ -1761,10 +1774,9 @@ def test_reorder_levels(self): obj = self.mda.reorder_levels(x=["level_2", "level_1"]) assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): array = self.mda.copy() array.reorder_levels(x=["level_2", "level_1"], inplace=True) - assert_identical(array, expected) array = DataArray([1, 2], dims="x") with pytest.raises(KeyError): @@ -2321,17 +2333,17 @@ def test_reduce_out(self): with pytest.raises(TypeError): orig.mean(out=np.ones(orig.shape)) - # skip due to bug in older versions of numpy.nanpercentile def test_quantile(self): for q in [0.25, [0.50], [0.25, 0.75]]: for axis, dim in zip( [None, 0, [0], [0, 1]], [None, "x", ["x"], ["x", "y"]] ): - actual = self.dv.quantile(q, dim=dim) + actual = DataArray(self.va).quantile(q, dim=dim, keep_attrs=True) expected = np.nanpercentile( self.dv.values, np.array(q) * 100, axis=axis ) np.testing.assert_allclose(actual.values, expected) + assert actual.attrs == self.attrs def test_reduce_keep_attrs(self): # Test dropped attrs @@ -2487,16 +2499,6 @@ def test_groupby_sum(self): assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y")) assert_allclose(expected_sum_axis1, grouped.sum("y")) - def test_groupby_warning(self): - array = self.make_groupby_example_array() - grouped = array.groupby("y") - with pytest.warns(FutureWarning): - grouped.sum() - - @pytest.mark.skipif( - LooseVersion(xr.__version__) < LooseVersion("0.13"), - reason="not to forget the behavior change", - ) def 
test_groupby_sum_default(self): array = self.make_groupby_example_array() grouped = array.groupby("abc") @@ -2517,7 +2519,7 @@ def test_groupby_sum_default(self): } )["foo"] - assert_allclose(expected_sum_all, grouped.sum()) + assert_allclose(expected_sum_all, grouped.sum(dim="y")) def test_groupby_count(self): array = DataArray( @@ -3433,6 +3435,19 @@ def test_to_and_from_series(self): expected_da = self.dv.rename(None) assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"])) + @requires_sparse + def test_from_series_sparse(self): + import sparse + + series = pd.Series([1, 2], index=[("a", 1), ("b", 2)]) + + actual_sparse = DataArray.from_series(series, sparse=True) + actual_dense = DataArray.from_series(series, sparse=False) + + assert isinstance(actual_sparse.data, sparse.COO) + actual_sparse.data = actual_sparse.data.todense() + assert_identical(actual_sparse, actual_dense) + def test_to_and_from_empty_series(self): # GH697 expected = pd.Series([]) @@ -3693,10 +3708,8 @@ def test_to_dataset_whole(self): expected = Dataset({"foo": ("x", [1, 2])}) assert_identical(expected, actual) - expected = Dataset({"bar": ("x", [1, 2])}) - with pytest.warns(FutureWarning): + with pytest.raises(TypeError): actual = named.to_dataset("bar") - assert_identical(expected, actual) def test_to_dataset_split(self): array = DataArray([1, 2, 3], coords=[("x", list("abc"))], attrs={"a": 1}) @@ -4637,3 +4650,36 @@ def test_rolling_exp(da, dim, window_type, window): ) assert_allclose(expected.variable, result.variable) + + +def test_no_dict(): + d = DataArray() + with pytest.raises(AttributeError): + d.__dict__ + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_subclass_slots(): + """Test that DataArray subclasses must explicitly define ``__slots__``. + + .. note:: + As of 0.13.0, this is actually mitigated into a FutureWarning for any class + defined outside of the xarray package. + """ + with pytest.raises(AttributeError) as e: + + class MyArray(DataArray): + pass + + assert str(e.value) == "MyArray must explicitly define __slots__" + + +def test_weakref(): + """Classes with __slots__ are incompatible with the weakref module unless they + explicitly state __weakref__ among their slots + """ + from weakref import ref + + a = DataArray(1) + r = ref(a) + assert r() is a diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 55358e47e41..f02990a1be9 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -31,8 +31,8 @@ from xarray.core.pycompat import integer_types from . 
import ( - LooseVersion, InaccessibleArray, + LooseVersion, UnexpectedDataAccess, assert_allclose, assert_array_equal, @@ -46,6 +46,7 @@ requires_dask, requires_numbagg, requires_scipy, + requires_sparse, source_ndarray, ) @@ -1410,115 +1411,77 @@ def test_isel_drop(self): selected = data.isel(x=0, drop=False) assert_identical(expected, selected) - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_isel_points(self): + def test_head(self): data = create_test_data() - pdim1 = [1, 2, 3] - pdim2 = [4, 5, 1] - pdim3 = [1, 2, 3] - actual = data.isel_points(dim1=pdim1, dim2=pdim2, dim3=pdim3, dim="test_coord") - assert "test_coord" in actual.dims - assert actual.coords["test_coord"].shape == (len(pdim1),) - - actual = data.isel_points(dim1=pdim1, dim2=pdim2) - assert "points" in actual.dims - assert "dim3" in actual.dims - assert "dim3" not in actual.data_vars - np.testing.assert_array_equal(data["dim2"][pdim2], actual["dim2"]) + expected = data.isel(time=slice(5), dim2=slice(6)) + actual = data.head(time=5, dim2=6) + assert_equal(expected, actual) - # test that the order of the indexers doesn't matter - assert_identical( - data.isel_points(dim1=pdim1, dim2=pdim2), - data.isel_points(dim2=pdim2, dim1=pdim1), - ) + expected = data.isel(time=slice(0)) + actual = data.head(time=0) + assert_equal(expected, actual) - # make sure we're raising errors in the right places - with raises_regex(ValueError, "All indexers must be the same length"): - data.isel_points(dim1=[1, 2], dim2=[1, 2, 3]) - with raises_regex(ValueError, "dimension bad_key does not exist"): - data.isel_points(bad_key=[1, 2]) - with raises_regex(TypeError, "Indexers must be integers"): - data.isel_points(dim1=[1.5, 2.2]) - with raises_regex(TypeError, "Indexers must be integers"): - data.isel_points(dim1=[1, 2, 3], dim2=slice(3)) - with raises_regex(ValueError, "Indexers must be 1 dimensional"): - data.isel_points(dim1=1, dim2=2) - with raises_regex(ValueError, "Existing dimension names are not valid"): - data.isel_points(dim1=[1, 2], dim2=[1, 2], dim="dim2") + expected = data.isel({dim: slice(6) for dim in data.dims}) + actual = data.head(6) + assert_equal(expected, actual) - # test to be sure we keep around variables that were not indexed - ds = Dataset({"x": [1, 2, 3, 4], "y": 0}) - actual = ds.isel_points(x=[0, 1, 2]) - assert_identical(ds["y"], actual["y"]) + expected = data.isel({dim: slice(5) for dim in data.dims}) + actual = data.head() + assert_equal(expected, actual) - # tests using index or DataArray as a dim - stations = Dataset() - stations["station"] = ("station", ["A", "B", "C"]) - stations["dim1s"] = ("station", [1, 2, 3]) - stations["dim2s"] = ("station", [4, 5, 1]) + with raises_regex(TypeError, "either dict-like or a single int"): + data.head([3]) + with raises_regex(TypeError, "expected integer type"): + data.head(dim2=3.1) + with raises_regex(ValueError, "expected positive int"): + data.head(time=-3) - actual = data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=stations["station"] - ) - assert "station" in actual.coords - assert "station" in actual.dims - assert_identical(actual["station"].drop(["dim2"]), stations["station"]) + def test_tail(self): + data = create_test_data() - # make sure we get the default 'points' coordinate when passed a list - actual = data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=["A", "B", "C"] - ) - assert "points" in actual.coords - assert actual.coords["points"].values.tolist() == ["A", "B", "C"] + expected = 
data.isel(time=slice(-5, None), dim2=slice(-6, None)) + actual = data.tail(time=5, dim2=6) + assert_equal(expected, actual) - # test index - actual = data.isel_points( - dim1=stations["dim1s"].values, - dim2=stations["dim2s"].values, - dim=pd.Index(["A", "B", "C"], name="letters"), - ) - assert "letters" in actual.coords + expected = data.isel(dim1=slice(0)) + actual = data.tail(dim1=0) + assert_equal(expected, actual) - # can pass a numpy array - data.isel_points( - dim1=stations["dim1s"], dim2=stations["dim2s"], dim=np.array([4, 5, 6]) - ) + expected = data.isel({dim: slice(-6, None) for dim in data.dims}) + actual = data.tail(6) + assert_equal(expected, actual) - @pytest.mark.filterwarnings("ignore:Dataset.sel_points") - @pytest.mark.filterwarnings("ignore:Dataset.isel_points") - def test_sel_points(self): - data = create_test_data() + expected = data.isel({dim: slice(-5, None) for dim in data.dims}) + actual = data.tail() + assert_equal(expected, actual) - # add in a range() index - data["dim1"] = data.dim1 + with raises_regex(TypeError, "either dict-like or a single int"): + data.tail([3]) + with raises_regex(TypeError, "expected integer type"): + data.tail(dim2=3.1) + with raises_regex(ValueError, "expected positive int"): + data.tail(time=-3) - pdim1 = [1, 2, 3] - pdim2 = [4, 5, 1] - pdim3 = [1, 2, 3] - expected = data.isel_points( - dim1=pdim1, dim2=pdim2, dim3=pdim3, dim="test_coord" - ) - actual = data.sel_points( - dim1=data.dim1[pdim1], - dim2=data.dim2[pdim2], - dim3=data.dim3[pdim3], - dim="test_coord", - ) - assert_identical(expected, actual) + def test_thin(self): + data = create_test_data() - data = Dataset({"foo": (("x", "y"), np.arange(9).reshape(3, 3))}) - expected = Dataset({"foo": ("points", [0, 4, 8])}) - actual = data.sel_points(x=[0, 1, 2], y=[0, 1, 2]) - assert_identical(expected, actual) + expected = data.isel(time=slice(None, None, 5), dim2=slice(None, None, 6)) + actual = data.thin(time=5, dim2=6) + assert_equal(expected, actual) - data.coords.update({"x": [0, 1, 2], "y": [0, 1, 2]}) - expected.coords.update({"x": ("points", [0, 1, 2]), "y": ("points", [0, 1, 2])}) - actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], method="pad") - assert_identical(expected, actual) + expected = data.isel({dim: slice(None, None, 6) for dim in data.dims}) + actual = data.thin(6) + assert_equal(expected, actual) - with pytest.raises(KeyError): - data.sel_points(x=[2.5], y=[2.0], method="pad", tolerance=1e-3) + with raises_regex(TypeError, "either dict-like or a single int"): + data.thin([3]) + with raises_regex(TypeError, "expected integer type"): + data.thin(dim2=3.1) + with raises_regex(ValueError, "cannot be zero"): + data.thin(time=0) + with raises_regex(ValueError, "expected positive int"): + data.thin(time=-3) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sel_fancy(self): @@ -1766,9 +1729,8 @@ def test_reindex(self): # regression test for #279 expected = Dataset({"x": ("time", np.random.randn(5))}, {"time": range(5)}) time2 = DataArray(np.arange(5), dims="time2") - with pytest.warns(FutureWarning): + with pytest.raises(ValueError): actual = expected.reindex(time=time2) - assert_identical(actual, expected) # another regression test ds = Dataset( @@ -1784,11 +1746,10 @@ def test_reindex(self): def test_reindex_warning(self): data = create_test_data() - with pytest.warns(FutureWarning) as ws: + with pytest.raises(ValueError): # DataArray with different dimension raises Future warning ind = xr.DataArray([0.0, 1.0], dims=["new_dim"], name="ind") 
data.reindex(dim2=ind) - assert any(["Indexer has dimensions " in str(w.message) for w in ws]) # Should not warn ind = xr.DataArray([0.0, 1.0], dims=["dim2"], name="ind") @@ -2214,7 +2175,7 @@ def test_drop_labels_by_keyword(self): # Basic functionality. assert len(data.coords["x"]) == 2 - # This API is allowed but deprecated. + # In the future, this will break. with pytest.warns(DeprecationWarning): ds1 = data.drop(["a"], dim="x") ds2 = data.drop(x="a") @@ -2222,6 +2183,13 @@ def test_drop_labels_by_keyword(self): ds4 = data.drop(x=["a", "b"]) ds5 = data.drop(x=["a", "b"], y=range(0, 6, 2)) + # In the future, this will result in different behavior. + arr = DataArray(range(3), dims=["c"]) + with pytest.warns(FutureWarning): + data.drop(arr.coords) + with pytest.warns(FutureWarning): + data.drop(arr.indexes) + assert_array_equal(ds1.coords["x"], ["b"]) assert_array_equal(ds2.coords["x"], ["b"]) assert_array_equal(ds3.coords["x"], ["b"]) @@ -2428,18 +2396,11 @@ def test_rename_same_name(self): renamed = data.rename(newnames) assert_identical(renamed, data) - @pytest.mark.filterwarnings("ignore:The inplace argument") def test_rename_inplace(self): times = pd.date_range("2000-01-01", periods=3) data = Dataset({"z": ("x", [2, 3, 4]), "t": ("t", times)}) - copied = data.copy() - renamed = data.rename({"x": "y"}) - data.rename({"x": "y"}, inplace=True) - assert_identical(data, renamed) - assert not data.equals(copied) - assert data.dims == {"y": 3, "t": 3} - # check virtual variables - assert_array_equal(data["t.dayofyear"], [1, 2, 3]) + with pytest.raises(TypeError): + data.rename({"x": "y"}, inplace=True) def test_rename_dims(self): original = Dataset({"x": ("x", [0, 1, 2]), "y": ("x", [10, 11, 12]), "z": 42}) @@ -2702,7 +2663,7 @@ def test_set_index(self): obj = ds.set_index(x=mindex.names) assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.set_index(x=mindex.names, inplace=True) assert_identical(ds, expected) @@ -2727,9 +2688,8 @@ def test_reset_index(self): obj = ds.reset_index("x") assert_identical(obj, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.reset_index("x", inplace=True) - assert_identical(ds, expected) def test_reorder_levels(self): ds = create_test_multiindex() @@ -2740,9 +2700,8 @@ def test_reorder_levels(self): reindexed = ds.reorder_levels(x=["level_2", "level_1"]) assert_identical(reindexed, expected) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): ds.reorder_levels(x=["level_2", "level_1"], inplace=True) - assert_identical(ds, expected) ds = Dataset({}, coords={"x": [1, 2]}) with raises_regex(ValueError, "has no MultiIndex"): @@ -2882,11 +2841,8 @@ def test_update(self): assert actual_result is actual assert_identical(expected, actual) - with pytest.warns(FutureWarning, match="The inplace argument"): + with pytest.raises(TypeError): actual = data.update(data, inplace=False) - expected = data - assert actual is not expected - assert_identical(expected, actual) other = Dataset(attrs={"new": "attr"}) actual = data.copy() @@ -3411,18 +3367,6 @@ def test_groupby_reduce(self): actual = data.groupby("letters").mean(ALL_DIMS) assert_allclose(expected, actual) - def test_groupby_warn(self): - data = Dataset( - { - "xy": (["x", "y"], np.random.randn(3, 4)), - "xonly": ("x", np.random.randn(3)), - "yonly": ("y", np.random.randn(4)), - "letters": ("y", ["a", "a", "b", "b"]), - } - ) - with 
pytest.warns(FutureWarning): - data.groupby("x").mean() - def test_groupby_math(self): def reorder_dims(x): return x.transpose("dim1", "dim2", "dim3", "time") @@ -3768,6 +3712,28 @@ def test_to_and_from_dataframe(self): expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) + @requires_sparse + def test_from_dataframe_sparse(self): + import sparse + + df_base = pd.DataFrame( + {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)} + ) + + ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True) + ds_dense = Dataset.from_dataframe(df_base.set_index("x"), sparse=False) + assert isinstance(ds_sparse["y"].data, sparse.COO) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["y"].data = ds_sparse["y"].data.todense() + ds_sparse["z"].data = ds_sparse["z"].data.todense() + assert_identical(ds_dense, ds_sparse) + + ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True) + ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["z"].data = ds_sparse["z"].data.todense() + assert_identical(ds_dense, ds_sparse) + def test_to_and_from_empty_dataframe(self): # GH697 expected = pd.DataFrame({"foo": []}) @@ -4920,7 +4886,7 @@ def test_filter_by_attrs(self): "temperature_10": (["t"], [0], temp10), "precipitation": (["t"], [0], precip), }, - coords={"time": (["t"], [0], dict(axis="T"))}, + coords={"time": (["t"], [0], dict(axis="T", long_name="time_in_seconds"))}, ) # Test return empty Dataset. @@ -4934,6 +4900,11 @@ def test_filter_by_attrs(self): assert_equal(new_ds["precipitation"], ds["precipitation"]) + # Test filter coordinates + new_ds = ds.filter_by_attrs(long_name="time_in_seconds") + assert new_ds["time"].long_name == "time_in_seconds" + assert not bool(new_ds.data_vars) + # Test return more than one DataArray. new_ds = ds.filter_by_attrs(standard_name="air_potential_temperature") assert len(new_ds.data_vars) == 2 @@ -5793,3 +5764,36 @@ def test_trapz_datetime(dask, which_datetime): actual2 = da.integrate("time", datetime_unit="h") assert_allclose(actual, actual2 / 24.0) + + +def test_no_dict(): + d = Dataset() + with pytest.raises(AttributeError): + d.__dict__ + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") +def test_subclass_slots(): + """Test that Dataset subclasses must explicitly define ``__slots__``. + + .. note:: + As of 0.13.0, this is actually mitigated into a FutureWarning for any class + defined outside of the xarray package. + """ + with pytest.raises(AttributeError) as e: + + class MyDS(Dataset): + pass + + assert str(e.value) == "MyDS must explicitly define __slots__" + + +def test_weakref(): + """Classes with __slots__ are incompatible with the weakref module unless they + explicitly state __weakref__ among their slots + """ + from weakref import ref + + ds = Dataset() + r = ref(ds) + assert r() is ds diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index ec63c9651eb..766a391b57f 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -25,13 +25,13 @@ from xarray.testing import assert_allclose, assert_equal from . 
import ( + arm_xfail, assert_array_equal, has_dask, has_np113, raises_regex, requires_cftime, requires_dask, - arm_xfail, ) @@ -245,9 +245,9 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask): def from_series_or_scalar(se): - try: + if isinstance(se, pd.Series): return DataArray.from_series(se) - except AttributeError: # scalar case + else: # scalar case return DataArray(se) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 56fba20ffc0..c518f528537 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from textwrap import dedent import sys +from textwrap import dedent import numpy as np import pandas as pd diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 9127eb71cb7..ee17cc39064 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -134,21 +134,21 @@ def test_da_groupby_quantile(): [("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])], ) - actual_x = array.groupby("x").quantile(0) + actual_x = array.groupby("x").quantile(0, dim=xr.ALL_DIMS) expected_x = xr.DataArray([1, 4], [("x", [1, 2])]) assert_identical(expected_x, actual_x) - actual_y = array.groupby("y").quantile(0) + actual_y = array.groupby("y").quantile(0, dim=xr.ALL_DIMS) expected_y = xr.DataArray([1, 22], [("y", [0, 1])]) assert_identical(expected_y, actual_y) - actual_xx = array.groupby("x").quantile(0, dim="x") + actual_xx = array.groupby("x").quantile(0) expected_xx = xr.DataArray( [[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])] ) assert_identical(expected_xx, actual_xx) - actual_yy = array.groupby("y").quantile(0, dim="y") + actual_yy = array.groupby("y").quantile(0) expected_yy = xr.DataArray( [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]], [("x", [1, 1, 1, 2, 2]), ("y", [0, 1])], @@ -164,7 +164,7 @@ def test_da_groupby_quantile(): ) g = foo.groupby(foo.time.dt.month) - actual = g.quantile(0) + actual = g.quantile(0, dim=xr.ALL_DIMS) expected = xr.DataArray( [ 0.0, diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index f37f8d98ca8..82ee9b63f9d 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -708,7 +708,9 @@ def test_create_mask_dask(): indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]))) expected = np.array(2 * [[False, True, False]]) - actual = indexing.create_mask(indexer, (5, 5, 5), chunks_hint=((1, 1), (2, 1))) + actual = indexing.create_mask( + indexer, (5, 5, 5), da.empty((2, 3), chunks=((1, 1), (2, 1))) + ) assert actual.chunks == ((1, 1), (2, 1)) np.testing.assert_array_equal(expected, actual) @@ -716,12 +718,14 @@ def test_create_mask_dask(): (np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])) ) expected = np.array([[False, True, True]] * 2).T - actual = indexing.create_mask(indexer, (5, 2), chunks_hint=((3,), (2,))) + actual = indexing.create_mask( + indexer, (5, 2), da.empty((3, 2), chunks=((3,), (2,))) + ) assert isinstance(actual, da.Array) np.testing.assert_array_equal(expected, actual) with pytest.raises(ValueError): - indexing.create_mask(indexer, (5, 2), chunks_hint=()) + indexing.create_mask(indexer, (5, 2), da.empty((5,), chunks=(1,))) def test_create_mask_error(): diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index ed1453ce95d..c1e6c7a5ce8 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -196,6 +196,8 @@ def test_merge_compat(self): with raises_regex(ValueError, "compat=.* invalid"): ds1.merge(ds2, 
compat="foobar") + assert ds1.identical(ds1.merge(ds2, compat="override")) + def test_merge_auto_align(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 36e7a38151d..020a49b0114 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -8,7 +8,6 @@ import xarray as xr import xarray.plot as xplt from xarray import DataArray, Dataset -from xarray.coding.times import _import_cftime from xarray.plot.dataset_plot import _infer_meta_data from xarray.plot.plot import _infer_interval_breaks from xarray.plot.utils import ( @@ -1284,33 +1283,45 @@ class TestContour(Common2dMixin, PlotTestCase): plotfunc = staticmethod(xplt.contour) + # matplotlib cmap.colors gives an rgbA ndarray + # when seaborn is used, instead we get an rgb tuple + @staticmethod + def _color_as_tuple(c): + return tuple(c[:3]) + def test_colors(self): - # matplotlib cmap.colors gives an rgbA ndarray - # when seaborn is used, instead we get an rgb tuple - def _color_as_tuple(c): - return tuple(c[:3]) # with single color, we don't want rgb array artist = self.plotmethod(colors="k") assert artist.cmap.colors[0] == "k" artist = self.plotmethod(colors=["k", "b"]) - assert _color_as_tuple(artist.cmap.colors[1]) == (0.0, 0.0, 1.0) + assert self._color_as_tuple(artist.cmap.colors[1]) == (0.0, 0.0, 1.0) artist = self.darray.plot.contour( levels=[-0.5, 0.0, 0.5, 1.0], colors=["k", "r", "w", "b"] ) - assert _color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) - assert _color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) + assert self._color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) + assert self._color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) + # the last color is now under "over" + assert self._color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) + + def test_colors_np_levels(self): + + # https://github.com/pydata/xarray/issues/3284 + levels = np.array([-0.5, 0.0, 0.5, 1.0]) + artist = self.darray.plot.contour(levels=levels, colors=["k", "r", "w", "b"]) + assert self._color_as_tuple(artist.cmap.colors[1]) == (1.0, 0.0, 0.0) + assert self._color_as_tuple(artist.cmap.colors[2]) == (1.0, 1.0, 1.0) # the last color is now under "over" - assert _color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) + assert self._color_as_tuple(artist.cmap._rgba_over) == (0.0, 0.0, 1.0) def test_cmap_and_color_both(self): with pytest.raises(ValueError): self.plotmethod(colors="k", cmap="RdBu") - def list_of_colors_in_cmap_deprecated(self): - with pytest.raises(Exception): + def list_of_colors_in_cmap_raises_error(self): + with raises_regex(ValueError, "list of colors"): self.plotmethod(cmap=["k", "b"]) @pytest.mark.slow diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 74805b225fa..80f80a93a1c 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -1,16 +1,17 @@ -from textwrap import dedent import pickle +from textwrap import dedent + import numpy as np import pandas as pd +import pytest -from xarray import DataArray, Variable -from xarray.core.npcompat import IS_NEP18_ACTIVE import xarray as xr import xarray.ufuncs as xu +from xarray import DataArray, Variable +from xarray.core.npcompat import IS_NEP18_ACTIVE +from xarray.core.pycompat import sparse_array_type -from . import assert_equal, assert_identical, LooseVersion - -import pytest +from . 
import assert_equal, assert_identical param = pytest.param xfail = pytest.mark.xfail @@ -21,8 +22,12 @@ ) sparse = pytest.importorskip("sparse") -from sparse.utils import assert_eq as assert_sparse_eq # noqa -from sparse import COO, SparseArray # noqa + + +def assert_sparse_equal(a, b): + assert isinstance(a, sparse_array_type) + assert isinstance(b, sparse_array_type) + np.testing.assert_equal(a.todense(), b.todense()) def make_ndarray(shape): @@ -107,21 +112,9 @@ def test_variable_property(prop): (do("to_base_variable"), True), (do("transpose"), True), (do("unstack", dimensions={"x": {"x1": 5, "x2": 2}}), True), - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("identical", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), + (do("identical", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -163,21 +156,19 @@ def test_variable_property(prop): True, marks=xfail(reason="Missing implementation for np.nancumsum"), ), - param( - do("fillna", 0), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("fillna", 0), True), param( do("item", (1, 1)), False, marks=xfail(reason="'COO' object has no attribute 'item'"), ), - param(do("max"), False, marks=xfail(reason="Coercion to dense via bottleneck")), param( - do("median"), False, marks=xfail(reason="Coercion to dense via bottleneck") + do("median"), + False, + marks=xfail(reason="Missing implementation for np.nanmedian"), ), - param(do("min"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param(do("max"), False), + param(do("min"), False), param( do("no_conflicts", other=make_xrvar({"x": 10, "y": 5})), True, @@ -188,11 +179,7 @@ def test_variable_property(prop): True, # noqa marks=xfail(reason="Missing implementation for np.pad"), ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("prod"), False), param( do("quantile", q=0.5), True, @@ -201,7 +188,7 @@ def test_variable_property(prop): param( do("rank", dim="x"), False, - marks=xfail(reason="Coercion to dense via bottleneck"), + marks=xfail(reason="Only implemented for NumPy arrays (via bottleneck)"), ), param( do("reduce", func=np.sum, dim="x"), @@ -216,19 +203,15 @@ def test_variable_property(prop): param( do("shift", x=2), True, marks=xfail(reason="mixed sparse-dense operation") ), - param(do("std"), False, marks=xfail(reason="Coercion to dense via bottleneck")), param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), + do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") ), - param(do("var"), False, marks=xfail(reason="Coercion to dense via bottleneck")), - param(do("to_dict"), False, marks=xfail(reason="Coercion to dense")), + (do("sum"), False), param( - do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), - True, - marks=xfail(reason="Coercion of dense to sparse when using sparse mask"), - ), # noqa + do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") + ), + param(do("to_dict"), False, marks=xfail(reason="Coercion to dense")), + 
(do("where", cond=make_xrvar({"x": 10, "y": 5}) > 0.5), True), ], ids=repr, ) @@ -239,7 +222,7 @@ def test_variable_method(func, sparse_output): ret_d = func(var_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -265,7 +248,7 @@ def test_1d_variable_method(func, sparse_output): ret_d = func(var_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data) else: assert np.allclose(ret_s, ret_d) @@ -278,16 +261,18 @@ def setUp(self): self.var = xr.Variable(("x", "y"), self.data) def test_unary_op(self): - assert_sparse_eq(-self.var.data, -self.data) - assert_sparse_eq(abs(self.var).data, abs(self.data)) - assert_sparse_eq(self.var.round().data, self.data.round()) + assert_sparse_equal(-self.var.data, -self.data) + assert_sparse_equal(abs(self.var).data, abs(self.data)) + assert_sparse_equal(self.var.round().data, self.data.round()) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_univariate_ufunc(self): - assert_sparse_eq(np.sin(self.data), xu.sin(self.var).data) + assert_sparse_equal(np.sin(self.data), xu.sin(self.var).data) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_bivariate_ufunc(self): - assert_sparse_eq(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) - assert_sparse_eq(np.maximum(self.data, 0), xu.maximum(0, self.var).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(self.var, 0).data) + assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(0, self.var).data) def test_repr(self): expected = dedent( @@ -300,12 +285,11 @@ def test_repr(self): def test_pickle(self): v1 = self.var v2 = pickle.loads(pickle.dumps(v1)) - assert_sparse_eq(v1.data, v2.data) + assert_sparse_equal(v1.data, v2.data) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_missing_values(self): a = np.array([0, 1, np.nan, 3]) - s = COO.from_numpy(a) + s = sparse.COO.from_numpy(a) var_s = Variable("x", s) assert np.all(var_s.fillna(2).data.todense() == np.arange(4)) assert np.all(var_s.count() == 3) @@ -380,16 +364,8 @@ def test_dataarray_property(prop): # TODO # set_index # swap_dims - param( - do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), - param( - do("equals", make_xrvar({"x": 10, "y": 5})), - False, - marks=xfail(reason="https://github.com/pydata/sparse/issues/270"), - ), + (do("broadcast_equals", make_xrvar({"x": 10, "y": 5})), False), + (do("equals", make_xrvar({"x": 10, "y": 5})), False), param( do("argmax"), True, @@ -410,11 +386,7 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.flip"), ), - param( - do("combine_first", make_xrarray({"x": 10, "y": 5})), - True, - marks=xfail(reason="mixed sparse-dense operation"), - ), + (do("combine_first", make_xrarray({"x": 10, "y": 5})), True), param( do("conjugate"), False, @@ -441,16 +413,8 @@ def test_dataarray_property(prop): marks=xfail(reason="Missing implementation for np.einsum"), ), param(do("dropna", "x"), False, marks=xfail(reason="Coercion to dense")), - param( - do("ffill", "x"), - False, - marks=xfail(reason="Coercion to dense via bottleneck.push"), - ), - param( - do("fillna", 0), - True, - 
marks=xfail(reason="Missing implementation for np.result_type"), - ), + param(do("ffill", "x"), False, marks=xfail(reason="Coercion to dense")), + (do("fillna", 0), True), param( do("interp", coords={"x": np.arange(10) + 0.5}), True, @@ -478,26 +442,16 @@ def test_dataarray_property(prop): False, marks=xfail(reason="'COO' object has no attribute 'item'"), ), - param(do("max"), False, marks=xfail(reason="Coercion to dense via bottleneck")), - param( - do("median"), False, marks=xfail(reason="Coercion to dense via bottleneck") - ), - param(do("min"), False, marks=xfail(reason="Coercion to dense via bottleneck")), + param(do("max"), False), + param(do("min"), False), param( - do("notnull"), + do("median"), False, - marks=xfail(reason="'COO' object has no attribute 'notnull'"), - ), - param( - do("pipe", np.sum, axis=1), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), - param( - do("prod"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), + marks=xfail(reason="Missing implementation for np.nanmedian"), ), + (do("notnull"), True), + (do("pipe", np.sum, axis=1), True), + (do("prod"), False), param( do("quantile", q=0.5), False, @@ -506,7 +460,7 @@ def test_dataarray_property(prop): param( do("rank", "x"), False, - marks=xfail(reason="Coercion to dense via bottleneck"), + marks=xfail(reason="Only implemented for NumPy arrays (via bottleneck)"), ), param( do("reduce", np.sum, dim="x"), @@ -524,23 +478,19 @@ def test_dataarray_property(prop): True, marks=xfail(reason="Indexing COO with more than one iterable index"), ), # noqa - param( - do("roll", x=2), - True, - marks=xfail(reason="Missing implementation for np.result_type"), - ), + (do("roll", x=2, roll_coords=True), True), param( do("sel", x=[0, 1, 2], y=[2, 3]), True, marks=xfail(reason="Indexing COO with more than one iterable index"), ), # noqa - param(do("std"), False, marks=xfail(reason="Coercion to dense via bottleneck")), param( - do("sum"), - False, - marks=xfail(reason="Missing implementation for np.result_type"), + do("std"), False, marks=xfail(reason="Missing implementation for np.nanstd") + ), + (do("sum"), False), + param( + do("var"), False, marks=xfail(reason="Missing implementation for np.nanvar") ), - param(do("var"), False, marks=xfail(reason="Coercion to dense via bottleneck")), param( do("where", make_xrarray({"x": 10, "y": 5}) > 0.5), False, @@ -558,7 +508,7 @@ def test_dataarray_method(func, sparse_output): ret_d = func(arr_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -582,7 +532,7 @@ def test_datarray_1d_method(func, sparse_output): ret_d = func(arr_d) if sparse_output: - assert isinstance(ret_s.data, SparseArray) + assert isinstance(ret_s.data, sparse.SparseArray) assert np.allclose(ret_s.data.todense(), ret_d.data, equal_nan=True) else: assert np.allclose(ret_s, ret_d, equal_nan=True) @@ -600,17 +550,20 @@ def setUp(self): self.ds_ar, coords={"x": range(4)}, dims=("x", "y"), name="foo" ) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_to_dataset_roundtrip(self): x = self.sp_xr assert_equal(x, x.to_dataset("x").to_array("x")) def test_align(self): a1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "c", "d"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "c", "d"]}, 
) b1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "d", "e"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "d", "e"]}, ) a2, b2 = xr.align(a1, b1, join="inner") assert isinstance(a2.data, sparse.SparseArray) @@ -647,33 +600,35 @@ def test_align_2d(self): assert np.all(B1.coords["x"] == B2.coords["x"]) assert np.all(B1.coords["y"] == B2.coords["y"]) - @pytest.mark.xfail(reason="fill value leads to sparse-dense operation") def test_align_outer(self): a1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "c", "d"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "c", "d"]}, ) b1 = xr.DataArray( - COO.from_numpy(np.arange(4)), dims=["x"], coords={"x": ["a", "b", "d", "e"]} + sparse.COO.from_numpy(np.arange(4)), + dims=["x"], + coords={"x": ["a", "b", "d", "e"]}, ) a2, b2 = xr.align(a1, b1, join="outer") assert isinstance(a2.data, sparse.SparseArray) assert isinstance(b2.data, sparse.SparseArray) - assert np.all(a2.coords["x"].data == ["a", "b", "c", "d"]) - assert np.all(b2.coords["x"].data == ["a", "b", "c", "d"]) + assert np.all(a2.coords["x"].data == ["a", "b", "c", "d", "e"]) + assert np.all(b2.coords["x"].data == ["a", "b", "c", "d", "e"]) - @pytest.mark.xfail(reason="Missing implementation for np.result_type") def test_concat(self): ds1 = xr.Dataset(data_vars={"d": self.sp_xr}) ds2 = xr.Dataset(data_vars={"d": self.sp_xr}) ds3 = xr.Dataset(data_vars={"d": self.sp_xr}) out = xr.concat([ds1, ds2, ds3], dim="x") - assert_sparse_eq( + assert_sparse_equal( out["d"].data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=0), ) out = xr.concat([self.sp_xr, self.sp_xr, self.sp_xr], dim="y") - assert_sparse_eq( + assert_sparse_equal( out.data, sparse.concatenate([self.sp_ar, self.sp_ar, self.sp_ar], axis=1) ) @@ -692,15 +647,16 @@ def test_stack(self): roundtripped = stacked.unstack() assert arr.identical(roundtripped) + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") def test_ufuncs(self): x = self.sp_xr assert_equal(np.sin(x), xu.sin(x)) def test_dataarray_repr(self): a = xr.DataArray( - COO.from_numpy(np.ones(4)), + sparse.COO.from_numpy(np.ones(4)), dims=["x"], - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) expected = dedent( """\ @@ -714,8 +670,8 @@ def test_dataarray_repr(self): def test_dataset_repr(self): ds = xr.Dataset( - data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}, - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) expected = dedent( """\ @@ -731,7 +687,9 @@ def test_dataset_repr(self): def test_sparse_dask_dataset_repr(self): pytest.importorskip("dask", minversion="2.0") - ds = xr.Dataset(data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}).chunk() + ds = xr.Dataset( + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))} + ).chunk() expected = dedent( """\ @@ -744,17 +702,17 @@ def test_sparse_dask_dataset_repr(self): def test_dataarray_pickle(self): a1 = xr.DataArray( - COO.from_numpy(np.ones(4)), + sparse.COO.from_numpy(np.ones(4)), dims=["x"], - coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) a2 = pickle.loads(pickle.dumps(a1)) assert_identical(a1, a2) def test_dataset_pickle(self): ds1 = xr.Dataset( - data_vars={"a": ("x", COO.from_numpy(np.ones(4)))}, - 
coords={"y": ("x", COO.from_numpy(np.arange(4)))}, + data_vars={"a": ("x", sparse.COO.from_numpy(np.ones(4)))}, + coords={"y": ("x", sparse.COO.from_numpy(np.arange(4)))}, ) ds2 = pickle.loads(pickle.dumps(ds1)) assert_identical(ds1, ds2) @@ -814,8 +772,8 @@ def test_groupby_first(self): def test_groupby_bins(self): x1 = self.ds_xr x2 = self.sp_xr - m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum() - m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum() + m1 = x1.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) + m2 = x2.groupby_bins("x", bins=[0, 3, 7, 10]).sum(xr.ALL_DIMS) assert isinstance(m2.data, sparse.SparseArray) assert np.allclose(m1.data, m2.data.todense()) @@ -829,7 +787,7 @@ def test_resample(self): dims="time", ) t2 = t1.copy() - t2.data = COO(t2.data) + t2.data = sparse.COO(t2.data) m1 = t1.resample(time="QS-DEC").mean() m2 = t2.resample(time="QS-DEC").mean() assert isinstance(m2.data, sparse.SparseArray) @@ -860,7 +818,7 @@ def test_where(self): cond = a > 3 xr.DataArray(a).where(cond) - s = COO.from_numpy(a) + s = sparse.COO.from_numpy(a) cond = s > 3 xr.DataArray(s).where(cond) @@ -873,9 +831,9 @@ class TestSparseCoords: @pytest.mark.xfail(reason="Coercion of coords to dense") def test_sparse_coords(self): xr.DataArray( - COO.from_numpy(np.arange(4)), + sparse.COO.from_numpy(np.arange(4)), dims=["x"], - coords={"x": COO.from_numpy([1, 2, 3, 4])}, + coords={"x": sparse.COO.from_numpy([1, 2, 3, 4])}, ) diff --git a/xarray/util/print_versions.py b/xarray/util/print_versions.py index 85bb9db8360..4ba327913bc 100755 --- a/xarray/util/print_versions.py +++ b/xarray/util/print_versions.py @@ -1,5 +1,4 @@ """Utility functions for printing version information.""" -import codecs import importlib import locale import os