Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't set encoding attributes on bounds variables. #2965

Merged
merged 18 commits into from
Jun 25, 2019
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ Enhancements
Bug fixes
~~~~~~~~~

- Don't set encoding attributes on bounds variables when writing to netCDF.
:py:meth:`xr.open_mfdataset` sets variable encodings to that of variables
in first file.(:issue:`2436`, :issue:`2921`)
By `Deepak Cherian <https://github.com/dcherian>`_.
- NetCDF4 output: variables with unlimited dimensions must be chunked (not
contiguous) on output. (:issue:`1849`)
By `James McCreight <https://github.com/jmccreight>`_.
Expand Down
3 changes: 3 additions & 0 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

combined._file_obj = _MultiFileCloser(file_objs)
combined.attrs = datasets[0].attrs
for v in combined.variables:
if v in datasets[0]:
combined[v].encoding = datasets[0][v].encoding
dcherian marked this conversation as resolved.
Show resolved Hide resolved
return combined


Expand Down
85 changes: 75 additions & 10 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .coding import strings, times, variables
from .coding.variables import SerializationWarning
from .core import duck_array_ops, indexing
from .core.common import contains_cftime_datetimes
from .core.pycompat import dask_array_type
from .core.variable import IndexVariable, Variable, as_variable

Expand Down Expand Up @@ -354,6 +355,51 @@ def _update_bounds_attributes(variables):
bounds_attrs.setdefault('calendar', attrs['calendar'])


def _update_bounds_encoding(variables):
"""Adds time encoding to time bounds variables.

Variables handling time bounds ("Cell boundaries" in the CF
conventions) do not necessarily carry the necessary attributes to be
decoded. This copies the encoding from the time variable to the
associated bounds variable so that we write CF-compliant files.

See Also:

http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/
cf-conventions.html#cell-boundaries

https://github.com/pydata/xarray/issues/2565
"""

# For all time variables with bounds
for v in variables.values():
attrs = v.attrs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A general question -- would we consider 'bounds' to be an encoding parameter (like 'units' or 'calendar')? In other words should we expect it to be in the encoding dictionary or attrs dictionary at this stage? I feel like it may be more intuitive as part of encoding, but currently I know that we don't treat it that way when decoding files.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my mental model, encoding attributes are those that control on-disk representation of the data. bounds counts as an attr to my mind since it's an attribute that links the variable to another variable.

A definition or list of what goes in encoding and what goes in attrs would make a good addition to the docs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my mental model, encoding attributes are those that control on-disk representation of the data.

I think this is fair; I guess I was going off of the mental model of encoding parameters defined as "attributes that are potentially required for decoding all the variables in a file," in which case 'bounds' could qualify. I think your definition is probably cleaner, because it requires that encoding parameters control how the variable they are attached to is represented on disk (as opposed to another variable).

encoding = v.encoding
has_date_units = 'units' in encoding and 'since' in encoding['units']
is_datetime_type = (np.issubdtype(v.dtype, np.datetime64) or
contains_cftime_datetimes(v))

if (is_datetime_type and not has_date_units and
'bounds' in attrs and attrs['bounds'] in variables):
warnings.warn("Variable '{0}' has datetime type and a "
"bounds variable but {0}.encoding does not have "
"units specified. The units encodings for '{0}' "
"and '{1}' will be determined independently "
"and may not be equal, counter to CF-conventions. "
"If this is a concern, specify a units encoding for "
"'{0}' before writing to a file."
.format(v.name, attrs['bounds']),
UserWarning)

if has_date_units and 'bounds' in attrs:
if attrs['bounds'] in variables:
bounds_encoding = variables[attrs['bounds']].encoding
bounds_encoding.setdefault('units', encoding['units'])
if 'calendar' in encoding:
bounds_encoding.setdefault('calendar',
encoding['calendar'])


def decode_cf_variables(variables, attributes, concat_characters=True,
mask_and_scale=True, decode_times=True,
decode_coords=True, drop_variables=None,
Expand Down Expand Up @@ -491,8 +537,6 @@ def cf_decoder(variables, attributes,
"""
Decode a set of CF encoded variables and attributes.

See Also, decode_cf_variable

Parameters
----------
variables : dict
Expand All @@ -514,6 +558,10 @@ def cf_decoder(variables, attributes,
A dictionary mapping from variable name to xarray.Variable objects.
decoded_attributes : dict
A dictionary mapping from attribute name to values.

See also
--------
decode_cf_variable
"""
variables, attributes, _ = decode_cf_variables(
variables, attributes, concat_characters, mask_and_scale, decode_times)
Expand Down Expand Up @@ -594,14 +642,12 @@ def encode_dataset_coordinates(dataset):

def cf_encoder(variables, attributes):
"""
A function which takes a dicts of variables and attributes
and encodes them to conform to CF conventions as much
as possible. This includes masking, scaling, character
array handling, and CF-time encoding.

Decode a set of CF encoded variables and attributes.
Encode a set of CF encoded variables and attributes.
Takes a dicts of variables and attributes and encodes them
to conform to CF conventions as much as possible.
This includes masking, scaling, character array handling,
and CF-time encoding.

See Also, decode_cf_variable

Parameters
----------
Expand All @@ -617,8 +663,27 @@ def cf_encoder(variables, attributes):
encoded_attributes : dict
A dictionary mapping from attribute name to value

See also: encode_cf_variable
See also
--------
decode_cf_variable, encode_cf_variable
"""

# add encoding for time bounds variables if present.
_update_bounds_encoding(variables)

new_vars = OrderedDict((k, encode_cf_variable(v, name=k))
for k, v in variables.items())

# Remove attrs from bounds variables (issue #2921)
for var in new_vars.values():
bounds = var.attrs['bounds'] if 'bounds' in var.attrs else None
if bounds and bounds in new_vars:
# see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries # noqa
for attr in ['units', 'standard_name', 'axis', 'positive',
'calendar', 'long_name', 'leap_month', 'leap_year',
'month_lengths']:
if attr in new_vars[bounds].attrs and attr in var.attrs:
if new_vars[bounds].attrs[attr] == var.attrs[attr]:
new_vars[bounds].attrs.pop(attr)

Copy link
Collaborator

@mathause mathause May 17, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we issue a warning here?

else:
    warning.warn("The attribute 'units' is not the same in the variable "
"'time' and it's associated bounds 'time_bnds',"
" which is not cf-compliant.")

or some such

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to? xarray allows writing CF-non-compliant files anyway...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the warning you added is the perfect approach. It will still be issued in the case of @mathause's example, but will still allow a user to write a non-CF-compliant file without a warning if the encoding attributes do not need to be computed on the fly.

return new_vars, attributes
20 changes: 20 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2486,6 +2486,26 @@ def test_attrs_mfdataset(self):
'no attribute'):
actual.test2

def test_encoding_mfdataset(self):
original = Dataset({'foo': ('t', np.random.randn(10)),
't': ('t', pd.date_range(start='2010-01-01',
periods=10,
freq='1D'))})
original.t.encoding['units'] = 'days since 2010-01-01'

with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
ds1 = original.isel(t=slice(5))
ds2 = original.isel(t=slice(5, 10))
ds1.t.encoding['units'] = 'days since 2010-01-01'
ds2.t.encoding['units'] = 'days since 2000-01-01'
ds1.to_netcdf(tmp1)
ds2.to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2]) as actual:
assert actual.t.encoding['units'] == original.t.encoding['units'] # noqa
assert actual.t.encoding['units'] == ds1.t.encoding['units'] # noqa
assert actual.t.encoding['units'] != ds2.t.encoding['units'] # noqa

def test_preprocess_mfdataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp:
Expand Down
49 changes: 47 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import pandas as pd
import pytest

from xarray import DataArray, Variable, coding, decode_cf
from xarray import DataArray, Dataset, Variable, coding, decode_cf
from xarray.coding.times import (
_import_cftime, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime)
from xarray.coding.variables import SerializationWarning
from xarray.conventions import _update_bounds_attributes
from xarray.conventions import _update_bounds_attributes, cf_encoder
from xarray.core.common import contains_cftime_datetimes
from xarray.testing import assert_equal

Expand Down Expand Up @@ -671,6 +671,51 @@ def test_decode_cf_time_bounds():
_update_bounds_attributes(ds.variables)


@requires_cftime_or_netCDF4
def test_encode_time_bounds():
dcherian marked this conversation as resolved.
Show resolved Hide resolved

time = pd.date_range('2000-01-16', periods=1)
time_bounds = pd.date_range('2000-01-01', periods=2, freq='MS')
ds = Dataset(dict(time=time, time_bounds=time_bounds))
ds.time.attrs = {'bounds': 'time_bounds'}
ds.time.encoding = {'calendar': 'noleap',
'units': 'days since 2000-01-01'}

expected = dict()
# expected['time'] = Variable(data=np.array([15]), dims=['time'])
expected['time_bounds'] = Variable(data=np.array([0, 31]),
dims=['time_bounds'])

encoded, _ = cf_encoder(ds.variables, ds.attrs)
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert 'units' not in encoded['time_bounds'].attrs

# if time_bounds attrs are same as time attrs, it doesn't matter
ds.time_bounds.encoding = {'calendar': 'noleap',
'units': 'days since 2000-01-01'}
encoded, _ = cf_encoder({k: ds[k] for k in ds.variables},
ds.attrs)
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert 'units' not in encoded['time_bounds'].attrs

# for CF-noncompliant case of time_bounds attrs being different from
# time attrs; preserve them for faithful roundtrip
ds.time_bounds.encoding = {'calendar': 'noleap',
'units': 'days since 1849-01-01'}
encoded, _ = cf_encoder({k: ds[k] for k in ds.variables},
ds.attrs)
with pytest.raises(AssertionError):
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert encoded['time_bounds'].attrs['units'] == ds.time_bounds.encoding['units'] # noqa

ds.time.encoding = {}
with pytest.warns(UserWarning):
cf_encoder(ds.variables, ds.attrs)


@pytest.fixture(params=_ALL_CALENDARS)
def calendar(request):
return request.param
Expand Down