Skip to content

Commit

Permalink
Combine CF Unsigned and Mask handling
Browse files Browse the repository at this point in the history
  • Loading branch information
djhoese committed Jul 24, 2024
1 parent e1baa93 commit cae77aa
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 116 deletions.
224 changes: 117 additions & 107 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,18 @@ def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

dtype = np.dtype(encoding.get("dtype", data.dtype))
# from netCDF best practices
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
# "_Unsigned = "true" to indicate that
# integer data should be treated as unsigned"
is_unsigned = encoding.get("_Unsigned", "false") == "true"
# only used for _Unsigned cases
signed_dtype = np.dtype(
encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
)
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")
# to properly handle _FillValue/missing_value below [a], [b]
# we need to check if unsigned data is written as signed data
unsigned = encoding.get("_Unsigned") is not None
fill_value = None

fv_exists = fv is not None
mv_exists = mv is not None
Expand All @@ -324,23 +331,28 @@ def encode(self, variable: Variable, name: T_Name = None):

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
# [a] need to skip this if _Unsigned is available
if not unsigned:
encoding["_FillValue"] = dtype.type(fv)
encoding["_FillValue"] = (
self._encode_unsigned_fill_value(name, fv, signed_dtype)
if is_unsigned
else dtype.type(fv)
)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)

if mv_exists:
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
# [b] need to provide mv verbatim if _Unsigned is available
encoding["missing_value"] = attrs.get(
"_FillValue",
(dtype.type(mv) if not unsigned else mv),
(
self._encode_unsigned_fill_value(name, fv, signed_dtype)
if is_unsigned
else dtype.type(mv)
),
)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)

# apply fillna
if not pd.isnull(fill_value):
if fill_value is not None and not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
Expand All @@ -349,46 +361,112 @@ def encode(self, variable: Variable, name: T_Name = None):
else:
data = duck_array_ops.fillna(data, fill_value)

if fill_value is not None and is_unsigned:
pop_to(encoding, attrs, "_Unsigned")
# XXX: Is this actually needed? Doesn't the backend handle this?
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
attrs["_FillValue"] = fill_value

return Variable(dims, data, attrs, encoding, fastpath=True)

def _encode_unsigned_fill_value(
    self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike
) -> Any:
    """Express ``fill_value`` in the signed integer dtype used on disk.

    Parameters
    ----------
    name
        Variable name; only used in the warning message.
    fill_value
        The ``_FillValue`` supplied by the user. It may already be the signed
        on-disk value, or it may be the in-memory unsigned value.
    signed_dtype
        The signed on-disk dtype. Anything accepted by ``np.dtype`` works
        (an ``np.dtype`` instance, a string such as ``"i1"``, a scalar type).

    Returns
    -------
    The fill value as a scalar of ``signed_dtype`` when the direct cast
    succeeds, otherwise a Python ``int`` holding the reinterpreted bits.

    Warns
    -----
    SerializationWarning
        When the direct cast raises ``OverflowError`` and the value has to be
        reinterpreted from the unsigned dtype of the same width.
    """
    # The annotation admits any dtype-like, but ``.type`` and ``.itemsize``
    # below only exist on ``np.dtype`` instances, so normalize up front.
    on_disk_dtype = np.dtype(signed_dtype)
    try:
        # user provided the on-disk signed fill
        if hasattr(fill_value, "item"):
            # if numpy type, convert to python native integer to determine overflow
            # otherwise numpy unsigned ints will silently cast to the signed counterpart
            fill_value = fill_value.item()
        new_fill = on_disk_dtype.type(fill_value)
    except OverflowError:
        warnings.warn(
            f"variable {name!r} will be stored as signed integers "
            f"but _FillValue attribute can't be represented as a "
            f"signed integer.",
            SerializationWarning,
            stacklevel=3,
        )
        # user provided the in-memory unsigned fill, convert to signed type
        unsigned_dtype = np.dtype(f"u{on_disk_dtype.itemsize}")
        # use view here to prevent OverflowError
        new_fill = (
            np.array(fill_value, dtype=unsigned_dtype).view(on_disk_dtype).item()
        )
    return new_fill

def decode(self, variable: Variable, name: T_Name = None):
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
return variable

if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)
dims, data, attrs, encoding = unpack_for_decoding(variable)

# dims, data, attrs, encoding = unpack_for_decoding(variable)
# Even if _Unsigned is used, retain the on-disk _FillValue
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
if "_Unsigned" in attrs:
unsigned = pop_to(attrs, encoding, "_Unsigned")

if data.dtype.kind == "i":
if unsigned == "true":
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
transform = partial(np.asarray, dtype=unsigned_dtype)
if "_FillValue" in raw_fill_dict:
new_fill = np.array(
raw_fill_dict["_FillValue"], dtype=data.dtype
)
encoded_fill_values.remove(raw_fill_dict["_FillValue"])
# use view here to prevent OverflowError
encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
data = lazy_elemwise_func(data, transform, unsigned_dtype)
elif data.dtype.kind == "u":
if unsigned == "false":
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
transform = partial(np.asarray, dtype=signed_dtype)
data = lazy_elemwise_func(data, transform, signed_dtype)
if "_FillValue" in raw_fill_dict:
new_fill = signed_dtype.type(raw_fill_dict["_FillValue"])
encoded_fill_values.remove(raw_fill_dict["_FillValue"])
encoded_fill_values.add(new_fill)
else:
warnings.warn(
f"variable {name!r} has _Unsigned attribute but is not "
"of integer type. Ignoring attribute.",
SerializationWarning,
stacklevel=3,
)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)


def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
Expand Down Expand Up @@ -506,74 +584,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return variable


class UnsignedIntegerCoder(VariableCoder):
    """Coder for the netCDF ``_Unsigned`` attribute convention.

    Encoding stores data flagged ``_Unsigned = "true"`` as signed integers;
    decoding reinterprets integer data according to the ``_Unsigned``
    attribute found on the variable.
    """

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast data (and ``_FillValue``) to the signed on-disk dtype.

        The variable is returned untouched unless its encoding carries
        ``_Unsigned == "true"``.
        """
        # netCDF best practices:
        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
        # `_Unsigned = "true"` marks integer data that should be treated as
        # unsigned.
        if variable.encoding.get("_Unsigned", "false") != "true":
            return variable

        dims, data, attrs, encoding = unpack_for_encoding(variable)
        pop_to(encoding, attrs, "_Unsigned")

        # On-disk dtype: taken from the encoding when given, otherwise a
        # signed integer with the same item size as the in-memory data.
        fallback_spec = f"i{data.dtype.itemsize}"
        signed_dtype = np.dtype(encoding.get("dtype", fallback_spec))

        if "_FillValue" in attrs:
            given_fill = attrs["_FillValue"]
            try:
                # the fill was supplied as the signed on-disk value
                converted = signed_dtype.type(given_fill)
            except OverflowError:
                # the fill was supplied as the in-memory unsigned value;
                # reinterpret its bits through a view, which cannot overflow
                same_width_unsigned = np.dtype(f"u{signed_dtype.itemsize}")
                as_unsigned = np.array(given_fill, dtype=same_width_unsigned)
                converted = as_unsigned.view(signed_dtype).item()
            attrs["_FillValue"] = converted

        rounded = duck_array_ops.around(data)
        data = duck_array_ops.astype(rounded, signed_dtype)
        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Reinterpret integer data according to its ``_Unsigned`` attribute.

        The variable is returned untouched when it has no ``_Unsigned``
        attribute. Non-integer data only triggers a warning.
        """
        if "_Unsigned" not in variable.attrs:
            return variable

        dims, data, attrs, encoding = unpack_for_decoding(variable)
        flag = pop_to(attrs, encoding, "_Unsigned")
        stored_dtype = data.dtype
        kind = stored_dtype.kind

        if kind not in ("i", "u"):
            warnings.warn(
                f"variable {name!r} has _Unsigned attribute but is not "
                "of integer type. Ignoring attribute.",
                SerializationWarning,
                stacklevel=3,
            )
        elif kind == "i" and flag == "true":
            # signed on disk, but to be presented as unsigned
            target = np.dtype(f"u{stored_dtype.itemsize}")
            if "_FillValue" in attrs:
                stored_fill = np.array(attrs["_FillValue"], dtype=stored_dtype)
                # a view reinterprets the bits and so avoids OverflowError
                attrs["_FillValue"] = stored_fill.view(target).item()
            data = lazy_elemwise_func(
                data, partial(np.asarray, dtype=target), target
            )
        elif kind == "u" and flag == "false":
            # unsigned on disk, but to be presented as signed
            target = np.dtype(f"i{stored_dtype.itemsize}")
            data = lazy_elemwise_func(
                data, partial(np.asarray, dtype=target), target
            )
            if "_FillValue" in attrs:
                attrs["_FillValue"] = target.type(attrs["_FillValue"])

        return Variable(dims, data, attrs, encoding, fastpath=True)


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

Expand Down
2 changes: 0 additions & 2 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ def encode_cf_variable(
times.CFTimedeltaCoder(),
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NativeEnumCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
Expand Down Expand Up @@ -279,7 +278,6 @@ def decode_cf_variable(

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
variables.CFMaskCoder(),
variables.CFScaleOffsetCoder(),
]:
Expand Down
Loading

0 comments on commit cae77aa

Please sign in to comment.