From ba9f53d81c49bb725c23f5e3779421c11b733667 Mon Sep 17 00:00:00 2001 From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> Date: Wed, 9 Nov 2022 14:31:09 +0100 Subject: [PATCH 1/6] Update Hazard.from_raster_xarray * Repeat single-valued coordinates/attributes for the number of events loaded. * Transform dates into ordinals instead of directly loading ordinals. --- climada/hazard/base.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index d2a7812aa..8a0cad8af 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -468,13 +468,14 @@ def from_raster_xarray( data_vars : dict(str, str), optional Mapping from default variable names to variable names used in the data to read. The default names are ``fraction``, ``hazard_type``, ``frequency``, - ``event_name``, and ``event_id``. If these values are not set, the method - tries to load data from the default names. If this fails, the method uses - default values for each entry. If the values are set to empty strings + ``event_name``, ``event_id``, and ``date``. If these values are not set, the + method tries to load data from the default names. If this fails, the method + uses default values for each entry. If the values are set to empty strings (``""``), no data is loaded and the default values are used exclusively. See examples for details. Default values are: + * ``date``: The ``event`` coordinate interpreted as date * ``fraction``: 1.0 where intensity is not zero, else zero * ``hazard_type``: Empty string * ``frequency``: 1.0 for every event @@ -619,7 +620,7 @@ def from_raster_xarray( # If the data is a string, open the respective file if not isinstance(data, xr.Dataset): LOGGER.info("Loading Hazard from file: %s", data) - data = xr.open_dataset(data) + data: xr.Dataset = xr.open_dataset(data) else: LOGGER.info("Loading Hazard from xarray Dataset") @@ -708,6 +709,21 @@ def strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray: raise ValueError(f"'{array.name}' data must be larger than zero") return array.values + def date_to_ordinal_accessor(array: xr.DataArray) -> np.ndarray: + """Take a DataArray and transform it into ordinals""" + if np.issubdtype(array.dtype, np.integer): + # Assume that data is ordinals + return array.values + + # Try transforming to ordinals + return np.array(u_dt.datetime64_to_ordinal(array.values)) + + def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: + if values.size == 1: + return np.array(list(itertools.repeat(values.flat[0], times))) + + return values + # Create a DataFrame storing access information for each of data_vars # NOTE: Each row will be passed as arguments to # `load_from_xarray_or_return_default`, see its docstring for further @@ -738,8 +754,8 @@ def strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray: lambda x: to_csr_matrix(default_accessor(x)), default_accessor, strict_positive_int_accessor, - lambda x: list(default_accessor(x).flat), # list, not np.array - strict_positive_int_accessor, + lambda x: list(maybe_repeat(default_accessor(x), num_events).flat), + lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events), ], ) ) From 892031a48ba9b96e40773498717880a8ab8dbb90 Mon Sep 17 00:00:00 2001 From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> Date: Wed, 23 Nov 2022 16:30:34 +0100 Subject: [PATCH 2/6] Update Hazard.from_raster_xarray * Optionally repeat 'frequency' data. 
* Set default 'fraction' to None, complying with the new __init__. * Add tests for repeating single-valued data. * Modify tests to check for correct fraction initialization. --- climada/hazard/base.py | 27 +++++++----- climada/hazard/test/test_base_xarray.py | 55 +++++++++++++++++++------ 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index 0343499e5..dc16e4cdb 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -411,7 +411,7 @@ def set_vector(self, *args, **kwargs): @classmethod def from_raster_xarray( cls, - data: Union[xr.Dataset, str], + data: Union[xr.Dataset, str, pathlib.Path], hazard_type: str, intensity_unit: str, *, @@ -448,6 +448,8 @@ def from_raster_xarray( three coordinates does not exist, use ``Dataset.expand_dims`` (see https://docs.xarray.dev/en/stable/generated/xarray.Dataset.expand_dims.html and Examples) before loading the Dataset as Hazard. + * Single-valued data for variables ``frequency``. ``event_name``, and + ``event_date`` will be broadcast to every event. * To avoid confusion in the call signature, several parameters are keyword-only arguments. * The attributes ``Hazard.tag.haz_type`` and ``Hazard.unit`` currently cannot be @@ -484,7 +486,8 @@ def from_raster_xarray( Default values are: * ``date``: The ``event`` coordinate interpreted as date - * ``fraction``: 1.0 where intensity is not zero, else zero + * ``fraction``: ``None``, which results in a value of 1.0 everywhere, see the + :meth:`Hazard.__init__` for details. * ``hazard_type``: Empty string * ``frequency``: 1.0 for every event * ``event_name``: String representation of the event time @@ -504,6 +507,7 @@ def from_raster_xarray( -------- The use of this method is straightforward if the Dataset contains the data with expected names. + >>> dset = xr.Dataset( >>> dict( >>> intensity=( @@ -520,6 +524,7 @@ def from_raster_xarray( >>> hazard = Hazard.from_raster_xarray(dset, "", "") For non-default coordinate names, use the ``coordinate_vars`` argument. + >>> dset = xr.Dataset( >>> dict( >>> intensity=( @@ -539,6 +544,7 @@ def from_raster_xarray( Coordinates can be different from the actual dataset dimensions. The following loads the data with coordinates ``longitude`` and ``latitude`` (default names): + >>> dset = xr.Dataset( >>> dict(intensity=(["time", "y", "x"], [[[0, 1, 2], [3, 4, 5]]])), >>> dict( @@ -554,6 +560,7 @@ def from_raster_xarray( Optional data is read from the dataset if the default keys are found. Users can specify custom variables in the data, or that the default keys should be ignored, with the ``data_vars`` argument. + >>> dset = xr.Dataset( >>> dict( >>> intensity=( @@ -593,6 +600,7 @@ def from_raster_xarray( If your read single-event data your dataset probably will not have a time dimension. 
As long as a time *coordinate* exists, however, this method will automatically promote it to a dataset dimension and load the data: + >>> dset = xr.Dataset( >>> dict( >>> intensity=( @@ -610,6 +618,7 @@ def from_raster_xarray( If one coordinate is missing altogehter, you must add it or expand the dimensions before loading the dataset: + >>> dset = xr.Dataset( >>> dict( >>> intensity=( @@ -720,7 +729,7 @@ def date_to_ordinal_accessor(array: xr.DataArray) -> np.ndarray: """Take a DataArray and transform it into ordinals""" if np.issubdtype(array.dtype, np.integer): # Assume that data is ordinals - return array.values + return strict_positive_int_accessor(array) # Try transforming to ordinals return np.array(u_dt.datetime64_to_ordinal(array.values)) @@ -746,11 +755,7 @@ def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: user_key=None, # The default value for each attribute default_value=[ - to_csr_matrix( - xr.apply_ufunc( - lambda x: np.where(x != 0, 1, 0), data[intensity] - ).values - ), + None, np.ones(num_events), np.array(range(num_events), dtype=int) + 1, list(data[coords["event"]].values), @@ -759,7 +764,7 @@ def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: # The accessor for the data in the Dataset accessor=[ lambda x: to_csr_matrix(default_accessor(x)), - default_accessor, + lambda x: maybe_repeat(default_accessor(x), num_events), strict_positive_int_accessor, lambda x: list(maybe_repeat(default_accessor(x), num_events).flat), lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events), @@ -866,7 +871,9 @@ def vshape(array): return array.shape # Check size for read data - if not np.array_equal(vshape(val), vshape(default_value)): + if default_value is not None and ( + not np.array_equal(vshape(val), vshape(default_value)) + ): raise RuntimeError( f"'{user_key if user_key else default_key}' must have shape " f"{vshape(default_value)}, but shape is {vshape(val)}" diff --git a/climada/hazard/test/test_base_xarray.py b/climada/hazard/test/test_base_xarray.py index 7aa3610a2..348a4860a 100644 --- a/climada/hazard/test/test_base_xarray.py +++ b/climada/hazard/test/test_base_xarray.py @@ -19,7 +19,6 @@ Test xarray reading capabilities of Hazard base class. 
""" -import os import unittest import datetime as dt import numpy as np @@ -86,8 +85,10 @@ def _assert_default_values(self, hazard): ) # Fraction default + self.assertEqual(hazard.fraction.nnz, 0) + np.testing.assert_array_equal(hazard.fraction.shape, hazard.intensity.shape) np.testing.assert_array_equal( - hazard.fraction.toarray(), [[0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]] + hazard.fraction.toarray(), np.zeros_like(hazard.intensity.toarray()) ) def _assert_default_types(self, hazard): @@ -207,16 +208,43 @@ def test_data_vars(self): ) # Integer data assertions - for key in ("event_id", "date"): - dset = dataset.copy(deep=True) - dset[key] = np.array(range(size), dtype=np.float64) + 3.5 - with self.assertRaises(TypeError) as cm: - Hazard.from_raster_xarray(dset, "", "") - self.assertIn(f"'{key}' data array must be integers", str(cm.exception)) - dset[key] = np.linspace(0, 10, size, dtype=np.int64) - with self.assertRaises(ValueError) as cm: - Hazard.from_raster_xarray(dset, "", "") - self.assertIn(f"'{key}' data must be larger than zero", str(cm.exception)) + dset = dataset.copy(deep=True) + dset["event_id"] = np.array(range(size), dtype=np.float64) + 3.5 + with self.assertRaises(TypeError) as cm: + Hazard.from_raster_xarray(dset, "", "") + self.assertIn("'event_id' data array must be integers", str(cm.exception)) + dset["event_id"] = np.linspace(0, 10, size, dtype=np.int64) + with self.assertRaises(ValueError) as cm: + Hazard.from_raster_xarray(dset, "", "") + self.assertIn("'event_id' data must be larger than zero", str(cm.exception)) + + # Date as datetime + date_str = [f"2000-01-{i:02}" for i in range(1, size + 1)] + dataset["date"] = date_str + hazard = Hazard.from_raster_xarray(dataset, "", "") + np.testing.assert_array_equal( + hazard.date, + [dt.datetime(2000, 1, i).toordinal() for i in range(1, size + 1)], + ) + + def test_data_vars_repeat(self): + """Test if suitable data vars are repeated as expected""" + dataset = xr.open_dataset(self.netcdf_path) + size = dataset.sizes["time"] + + # Set optionals in the dataset + frequency = [1.5] + event_name = ["bla"] + date = 1 + dataset["event_name"] = event_name + dataset["date"] = date + dataset["frequency"] = frequency + + # Check if single-valued arrays are repeated + hazard = Hazard.from_raster_xarray(dataset, "", "") + np.testing.assert_array_equal(hazard.date, [date] * size) + np.testing.assert_array_equal(hazard.event_name, event_name * size) + np.testing.assert_array_equal(hazard.frequency, frequency * size) def test_nan(self): """Check handling of NaNs in original data""" @@ -282,7 +310,8 @@ def test_missing_dims(self): np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS) np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]) - np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 1, 1, 1, 1, 1]]) + self.assertEqual(hazard.fraction.nnz, 0) + np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0]]) # Now drop variable altogether, should raise an error ds = ds.drop_vars("time") From 8fb116a1ccb66b048e25d621d9731a34281777f3 Mon Sep 17 00:00:00 2001 From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> Date: Tue, 20 Dec 2022 17:12:55 +0100 Subject: [PATCH 3/6] Add `rechunk` parameter to `Hazard.from_raster_xarray` Update docstrings and tests. 
--- climada/hazard/base.py | 29 +++++++++++++++-- climada/hazard/test/test_base_xarray.py | 41 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index c0806df9c..c6ff4108e 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -419,6 +419,7 @@ def from_raster_xarray( coordinate_vars: Optional[Dict[str, str]] = None, data_vars: Optional[Dict[str, str]] = None, crs: str = DEF_CRS, + rechunk: bool = False, ): """Read raster-like data from an xarray Dataset or a raster data file @@ -485,9 +486,10 @@ def from_raster_xarray( examples for details. Default values are: + * ``date``: The ``event`` coordinate interpreted as date - * ``fraction``: ``None``, which results in a value of 1.0 everywhere, see the - :meth:`Hazard.__init__` for details. + * ``fraction``: ``None``, which results in a value of 1.0 everywhere, see + :py:meth:`Hazard.__init__` for details. * ``hazard_type``: Empty string * ``frequency``: 1.0 for every event * ``event_name``: String representation of the event time @@ -497,6 +499,14 @@ def from_raster_xarray( to ``EPSG:4326`` (WGS 84), defined by ``climada.util.constants.DEF_CRS``. See https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input for further information on how to specify the coordinate system. + rechunk : bool, optional + Rechunk the dataset before flattening. This might have serious performance + implications. Rechunking in general is expensive, but it might be less + expensive than stacking a poorly-chunked array. One event being stored in + one chunk would be the optimal configuration. If ``rechunk=True``, this will + be forced by rechunking the data. Ideally, you would select the chunks in + that manner when opening the dataset before passing it to this function. + Defaults to ``False``. Returns ------- @@ -678,6 +688,16 @@ def from_raster_xarray( data = data.expand_dims(coord) dims[key] = data[coord].dims + # Try to rechunk the data to optimize the stack operation afterwards. + if rechunk: + # We want one event to be contained in one chunk + chunks = {dim: -1 for dim in dims["longitude"]} + chunks.update({dim: -1 for dim in dims["latitude"]}) + + # Chunks can be auto-sized along the event dimensions + chunks.update({dim: "auto" for dim in dims["event"]}) + data = data.chunk(chunks=chunks) + # Stack (vectorize) the entire dataset into 2D (time, lat/lon) # NOTE: We want the set union of the dimensions, but Python 'set' does not # preserve order. However, we want longitude to run faster than latitude. @@ -735,6 +755,11 @@ def date_to_ordinal_accessor(array: xr.DataArray) -> np.ndarray: return np.array(u_dt.datetime64_to_ordinal(array.values)) def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: + """Return the array or repeat a single-valued array + + If ``values`` has size 1, return an array that repeats this value ``times`` + times. If the size is different, just return the array. 
+ """ if values.size == 1: return np.array(list(itertools.repeat(values.flat[0], times))) diff --git a/climada/hazard/test/test_base_xarray.py b/climada/hazard/test/test_base_xarray.py index 348a4860a..f58127434 100644 --- a/climada/hazard/test/test_base_xarray.py +++ b/climada/hazard/test/test_base_xarray.py @@ -20,6 +20,7 @@ """ import unittest +from unittest.mock import patch, MagicMock import datetime as dt import numpy as np from scipy.sparse import csr_matrix @@ -419,6 +420,46 @@ def test_2D_coordinates(self): np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12]) self._assert_intensity_fraction(hazard) + def test_load_dataset_rechunk(self): + """Load the data from an opened dataset and force rechunking""" + dataset = xr.open_dataset(self.netcdf_path) + hazard = Hazard.from_raster_xarray( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + rechunk=True, + ) + np.testing.assert_array_equal( + hazard.centroids.lat, [100, 100, 100, 200, 200, 200] + ) + np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12]) + self._assert_intensity_fraction(hazard) + + # Assert that .chunk is called the right way + with patch("xarray.Dataset.chunk") as mock: + mock.return_value = dataset + dataset = xr.open_dataset(self.netcdf_path) + Hazard.from_raster_xarray( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + rechunk=True, + ) + # First latitude dim, then longitude dim, then event dim + mock.assert_called_once_with(chunks=dict(y=-1, x=-1, time="auto")) + + # Should not be called by default + mock.reset_mock() + Hazard.from_raster_xarray( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + ) + mock.assert_not_called() + def test_2D_time(self): """Test if stacking multiple time dimensions works out""" time = np.array( From db6d39402495ae4b08c4b571cd211d0990781c5c Mon Sep 17 00:00:00 2001 From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> Date: Tue, 20 Dec 2022 17:15:14 +0100 Subject: [PATCH 4/6] Fix docstring rendering of `Hazard.from_raster_xarray` Move Notes below Parameters. --- climada/hazard/base.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index c6ff4108e..d48b0238b 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -442,23 +442,6 @@ def from_raster_xarray( meaning that the object can be used in all CLIMADA operations without throwing an error due to missing data or faulty data types. - Notes - ----- - * Single-valued coordinates given by ``coordinate_vars``, that are not proper - dimensions of the data, are promoted to dimensions automatically. If one of the - three coordinates does not exist, use ``Dataset.expand_dims`` (see - https://docs.xarray.dev/en/stable/generated/xarray.Dataset.expand_dims.html - and Examples) before loading the Dataset as Hazard. - * Single-valued data for variables ``frequency``. ``event_name``, and - ``event_date`` will be broadcast to every event. - * To avoid confusion in the call signature, several parameters are keyword-only - arguments. - * The attributes ``Hazard.tag.haz_type`` and ``Hazard.unit`` currently cannot be - read from the Dataset. Use the method parameters to set these attributes. - * This method does not read coordinate system metadata. Use the ``crs`` parameter - to set a custom coordinate system identifier. 
- * This method **does not** read lazily. Single data arrays must fit into memory. - Parameters ---------- data : xarray.Dataset or str @@ -513,6 +496,23 @@ def from_raster_xarray( hazard : climada.Hazard A hazard object created from the input data + Notes + ----- + * Single-valued coordinates given by ``coordinate_vars``, that are not proper + dimensions of the data, are promoted to dimensions automatically. If one of the + three coordinates does not exist, use ``Dataset.expand_dims`` (see + https://docs.xarray.dev/en/stable/generated/xarray.Dataset.expand_dims.html + and Examples) before loading the Dataset as Hazard. + * Single-valued data for variables ``frequency``. ``event_name``, and + ``event_date`` will be broadcast to every event. + * To avoid confusion in the call signature, several parameters are keyword-only + arguments. + * The attributes ``Hazard.tag.haz_type`` and ``Hazard.unit`` currently cannot be + read from the Dataset. Use the method parameters to set these attributes. + * This method does not read coordinate system metadata. Use the ``crs`` parameter + to set a custom coordinate system identifier. + * This method **does not** read lazily. Single data arrays must fit into memory. + Examples -------- The use of this method is straightforward if the Dataset contains the data with From e47e305749b93f33d9c113c92722356b944bc6e5 Mon Sep 17 00:00:00 2001 From: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:22:00 +0100 Subject: [PATCH 5/6] black is beautyful. Co-authored-by: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> --- climada/hazard/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index d48b0238b..c67431a80 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -896,8 +896,8 @@ def vshape(array): return array.shape # Check size for read data - if default_value is not None and ( - not np.array_equal(vshape(val), vshape(default_value)) + if default_value is not None and not np.array_equal( + vshape(val), vshape(default_value) ): raise RuntimeError( f"'{user_key if user_key else default_key}' must have shape " From 9649b7d5ff9f20954df76b593599fb674ee2c0ab Mon Sep 17 00:00:00 2001 From: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:25:01 +0100 Subject: [PATCH 6/6] superfluous, but well aligned --- climada/hazard/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/climada/hazard/base.py b/climada/hazard/base.py index c67431a80..b9ded3b48 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -790,7 +790,7 @@ def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: accessor=[ lambda x: to_csr_matrix(default_accessor(x)), lambda x: maybe_repeat(default_accessor(x), num_events), - strict_positive_int_accessor, + lambda x: strict_positive_int_accessor(x), lambda x: list(maybe_repeat(default_accessor(x), num_events).flat), lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events), ],
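
The following usage sketch is not part of the patches above. It is assembled from the docstring examples and the new tests in this series and assumes a CLIMADA installation that already contains these changes, i.e. one in which the method is still named ``Hazard.from_raster_xarray`` and accepts the ``rechunk`` keyword. The intensity values, the event name "storm", the frequency 0.5, and the dates are made-up illustration data. The sketch exercises the three behaviours introduced here: single-valued ``frequency``, ``event_name``, and ``date`` variables are broadcast to every event, datetime-like dates are converted to proper-date ordinals, and ``rechunk=True`` enforces one-event-per-chunk rechunking (via dask) before the spatial dimensions are stacked.

    import datetime as dt

    import numpy as np
    import xarray as xr

    from climada.hazard import Hazard

    # Two events on a 2 x 3 grid, using the default coordinate names
    # ("time", "latitude", "longitude") and the default "intensity" variable.
    dset = xr.Dataset(
        dict(
            intensity=(
                ["time", "latitude", "longitude"],
                [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]],
            )
        ),
        dict(
            time=[dt.datetime(2000, 1, 1), dt.datetime(2000, 1, 2)],
            latitude=[0, 1],
            longitude=[0, 1, 2],
        ),
    )

    # Single-valued optional variables: the new `maybe_repeat` helper broadcasts
    # them to every event, and `date_to_ordinal_accessor` turns datetime-like
    # date values into ordinals.
    dset["frequency"] = [0.5]
    dset["event_name"] = ["storm"]
    dset["date"] = [np.datetime64("2000-01-01")]

    # `rechunk=True` is the keyword added in PATCH 3/6. It is harmless but
    # unnecessary for a tiny in-memory dataset; it mainly helps with large,
    # poorly chunked datasets by enforcing one event per chunk before stacking.
    hazard = Hazard.from_raster_xarray(dset, "", "", rechunk=True)

    print(hazard.frequency)   # [0.5 0.5]
    print(hazard.event_name)  # ['storm', 'storm']
    print(hazard.date)        # ordinal of 2000-01-01, repeated for both events

Broadcasting the single values saves callers from tiling per-event metadata by hand, and one event per chunk is, as the new docstring notes, the optimal chunk layout for the subsequent stacking step.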