update notebooks to work with changes to dask chunking
Zeitsperre committed Oct 1, 2024
1 parent effc01e commit 9dc9fe7
Showing 2 changed files with 155 additions and 144 deletions.
99 changes: 52 additions & 47 deletions docs/notebooks/08_Getting_and_bias_correcting_CMIP6_data.ipynb
@@ -53,7 +53,6 @@
"import xclim\n",
"import xclim.sdba as sdba\n",
"from clisops.core import average, subset\n",
"\n",
"from ravenpy.utilities.testdata import get_file\n",
"\n",
"tmp = Path(tempfile.mkdtemp())"
@@ -85,10 +84,10 @@
},
"outputs": [],
"source": [
"# We get the basin contour for testing on a server. You can replace the getfile method by a string containing the path\n",
"# to your own geojson\n",
"# We get the basin contour for testing on a server.\n",
"# You can replace the getfile method by a string containing the path to your own geojson.\n",
"\n",
"# Get basin contour\n",
"# Get basin contour.\n",
"basin_contour = get_file(\"notebook_inputs/input.geojson\")\n",
"\n",
"reference_start_day = dt.datetime(1980, 12, 31)\n",
@@ -158,15 +157,16 @@
},
"outputs": [],
"source": [
"# Prepare the filesystem that allows reading data. Data is read on the Google Cloud Services, which host a copy of the CMIP6 (and other) data.\n",
"# Prepare the filesystem that allows reading data.\n",
"# Data is read on the Google Cloud Services, which host a copy of the CMIP6 (and other) data.\n",
"fsCMIP = gcsfs.GCSFileSystem(token=\"anon\", access=\"read_only\")\n",
"\n",
"# Get the catalog info from the pangeo dataset, which basically is a list of links to the various products.\n",
"# Get the catalog info from the PANGEO dataset, which basically is a list of links to the various products.\n",
"col = intake.open_esm_datastore(\n",
" \"https://storage.googleapis.com/cmip6/pangeo-cmip6.json\"\n",
")\n",
"\n",
"# Print the contents of the catalog, so we can see the classification system\n",
"# Print the contents of the catalog, so we can see the classification system.\n",
"display(col)"
]
},
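
Note: intake.open_esm_datastore returns an intake-esm catalog whose .df attribute is a plain pandas DataFrame with one row per zarr store. A minimal sketch of how to inspect it (same catalog URL as in the cell above):

    import intake

    # Open the Pangeo CMIP6 catalog and look at its size and facets.
    col = intake.open_esm_datastore(
        "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
    )
    print(len(col.df))              # number of catalogued zarr stores
    print(col.df.columns.tolist())  # source_id, experiment_id, variable_id, zstore, ...
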
@@ -185,7 +185,8 @@
},
"outputs": [],
"source": [
"# Get the list of models. Replace \"source_id\" with any of the catalog categories (table_id, activity_id, variable_id, etc.)\n",
"# Get the list of models.\n",
"# Replace \"source_id\" with any of the catalog categories (table_id, activity_id, variable_id, etc.)\n",
"list(col.df.source_id.unique())"
]
},
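
Note: the same pattern lists any other facet of the catalog. A small sketch reusing the col object from the cells above:

    # List the available experiments and variables in the same way.
    print(list(col.df.experiment_id.unique()))
    print(list(col.df.variable_id.unique()))
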
@@ -225,11 +226,10 @@
" member_id=\"r1i1p1f1\",\n",
" source_id=climate_model,\n",
")\n",
"col_subset = col.search(\n",
" require_all_on=[\"source_id\"], **query\n",
") # Command that will return the filtered list\n",
"# Return the filtered list.\n",
"col_subset = col.search(require_all_on=[\"source_id\"], **query)\n",
"\n",
"# Show the filtered list:\n",
"# Show the filtered list.\n",
"display(col_subset.df)"
]
},
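
Note: col.search filters the catalog on the facets given in query, and require_all_on=["source_id"] keeps only models for which every requested combination is available. A hedged sketch with the facets written out; the values below are illustrative, since the full query dict is not shown in this hunk:

    # Hypothetical query: facet names come from the catalog columns,
    # the values are examples rather than those used in the notebook.
    query = dict(
        experiment_id="historical",
        table_id="day",
        variable_id=["tasmin", "tasmax", "pr"],
        member_id="r1i1p1f1",
        source_id="MIROC6",
    )
    col_subset = col.search(require_all_on=["source_id"], **query)
    display(col_subset.df)
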
@@ -249,7 +249,7 @@
},
"outputs": [],
"source": [
"# Get the object locator object\n",
"# Get the object locator object.\n",
"mapper = fsCMIP.get_mapper(col_subset.df.zstore[0])"
]
},
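
Note: col_subset.df.zstore[0] holds the Google Cloud path of the first matching zarr store, and get_mapper wraps it as a key-value mapping that xarray can open directly (see the next cell). A short sketch:

    # Inspect the cloud path of the first search result, then build the mapper.
    print(col_subset.df.zstore[0])  # a path of the form "gs://cmip6/CMIP6/..."
    mapper = fsCMIP.get_mapper(col_subset.df.zstore[0])
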
@@ -277,17 +277,20 @@
},
"outputs": [],
"source": [
"# Get the CMIP6 data from Google Cloud and read it in memory using xarray. This is done via \"lazy loading\" and is not actually reading the data in memory\n",
"# yet, but is keeping track of what it will need to get, eventually.\n",
"# Get the CMIP6 data from Google Cloud and read it in memory using xarray.\n",
"# This is done via \"lazy loading\" and is not actually reading the data in memory yet, but is keeping track of what it will need to get, eventually.\n",
"ds = xr.open_zarr(mapper, consolidated=True)\n",
"\n",
"# Convert to numpy.datetime64 object to be compatbile\n",
"if type(ds.time[0].values) is not type(np.datetime64(\"1980-01-01\")):\n",
" ds = ds.convert_calendar(\"standard\")\n",
"# Convert to numpy.datetime64 object for compatibility.\n",
"ds = ds.convert_calendar(\"standard\")\n",
"\n",
"# Extract only the dates that we really want. Again, this is done via lazy loading, and is not actually using memory at this point.\n",
"# Extract only the dates that we really want.\n",
"# Again, this is done via lazy loading, and is not actually using memory at this point.\n",
"ds = ds.sel(time=slice(reference_start_day, reference_end_day))\n",
"\n",
"# Set the date to the midnight of the given day.\n",
"ds = ds.assign_coords(time=ds.time.dt.floor(\"D\"))\n",
"\n",
"# Use the clisops subsetting tools to extract the data for the watershed boundaries and take the spatial average\n",
"ds = average.average_shape(ds, basin_contour)\n",
"\n",
@@ -322,14 +325,16 @@
" with xr.set_options(keep_attrs=True):\n",
" ds = xr.open_zarr(mapper, consolidated=True)\n",
"\n",
" # Convert to numpy.datetime64 object to be compatbile\n",
" if type(ds.time[0].values) is not type(np.datetime64(\"1980-01-01\")):\n",
" ds = ds.convert_calendar(\"standard\")\n",
" # Convert to numpy.datetime64 object for compatibility.\n",
" ds = ds.convert_calendar(\"standard\")\n",
"\n",
" # Compute average over region\n",
" # Set the date to the midnight of the given day.\n",
" ds = ds.assign_coords(time=ds.time.dt.floor(\"D\"))\n",
"\n",
" # Compute the average over region.\n",
" out = average.average_shape(ds.sel(time=slice(start, end)), geometry)\n",
"\n",
" # Convert geometry variables into attributes\n",
" # Convert geometry variables into attributes.\n",
" attrs = {\n",
" key: out[key].values.item()\n",
" for key in out.coords\n",
@@ -431,9 +436,15 @@
" ERA5_reference = subset.subset_shape(\n",
" ds.sel(time=slice(reference_start_day, reference_end_day)), basin_contour\n",
" ).mean({\"latitude\", \"longitude\"})\n",
" ERA5_tmin = ERA5_reference[\"t2m\"].resample(time=\"1D\").min().chunk(-1, -1, -1)\n",
" ERA5_tmax = ERA5_reference[\"t2m\"].resample(time=\"1D\").max().chunk(-1, -1, -1)\n",
" ERA5_pr = ERA5_reference[\"tp\"].resample(time=\"1D\").sum().chunk(-1, -1, -1)"
" ERA5_tmin = (\n",
" ERA5_reference.t2m.resample(time=\"1D\")\n",
" .min()\n",
" .chunk(\n",
" time=-1,\n",
" )\n",
" )\n",
" ERA5_tmax = ERA5_reference.t2m.resample(time=\"1D\").max().chunk(time=-1)\n",
" ERA5_pr = ERA5_reference.tp.resample(time=\"1D\").sum().chunk(time=-1)"
]
},
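
Note: this cell carries the change named in the commit title. The positional call .chunk(-1, -1, -1) is replaced by the keyword form .chunk(time=-1), presumably to follow changes in how dask/xarray accept chunk specifications; the keyword form collapses the whole time axis into a single dask chunk, which xclim's sdba routines generally require in order to compute quantiles along time. A minimal sketch of the keyword form:

    import numpy as np
    import xarray as xr

    # Toy array chunked along "time" in blocks of 100 (requires dask).
    da = xr.DataArray(np.random.rand(365), dims="time").chunk(time=100)
    print(da.chunks)                 # ((100, 100, 100, 65),)

    # Keyword form, as used above: a single chunk along "time".
    print(da.chunk(time=-1).chunks)  # ((365,),)
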
{
@@ -455,8 +466,8 @@
},
"outputs": [],
"source": [
"# Here we need to make sure that our units are all in the correct format. You can play around with the tools we've seen thus far to explore the units\n",
"# and make sure everything is consistent.\n",
"# Here we need to make sure that our units are all in the correct format.\n",
"# You can play around with the tools we've seen thus far to explore the units and make sure everything is consistent.\n",
"\n",
"# Let's start with precipitation:\n",
"ERA5_pr = xclim.core.units.convert_units_to(ERA5_pr, \"mm\", context=\"hydro\")\n",
@@ -497,25 +508,25 @@
},
"outputs": [],
"source": [
"# Use xclim utilities (sbda) to give information on the type of window used for the bias correction.\n",
"# Use xclim utilities (SDBA) to give information on the type of window used for the bias correction.\n",
"group_month_window = sdba.utils.Grouper(\"time.dayofyear\", window=15)\n",
"\n",
"# This is an adjusting function. It builds the tool that will perform the corrections.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_pr, hist=historical_pr, nquantiles=50, kind=\"+\", group=group_month_window\n",
")\n",
"\n",
"# Apply the correction factors on the reference period\n",
"# Apply the correction factors on the reference period.\n",
"corrected_ref_precip = Adjustment.adjust(historical_pr, interp=\"linear\")\n",
"\n",
"# Apply the correction factors on the future period\n",
"# Apply the correction factors on the future period.\n",
"corrected_fut_precip = Adjustment.adjust(future_pr, interp=\"linear\")\n",
"\n",
"# Ensure that the precipitation is non-negative, which can happen with some climate models\n",
"# Ensure that the precipitation is non-negative, which can happen with some climate models.\n",
"corrected_ref_precip = corrected_ref_precip.where(corrected_ref_precip > 0, 0)\n",
"corrected_fut_precip = corrected_fut_precip.where(corrected_fut_precip > 0, 0)\n",
"\n",
"# Train the model to find the correction factors for the maximum temperature (tasmax) data\n",
"# Train the model to find the correction factors for the maximum temperature (tasmax) data.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_tmax,\n",
" hist=historical_tasmax,\n",
Expand All @@ -524,13 +535,13 @@
" group=group_month_window,\n",
")\n",
"\n",
"# Apply the correction factors on the reference period\n",
"# Apply the correction factors on the reference period.\n",
"corrected_ref_tasmax = Adjustment.adjust(historical_tasmax, interp=\"linear\")\n",
"\n",
"# Apply the correction factors on the future period\n",
"# Apply the correction factors on the future period.\n",
"corrected_fut_tasmax = Adjustment.adjust(future_tasmax, interp=\"linear\")\n",
"\n",
"# Train the model to find the correction factors for the minimum temperature (tasmin) data\n",
"# Train the model to find the correction factors for the minimum temperature (tasmin) data.\n",
"Adjustment = sdba.DetrendedQuantileMapping.train(\n",
" ref=ERA5_tmin,\n",
" hist=historical_tasmin,\n",
@@ -561,7 +572,8 @@
},
"outputs": [],
"source": [
"# Convert the reference corrected data into netCDF file. We will then apply a special code to remove a dimension in the dataset to make it applicable to the RAVEN models.\n",
"# Convert the reference corrected data into netCDF file.\n",
"# We will then apply a special code to remove a dimension in the dataset to make it applicable to the RAVEN models.\n",
"ref_dataset = xr.merge(\n",
" [\n",
" corrected_ref_precip.to_dataset(name=\"pr\"),\n",
Expand All @@ -570,11 +582,11 @@
" ]\n",
")\n",
"\n",
"# Write to temporary folder\n",
"# Write to temporary folder.\n",
"fn_ref = tmp / \"reference_dataset.nc\"\n",
"ref_dataset.to_netcdf(fn_ref)\n",
"\n",
"# Convert the future corrected data into netCDF file\n",
"# Convert the future corrected data into netCDF file.\n",
"fut_dataset = xr.merge(\n",
" [\n",
" corrected_fut_precip.to_dataset(name=\"pr\"),\n",
@@ -610,13 +622,6 @@
"# Compare it to the future precipitation without bias-correction.\n",
"future_pr.plot()"
]
},
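
Note on the netCDF cell above: the "special code to remove a dimension" is not shown in this hunk. clisops' average_shape typically returns the averaged series with an extra one-element geometry dimension, so the removal is presumably along these lines (hypothetical sketch, dimension name assumed):

    # Hypothetical: drop the one-element geometry dimension added by
    # average_shape, if present, before handing the files to Raven.
    if "geom" in ref_dataset.dims:
        ref_dataset = ref_dataset.isel(geom=0, drop=True)
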
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {