Merge pull request #53 from climate-resource/other-gases

Add CFC12 and HFC134a
climate-resource · Jun 20, 2024 · e102591 · e102591
2 parents ab50313 + 510c979
commit e102591
Show file tree

Hide file tree

Showing 57 changed files with 3,770 additions and 286 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -40,7 +40,6 @@ jobs:
           python-version: "${{ matrix.python-version }}"
           venv-id: "tests-${{ runner.os }}"
           poetry-dependency-install-flags: "--all-extras --only 'main,tests,coverage'"
-      # TODO: change this probably to capture coverage from non-regression tests, ok for now
       - name: Run regression tests relevant for coverage
         run: |
           poetry run python scripts/write-config.py

diff --git a/LICENCES_SOURCES.md b/LICENCES_SOURCES.md
@@ -0,0 +1,46 @@
+Notes on the licences of the sources we use.
+This is not checked with CI or anything, so may not be up to date.
+
+AGAGE:
+
+- data policy: https://agage.mit.edu/data/use-agage-data
+    - offer co-authorship
+    - contact [email protected] to check
+
+
+NOAA HATS:
+
+- https://gml.noaa.gov/hats/hats_datause.html
+    - reciprocity, otherwise can be used
+
+
+NOAA CCG:
+
+- https://gml.noaa.gov/ccgg/data/datause.html
+    - reciprocity, otherwise can be used
+
+
+EPICA:
+
+- https://doi.pangaea.de/10.1594/PANGAEA.552232
+    - CC BY 3.0, can build upon
+        - https://creativecommons.org/licenses/by/3.0/
+
+Law Dome:
+
+- https://data.csiro.au/collection/csiro%3A37077v2
+    - CC BY 4.0
+        - https://creativecommons.org/licenses/by/4.0/
+
+NEEM:
+
+- https://doi.pangaea.de/10.1594/PANGAEA.899039
+    - CC BY 4.0
+        - https://creativecommons.org/licenses/by/4.0/
+
+HadCRUT5:
+
+- https://www.metoffice.gov.uk/hadobs/hadcrut5/
+    - Open Government 3 licence
+        - https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
+    - can use it for this, no worries
diff --git a/dev-config.yaml b/dev-config.yaml
diff --git a/dodo.py b/dodo.py
@@ -22,6 +22,7 @@
 from __future__ import annotations
 
 import datetime as dt
+import logging
 import os
 import time
 from collections.abc import Iterable
@@ -57,7 +58,11 @@
 See https://pydoit.org/configuration.html#configuration-at-dodo-py
 """
 
-logger = setup_logging()
+logger = setup_logging(
+    stdout_level=logging.WARNING,
+    log_file=os.environ.get("DOIT_LOG_FILE", f"doit_{RUN_ID}.log"),
+    file_level=logging.INFO,
+)
 
 
 def print_key_info() -> None:

diff --git a/notebooks-archive/01yy_process-data/0111_process-gggrn-global-mean.py b/notebooks-archive/01yy_process-data/0111_process-gggrn-global-mean.py
@@ -82,7 +82,7 @@
     raw = pd.read_csv(
         config_retrieve.gggrn.raw_dir / filename,
         skiprows=skiprows,
-        delim_whitespace=True,
+        sep=r"\s+",
     )
 
     unit = gas_units[gas]

diff --git a/notebooks/000y_retrieve-misc-data/0001_natural-earth-shape-files.py b/notebooks/000y_retrieve-misc-data/0001_natural-earth-shape-files.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.15.2
+#       jupytext_version: 1.16.1
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python

diff --git a/notebooks/001y_process-noaa-data/0010_download.py b/notebooks/001y_process-noaa-data/0010_download.py
@@ -18,12 +18,6 @@
 # Download data from the [NOAA Global Monitoring Laboratory (GML) Carbon Cycle Greenhouse Gases (CCGG) research area](https://gml.noaa.gov/ccgg/flask.html), specifically the [data page](https://gml.noaa.gov/ccgg/data/).
 #
 # For simplicity, here we just refer to this as the NOAA network. This is sort of in line with what is done in [Forster et al., 2023](https://essd.copernicus.org/articles/15/2295/2023/essd-15-2295-2023.pdf), who call it the "NOAA Global Monitoring Laboratory (GML)" (which appears to be the name of the top-level program). Puzzlingly, this network seems to also be referred to as the [Global Greenhouse Gas Reference Network (GGGRN)](https://gml.noaa.gov/ccgg/data/) (TODO: ask someone who knows what the difference between the acronyms is meant to mean).
-#
-# To-do:
-#
-# - read old global-mean processing (also called 0010 but in a different folder) and extract any insights from there
-# - add in handling of in situ measurements too (in situ and flask measurements treated as different stations in M17)
-# - parameterise notebook so we can do same for CH4, N2O and SF6 observations
 
 # %% [markdown]
 # ## Imports
@@ -51,7 +45,7 @@
 
 # %% editable=true slideshow={"slide_type": ""} tags=["parameters"]
 config_file: str = "../../dev-config-absolute.yaml"  # config file
-step_config_id: str = "n2o_hats"  # config ID to select for this branch
+step_config_id: str = "co2_in-situ"  # config ID to select for this branch
 
 # %% [markdown]
 # ## Load config

diff --git a/notebooks/001y_process-noaa-data/0011_extract.py b/notebooks/001y_process-noaa-data/0011_extract.py
@@ -29,6 +29,7 @@
 from local.noaa_processing import (
     read_noaa_flask_zip,
     read_noaa_hats,
+    read_noaa_hats_combined,
     read_noaa_in_situ_zip,
 )
 
@@ -46,7 +47,7 @@
 
 # %% editable=true slideshow={"slide_type": ""} tags=["parameters"]
 config_file: str = "../../dev-config-absolute.yaml"  # config file
-step_config_id: str = "n2o_hats"  # config ID to select for this branch
+step_config_id: str = "co2_in-situ"  # config ID to select for this branch
 
 # %% [markdown]
 # ## Load config
@@ -87,7 +88,13 @@
     print(df_months)
 
 elif config_step.source == "hats":
-    df_months = read_noaa_hats(zf, gas=config_step.gas, source=config_step.source)
+    if config_step.gas in ("n2o", "sf6", "cfc11", "cfc12"):
+        df_months = read_noaa_hats_combined(
+            zf, gas=config_step.gas, source=config_step.source
+        )
+
+    else:
+        df_months = read_noaa_hats(zf, gas=config_step.gas, source=config_step.source)
 
     print("df_months")
     print(df_months)

diff --git a/notebooks/001y_process-noaa-data/0012_process_surface-flask.py b/notebooks/001y_process-noaa-data/0012_process_surface-flask.py
@@ -47,7 +47,7 @@
 
 # %% editable=true slideshow={"slide_type": ""} tags=["parameters"]
 config_file: str = "../../dev-config-absolute.yaml"  # config file
-step_config_id: str = "n2o"  # config ID to select for this branch
+step_config_id: str = "sf6"  # config ID to select for this branch
 
 # %% [markdown]
 # ## Load config

diff --git a/notebooks/001y_process-noaa-data/0013_process_in-situ.py b/notebooks/001y_process-noaa-data/0013_process_in-situ.py
@@ -85,13 +85,48 @@
 #
 # Nice and easy as this data already has everything we need.
 
-# %% editable=true slideshow={"slide_type": ""}
+# %%
 monthly_dfs_with_loc = df_months[PROCESSED_DATA_COLUMNS]
+
+# %%
+if config_step.step_config_id in ["co2", "ch4"]:
+    # There is one month where there is duplicate data for MKO,
+    # presumably from moving because of the fires.
+    # We deal with this here because it is such an edge case.
+    edge_case_year_month = (2023, 7)
+    edge_case_rows_select = (
+        (monthly_dfs_with_loc["year"] == edge_case_year_month[0])
+        & (monthly_dfs_with_loc["month"] == edge_case_year_month[1])
+        & (monthly_dfs_with_loc["site_code_filename"] == "mko")
+    )
+    edge_case_rows = monthly_dfs_with_loc[edge_case_rows_select]
+    exp_n_edge_case_rows = 2
+    assert edge_case_rows.shape[0] == exp_n_edge_case_rows
+
+    # Assume that a mean is fine, it seems justifiable in overall noise
+    # and not sure what else to do...
+    edge_case_row_new = (
+        edge_case_rows.groupby(list(set(edge_case_rows.columns) - {"value"}))
+        .mean()
+        .reset_index()
+    )
+
+    monthly_dfs_with_loc = pd.concat(
+        [monthly_dfs_with_loc[~edge_case_rows_select], edge_case_row_new]
+    )
+    monthly_dfs_with_loc[
+        (monthly_dfs_with_loc["year"] == edge_case_year_month[0])
+        & (monthly_dfs_with_loc["month"] == edge_case_year_month[1])
+        & (monthly_dfs_with_loc["site_code_filename"] == "mko")
+    ]
+
+# %% editable=true slideshow={"slide_type": ""}
+duplicate_entries = monthly_dfs_with_loc[
+    ["gas", "year", "month", "site_code_filename"]
+][monthly_dfs_with_loc[["gas", "year", "month", "site_code_filename"]].duplicated()]
 assert (
-    not monthly_dfs_with_loc[["gas", "year", "month", "site_code_filename"]]
-    .duplicated()
-    .any()
-), "Duplicate entries for a station in a month"
+    duplicate_entries.shape[0] == 0
+), f"Duplicate entries for a station in a month {duplicate_entries}"
 monthly_dfs_with_loc
 
 # %% editable=true slideshow={"slide_type": ""}

diff --git a/notebooks/001y_process-noaa-data/0014_process_hats.py b/notebooks/001y_process-noaa-data/0014_process_hats.py
@@ -46,7 +46,7 @@
 
 # %% editable=true slideshow={"slide_type": ""} tags=["parameters"]
 config_file: str = "../../dev-config-absolute.yaml"  # config file
-step_config_id: str = "n2o"  # config ID to select for this branch
+step_config_id: str = "hfc134a"  # config ID to select for this branch
 
 # %% [markdown]
 # ## Load config
@@ -104,7 +104,7 @@
 # countries.columns.tolist()
 
 # %%
-colours = (
+colours = tuple(
     c
     for c in [
         "tab:blue",
@@ -121,7 +121,7 @@
         "tab:cyan",
     ]
 )
-markers = (
+markers = tuple(
     m
     for m in [
         "o",
@@ -142,14 +142,14 @@
     ]
 )
 
-for station, station_df in tqdman.tqdm(
-    monthly_dfs_with_loc.groupby("site_code"), desc="Stations"
+for i, (station, station_df) in tqdman.tqdm(
+    enumerate(monthly_dfs_with_loc.groupby("site_code")), desc="Stations"
 ):
     print(station_df)
 
     fig, axes = plt.subplots(ncols=2, figsize=(12, 4))
-    colour = next(colours)
-    marker = next(markers)
+    colour = colours[i % len(colours)]  # wrap around when stations outnumber colours
+    marker = markers[i % len(markers)]  # wrap by markers' own length, not colours'
 
     countries.plot(color="lightgray", ax=axes[0])
 
@@ -189,7 +189,7 @@
 
 # %%
 fig, axes = plt.subplots(ncols=2, figsize=(12, 4))
-colours = (
+colours = tuple(
     c
     for c in [
         "tab:blue",
@@ -206,7 +206,7 @@
         "tab:cyan",
     ]
 )
-markers = (
+markers = tuple(
     m
     for m in [
         "o",
@@ -229,11 +229,11 @@
 
 countries.plot(color="lightgray", ax=axes[0])
 
-for station, station_df in tqdman.tqdm(
-    monthly_dfs_with_loc.groupby("site_code"), desc="Stations"
+for i, (station, station_df) in tqdman.tqdm(
+    enumerate(monthly_dfs_with_loc.groupby("site_code")), desc="Stations"
 ):
-    colour = next(colours)
-    marker = next(markers)
+    colour = colours[i % len(colours)]  # wrap around when stations outnumber colours
+    marker = markers[i % len(markers)]  # wrap by markers' own length, not colours'
 
     station_df[["longitude", "latitude"]].drop_duplicates().plot(
         x="longitude",

diff --git a/notebooks/001y_process-noaa-data/0019_noaa-network-overview.py b/notebooks/001y_process-noaa-data/0019_noaa-network-overview.py
@@ -58,22 +58,23 @@
 
 if config.ci:
     to_show: tuple[tuple[str, str, str], ...] = (
-        # ("co2", "in-situ", "process_noaa_in_situ_data"),
-        # ("co2", "surface-flask", "process_noaa_surface_flask_data"),
+        ("co2", "in-situ", "process_noaa_in_situ_data"),
+        ("co2", "surface-flask", "process_noaa_surface_flask_data"),
         ("ch4", "in-situ", "process_noaa_in_situ_data"),
         ("ch4", "surface-flask", "process_noaa_surface_flask_data"),
-        ("n2o", "surface-flask", "process_noaa_surface_flask_data"),
         ("n2o", "hats", "process_noaa_hats_data"),
+        ("sf6", "hats", "process_noaa_hats_data"),
+        ("cfc11", "hats", "process_noaa_hats_data"),
     )
 else:
     to_show = (
-        # ("co2", "in-situ", "process_noaa_in_situ_data"),
-        # ("co2", "surface-flask", "process_noaa_surface_flask_data"),
+        ("co2", "in-situ", "process_noaa_in_situ_data"),
+        ("co2", "surface-flask", "process_noaa_surface_flask_data"),
         ("ch4", "in-situ", "process_noaa_in_situ_data"),
         ("ch4", "surface-flask", "process_noaa_surface_flask_data"),
-        ("n2o", "surface-flask", "process_noaa_surface_flask_data"),
         ("n2o", "hats", "process_noaa_hats_data"),
-        # ("sf6", "surface-flask", "process_noaa_surface_flask_data"),
+        ("sf6", "hats", "process_noaa_hats_data"),
+        ("cfc11", "hats", "process_noaa_hats_data"),
     )
 
 gas_configs = {

diff --git a/notebooks/002y_process-agage-data/0020_download-agage.py b/notebooks/002y_process-agage-data/0020_download-agage.py
@@ -52,7 +52,7 @@
 
 # %% editable=true slideshow={"slide_type": ""} tags=["parameters"]
 config_file: str = "../../dev-config-absolute.yaml"  # config file
-step_config_id: str = "ccl4_gc-md_monthly"  # config ID to select for this branch
+step_config_id: str = "sf6_gc-ms-medusa_monthly"  # config ID to select for this branch
 
 # %% [markdown]
 # ## Load config