Skip to content

Commit

Permalink
Add multimodel tests using samples of CMIP6 data (#856)
Browse files Browse the repository at this point in the history
Co-authored-by: Bouwe Andela <[email protected]>
  • Loading branch information
stefsmeets and bouweandela authored Dec 7, 2020
1 parent 1edded4 commit 114b8e0
Show file tree
Hide file tree
Showing 14 changed files with 276 additions and 14 deletions.
12 changes: 10 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ jobs:
- image: esmvalgroup/esmvalcore:development
steps:
- checkout
- restore_cache:
key: test-{{ .Branch }}
- run:
command: |
pip install .[test]
pytest -n 2 -m "not installation"
- save_cache:
key: test-{{ .Branch }}
paths:
- ".eggs"
- ".pytest_cache"
- store_test_results:
path: test-reports/
- store_artifacts:
Expand All @@ -44,7 +51,7 @@ jobs:
- checkout
- check_changes
- restore_cache:
key: python3-install-{{ .Branch }}
key: install-{{ .Branch }}
- run:
command: |
. /opt/conda/etc/profile.d/conda.sh
Expand All @@ -64,10 +71,11 @@ jobs:
pytest -n 2
esmvaltool version
- save_cache:
key: python3-install-{{ .Branch }}
key: install-{{ .Branch }}
paths:
- "/opt/conda/pkgs"
- ".eggs"
- ".pytest_cache"
- store_artifacts:
path: /logs
- store_artifacts:
Expand Down
11 changes: 11 additions & 0 deletions doc/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ adding ``-m 'not installation'`` to the previous command.
Tests will also be run automatically by
`CircleCI <https://circleci.com/gh/ESMValGroup/ESMValCore>`__.

Sample data
-----------

If you need sample data to work with, `this repository <https://github.com/ESMValGroup/ESMValTool_sample_data>`__ contains samples of real data for use with ESMValTool development, demonstration purposes and automated testing. The goal is to keep the repository size small (~ 100 MB), so it can be easily downloaded and distributed.

The data are installed as part of the developer dependencies and are used by some larger tests (i.e. in the `multimodel tests <https://github.com/ESMValGroup/ESMValCore/tree/master/tests/sample_data>`__).

Loading and preprocessing the data can be somewhat time-consuming (~30 secs), so the results are cached by ``pytest`` to make the tests more performant.
Clear the cache by running pytest with the ``--cache-clear`` flag. To avoid running the tests that use sample data, use ``pytest -m "not use_sample_data"``.
If you are adding new tests using sample data, please use the decorator ``@pytest.mark.use_sample_data``.

Code style
----------

Expand Down
27 changes: 15 additions & 12 deletions esmvalcore/preprocessor/_multimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def _compute_statistic(data, statistic_name):
quantile = float(statistic_name[1:]) / 100
statistic_function = partial(_quantile, quantile=quantile)
else:
raise NotImplementedError
raise ValueError(f'No such statistic: `{statistic_name}`')

# no plevs
if len(data[0].shape) < 3:
Expand Down Expand Up @@ -150,7 +150,8 @@ def _put_in_cube(template_cube, cube_data, statistic, t_axis):
tunits = cf_units.Unit(unit_name, calendar="standard")
times = iris.coords.DimCoord(t_axis,
standard_name='time',
units=tunits)
units=tunits,
var_name='time')

coord_names = [c.long_name for c in template_cube.coords()]
coord_names.extend([c.standard_name for c in template_cube.coords()])
Expand Down Expand Up @@ -357,8 +358,7 @@ def _assemble_full_data(cubes, statistic):


def multi_model_statistics(products, span, statistics, output_products=None):
"""
Compute multi-model statistics.
"""Compute multi-model statistics.
Multimodel statistics computed along the time axis. Can be
computed across a common overlap in time (set span: overlap)
Expand All @@ -383,22 +383,25 @@ def multi_model_statistics(products, span, statistics, output_products=None):
span; if full, statistics are computed on full time spans, ignoring
missing data.
output_products: dict
dictionary of output products.
statistics: str
statistical measure to be computed. Available options: mean, median,
max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional).
dictionary of output products. MUST be specified if products are NOT
cubes
statistics: list of str
list of statistical measure(s) to be computed. Available options:
mean, median, max, min, std, or pXX.YY (for percentile XX.YY; decimal
part optional).
Returns
-------
list
list of data products or cubes containing the multimodel stats
computed.
set or dict or list
`set` of data products if `output_products` is given
`dict` of cubes if `output_products` is not given
`list` of input cubes if there is no overlap between cubes when
using `span='overlap'`
Raises
------
ValueError
If span is neither overlap nor full.
"""
logger.debug('Multimodel statistics: computing: %s', statistics)
if len(products) < 2:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ flake8-ignore =
log_level = WARNING
markers =
installation: test requires installation of dependencies
use_sample_data: Run functional tests using real data

[coverage:run]
parallel = true
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
'pyyaml',
'requests',
'scitools-iris>=2.2',
'requests',
'shapely[vectorized]',
'stratify',
'yamale==2.*',
Expand All @@ -57,6 +58,8 @@
'pytest-metadata>=1.5.1',
'pytest-mock',
'pytest-xdist',
('ESMValTool_sample_data @ '
'git+https://github.com/ESMValGroup/[email protected]'),
],
# Development dependencies
# Use pip install -e .[develop] to install in development mode
Expand Down
236 changes: 236 additions & 0 deletions tests/sample_data/multimodel_statistics/test_multimodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
"""Test using sample data for :func:`esmvalcore.preprocessor._multimodel`."""

import pickle
from itertools import groupby
from pathlib import Path

import iris
import numpy as np
import pytest

from esmvalcore.preprocessor import extract_time, multi_model_statistics

esmvaltool_sample_data = pytest.importorskip("esmvaltool_sample_data")

# Skip mark shared by the calendars that cannot be tested; the reason
# string is reported by pytest for every skipped parametrization.
_SINGLE_CUBE_SKIP = pytest.mark.skip(
    reason='Cannot calculate statistics with single cube in list')

# Calendar names used to parametrize the daily regression test.
CALENDAR_PARAMS = (
    pytest.param('360_day', marks=_SINGLE_CUBE_SKIP),
    '365_day',
    'gregorian',
    'proleptic_gregorian',
    pytest.param('julian', marks=_SINGLE_CUBE_SKIP),
)

# Values of the `span` argument used to parametrize the regression tests.
SPAN_PARAMS = ('overlap', 'full')


def assert_array_almost_equal(this, other):
    """Assert that array `this` almost equals array `other`.

    If at least one of the arrays is a masked array, the masks must be
    identical as well. A plain (unmasked) array is treated as having an
    all-False mask.

    Parameters
    ----------
    this: array-like
        Array under test.
    other: array-like
        Array holding the expected values.

    Raises
    ------
    AssertionError
        If the masks differ or the values are not almost equal.
    """
    if np.ma.isMaskedArray(this) or np.ma.isMaskedArray(other):
        # `getmaskarray` also accepts plain arrays, which have no `.mask`
        # attribute, so a masked array can be compared with an ndarray.
        np.testing.assert_array_equal(np.ma.getmaskarray(this),
                                      np.ma.getmaskarray(other))

    np.testing.assert_array_almost_equal(this, other)


def preprocess_data(cubes, time_slice: dict = None):
    """Optionally slice the cubes in time, then regrid them to the first cube.

    Parameters
    ----------
    cubes: list
        Cubes to preprocess. The first cube (after the optional time
        slicing) defines the target grid.
    time_slice: dict, optional
        Keyword arguments passed on to `extract_time`. No time slicing is
        done if this is empty or None.

    Returns
    -------
    list
        The regridded cubes, in the same order as the input.
    """
    if time_slice:
        cubes = [extract_time(cube, **time_slice) for cube in cubes]

    # Every cube, including the first one itself, is regridded onto the
    # grid of the first cube with one shared linear scheme.
    target_grid = cubes[0]
    linear_scheme = iris.analysis.Linear()

    return [
        cube.regrid(grid=target_grid, scheme=linear_scheme) for cube in cubes
    ]


@pytest.fixture(scope="module")
def timeseries_cubes_month(request):
    """Provide preprocessed monthly (`Amon`) sample timeseries cubes.

    The cubes are stored in the pytest cache to save about 30-60 seconds
    on repeat use.
    """
    cache_key = "sample_data/monthly"
    cached = request.config.cache.get(cache_key, None)

    if cached:
        # NOTE(review): unpickling trusts the content of the pytest cache.
        return pickle.loads(cached.encode('latin1'))

    cubes = esmvaltool_sample_data.load_timeseries_cubes(mip_table='Amon')
    cubes = preprocess_data(
        cubes,
        time_slice={
            'start_year': 1985,
            'end_year': 1987,
            'start_month': 12,
            'end_month': 2,
            'start_day': 1,
            'end_day': 1,
        },
    )

    # Cubes are not serializable via json, so they are pickled and the
    # resulting bytes are stored as a latin1-decoded string.
    request.config.cache.set(cache_key, pickle.dumps(cubes).decode('latin1'))

    return cubes


@pytest.fixture(scope="module")
def timeseries_cubes_day(request):
    """Provide preprocessed daily sample timeseries cubes per calendar.

    The cubes are stored in the pytest cache to save about 30-60 seconds
    on repeat use.

    Returns
    -------
    dict
        Maps the calendar of each cube's time coordinate to the list of
        cubes that use that calendar.
    """
    cache_key = "sample_data/daily"
    cached = request.config.cache.get(cache_key, None)

    if not cached:
        cubes = esmvaltool_sample_data.load_timeseries_cubes(mip_table='day')
        cubes = preprocess_data(
            cubes,
            time_slice={
                'start_year': 2001,
                'end_year': 2002,
                'start_month': 12,
                'end_month': 2,
                'start_day': 1,
                'end_day': 1,
            },
        )

        # Cubes are not serializable via json, so they are pickled and the
        # resulting bytes are stored as a latin1-decoded string.
        request.config.cache.set(cache_key,
                                 pickle.dumps(cubes).decode('latin1'))
    else:
        # NOTE(review): unpickling trusts the content of the pytest cache.
        cubes = pickle.loads(cached.encode('latin1'))

    def time_calendar(cube):
        return cube.coord('time').units.calendar

    # `groupby` only merges adjacent items, so sort by the same key first.
    cubes_by_calendar = sorted(cubes, key=time_calendar)

    return {
        calendar_name: list(members)
        for calendar_name, members in groupby(cubes_by_calendar,
                                              key=time_calendar)
    }


def multimodel_test(cubes, span, statistic):
    """Compute one multimodel statistic and run some simple checks.

    Parameters
    ----------
    cubes: list
        Input cubes passed to `multi_model_statistics`.
    span: str
        Value of the `span` argument of `multi_model_statistics`.
    statistic: str
        Name of the single statistic to compute.

    Returns
    -------
    dict
        Result of `multi_model_statistics`, which contains `statistic` as
        a key.
    """
    result = multi_model_statistics(cubes, span=span, statistics=[statistic])

    assert isinstance(result, dict)
    assert statistic in result

    return result


def multimodel_regression_test(cubes, span, name):
    """Compare the multimodel mean with reference data stored on disk.

    This test will fail if the input data or multimodel code changed. To
    update the data for the regression test, remove the corresponding
    `.nc` files in this directory and re-run the tests. The tests will
    fail the first time with a RuntimeError, because the reference data
    are being written.

    Parameters
    ----------
    cubes: list
        Input cubes for the multimodel statistics.
    span: str
        Value of the `span` argument; also part of the reference file name.
    name: str
        Prefix of the reference file name.

    Raises
    ------
    RuntimeError
        If no reference file exists yet; it is written before raising.
    """
    statistic = 'mean'
    result_cube = multimodel_test(cubes, span=span,
                                  statistic=statistic)[statistic]

    filename = Path(__file__).with_name(f'{name}-{span}-{statistic}.nc')
    if not filename.exists():
        # The test will fail if no regression data are available.
        iris.save(result_cube, filename)
        raise RuntimeError(f'Wrote reference data to {filename.absolute()}')

    reference_cube = iris.load_cube(str(filename))
    assert_array_almost_equal(result_cube.data, reference_cube.data)

    # Compare coords pairwise. NOTE(review): `zip` stops at the shorter
    # sequence, so surplus coordinates on either cube are not compared.
    coord_pairs = zip(result_cube.coords(), reference_cube.coords())
    for this_coord, other_coord in coord_pairs:
        assert this_coord == other_coord

    # remove Conventions which are added by Iris on save
    reference_cube.attributes.pop('Conventions', None)

    assert reference_cube.metadata == result_cube.metadata


@pytest.mark.use_sample_data
@pytest.mark.parametrize('span', SPAN_PARAMS)
def test_multimodel_regression_month(timeseries_cubes_month, span):
    """Run the regression test on the monthly sample data for each span."""
    multimodel_regression_test(
        cubes=timeseries_cubes_month,
        span=span,
        name='timeseries_monthly',
    )


@pytest.mark.use_sample_data
@pytest.mark.parametrize('calendar', CALENDAR_PARAMS)
@pytest.mark.parametrize('span', SPAN_PARAMS)
def test_multimodel_regression_day(timeseries_cubes_day, span, calendar):
    """Run the regression test on the daily sample data of one calendar."""
    multimodel_regression_test(
        cubes=timeseries_cubes_day[calendar],
        span=span,
        name=f'timeseries_daily_{calendar}',
    )


@pytest.mark.use_sample_data
def test_multimodel_no_vertical_dimension(timeseries_cubes_month):
    """Test statistic without vertical dimension using monthly data."""
    # Index 0 along the second axis removes that dimension from each cube
    # (presumably the vertical one, going by the test name).
    sliced_cubes = [cube[:, 0] for cube in timeseries_cubes_month]
    multimodel_test(sliced_cubes, span='full', statistic='mean')


@pytest.mark.use_sample_data
@pytest.mark.xfail(
    raises=iris.exceptions.CoordinateNotFoundError,
    reason='https://github.com/ESMValGroup/ESMValCore/issues/891')
def test_multimodel_no_horizontal_dimension(timeseries_cubes_month):
    """Test statistic without horizontal dimension using monthly data.

    Expected to fail with `iris.exceptions.CoordinateNotFoundError`; any
    other exception is reported as a regular test failure.
    """
    span = 'full'
    cubes = timeseries_cubes_month
    # Index 0 along the last two axes removes those dimensions
    # (presumably the horizontal ones, going by the test name).
    cubes = [cube[:, :, 0, 0] for cube in cubes]
    # Coordinate not found error
    # iris.exceptions.CoordinateNotFoundError:
    # 'Expected to find exactly 1 depth coordinate, but found none.'
    multimodel_test(cubes, span=span, statistic='mean')


@pytest.mark.use_sample_data
def test_multimodel_only_time_dimension(timeseries_cubes_month):
    """Test statistic with only the time dimension using monthly data."""
    # Index 0 along every axis but the first leaves one-dimensional cubes.
    time_only_cubes = [cube[:, 0, 0, 0] for cube in timeseries_cubes_month]
    multimodel_test(time_only_cubes, span='full', statistic='mean')


@pytest.mark.use_sample_data
@pytest.mark.xfail(
    raises=ValueError,
    reason='https://github.com/ESMValGroup/ESMValCore/issues/890')
def test_multimodel_no_time_dimension(timeseries_cubes_month):
    """Test statistic without time dimension using monthly data.

    Expected to fail with `ValueError`; any other exception is reported as
    a regular test failure.
    """
    span = 'full'
    cubes = timeseries_cubes_month
    # Index 0 along the first axis removes that dimension
    # (presumably time, going by the test name).
    cubes = [cube[0] for cube in cubes]
    # ValueError: Cannot guess bounds for a coordinate of length 1.
    multimodel_test(cubes, span=span, statistic='mean')
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 114b8e0

Please sign in to comment.