-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Start building out rtofs routines * Start adding rtofs to ingest workflow * Working ingest lambda with rtofs * Add RTOFS aggregation to ingest tools * Working rtofs aggregation workflow
- Loading branch information
1 parent
c4ef203
commit 99c10cb
Showing
8 changed files
with
243 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import fsspec | ||
import ujson | ||
from kerchunk.hdf import SingleHdf5ToZarr | ||
|
||
|
||
def generate_kerchunked_nc(bucket: str, key: str, dest_key: str, dest_bucket: str, dest_prefix: str):
    '''
    Kerchunk a single netcdf file stored in s3 into a zarr reference JSON.

    Reads s3://{bucket}/{key} anonymously, indexes it with SingleHdf5ToZarr,
    and writes the resulting reference JSON to
    s3://{dest_bucket}/{dest_prefix}/{dest_key} with the caller's credentials.
    Keys without a .nc suffix are skipped silently (with a log message).
    '''
    if not key.endswith('.nc'):
        print(f'File {key} does not have a netcdf file postfix. Skipping...')
        return

    # For now SSL false is solving my cert issues **shrug**
    fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True, use_ssl=False)
    fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True, use_ssl=False)

    url = f"s3://{bucket}/{key}"
    outurl = f"s3://{dest_bucket}/{dest_prefix}/{dest_key}"

    with fs_read.open(url) as infile:
        print(f"Kerchunking netcdf at {url}")
        try:
            chunks = SingleHdf5ToZarr(infile, url)
        except Exception as e:
            # Best-effort: a single bad file should not kill the ingest run
            print(f'Failed to kerchunk {url}: {e}')
            return

        print(f"Writing kerchunked json to {outurl}")
        serialized = ujson.dumps(chunks.translate())
        with fs_write.open(outurl, mode="w") as outfile:
            outfile.write(serialized)

        print(f'Successfully processed {url}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
import re | ||
import datetime | ||
from typing import Tuple | ||
|
||
import fsspec | ||
import ujson | ||
from kerchunk.combine import MultiZarrToZarr | ||
|
||
from .generic import generate_kerchunked_nc | ||
|
||
|
||
def generate_rtofs_output_key(key: str) -> str:
    '''
    Generate the output file key for a given input key and destination bucket and prefix:
    'rtofs.20230922/rtofs_glo_2ds_f001_diag.nc'
    The following output key will be generated: rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr'

    Fix: the original returned a garbled literal instead of interpolating the
    filename component, so every output key collided and lost the file name.
    '''
    components = key.split('/')
    # Second-to-last path segment is the model-run folder (rtofs.YYYYMMDD),
    # last segment is the netcdf filename.
    model_date = components[-2]
    filename = components[-1]
    return f'{model_date}.{filename}.zarr'
|
||
|
||
def generate_kerchunked_rtofs_nc(region: str, bucket: str, key: str, dest_bucket: str, dest_prefix: str):
    '''
    Generate a kerchunked zarr file from a netcdf file in s3
    '''
    # Flatten rtofs.YYYYMMDD/<file>.nc into a single destination key, then
    # delegate the actual kerchunking to the generic helper.
    output_key = generate_rtofs_output_key(key)
    generate_kerchunked_nc(bucket, key, output_key, dest_bucket, dest_prefix)
|
||
|
||
def generate_rtofs_best_time_series_glob_expression(key: str) -> str:
    '''
    Parse the glob prefix and postfix given the zarr single file key:
    'rtofs/rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr'
    The following expression will be created: rtofs/rtofs.*.rtofs_glo_2ds_f*_diag.nc.zarr'

    Raises AttributeError if the key does not match the RTOFS naming convention
    (re.search returns None and .groups() fails) — callers catch this and skip.
    '''
    # Fix: the literal dots around the 8-digit datestamp are now escaped.
    # Unescaped '.' matched any character, which could mis-split keys that
    # happen to contain other 8-digit runs.
    prefix, inner, postfix = re.search(r'(.*)\.\d{8}\.(.*)_f\d{3}_(.*)', key).groups()
    return f'{prefix}.*.{inner}_f*_{postfix}'
|
||
|
||
def parse_rtofs_model_run_datestamp_offset(key: str) -> Tuple[str, int]:
    '''
    Parse the model run forecast time key from the key of the file in the RTOFS S3 bucket, given the RTOFS naming convention:
    'rtofs/rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr'
    where the model_date is 20230922 and the offset is 1, this would result in a key of 20230922T01
    '''
    date_str, offset_str = re.search(r'(\d{8}).*f(\d{3})', key).groups()
    hours = int(offset_str)
    # Model runs start at hour 00; add the forecast offset to get the
    # valid-time stamp for this file.
    run_start = datetime.datetime.strptime(f'{date_str}T00', '%Y%m%dT%H')
    valid_time = run_start + datetime.timedelta(hours=hours)
    return valid_time.strftime('%Y%m%dT%H'), hours
|
||
|
||
def generate_rtofs_best_timeseries_key(best_timeseries_glob: str) -> str:
    '''
    Create the best time series key for a given glob expression:
    'rtofs/rtofs.*.rtofs_glo_2ds_f*_diag.nc.zarr'
    The following key will be generated: rtofs/rtofs.rtofs_glo_2ds_diag.best.nc.zarr'
    '''
    # Strip the glob wildcards, then tag the extension as a "best" aggregation.
    key = best_timeseries_glob
    for old, new in (('.*', ''), ('_f*', ''), ('.nc.zarr', '.best.nc.zarr')):
        key = key.replace(old, new)
    return key
|
||
|
||
def generate_kerchunked_rtofs_best_time_series(region: str, bucket: str, key: str):
    '''
    Generate or update the best time series kerchunked aggregation for the model. If the specified file is not in the best time series,
    then the best time series aggregation will not be updated
    '''
    print(f'Generating best time series multizarr aggregation for key: {key}')

    try:
        best_time_series_glob = generate_rtofs_best_time_series_glob_expression(key)
    except Exception as e:
        print(f'Failed to parse model run date and hour from key {key}: {e}. Skipping...')
        return

    # For now SSL false is solving my cert issues **shrug**
    fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True, use_ssl=False)
    fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True, use_ssl=False)

    matches = fs_read.glob(f's3://{bucket}/{best_time_series_glob}')
    model_files = sorted('s3://' + m for m in matches)

    # For each valid-time stamp, keep only the file with the smallest forecast
    # offset — i.e. the freshest model run covering that hour.
    best_per_stamp = {}
    for filepath in model_files:
        stamp, offset = parse_rtofs_model_run_datestamp_offset(filepath)
        current = best_per_stamp.get(stamp)
        if current is None or offset < current[0]:
            best_per_stamp[stamp] = (offset, filepath)

    model_best_files = [entry[1] for entry in best_per_stamp.values()]

    target_key = f's3://{bucket}/{key}'
    if target_key not in model_best_files:
        print(f'{key} is not a part of the current best time series for its model. Skipping...')
        return

    model_run_file_count = len(model_best_files)
    print(f'Aggregating {model_run_file_count} model files for best time series aggregation...')

    # TODO: Generalize this somehow?
    mzz = MultiZarrToZarr(
        model_best_files,
        remote_protocol='s3',
        remote_options={'anon': True, 'use_ssl': False},
        concat_dims=['MT'],
        identical_dims=['Y', 'X', 'Latitude', 'Longitude']
    )

    aggregation = mzz.translate()

    outkey = generate_rtofs_best_timeseries_key(best_time_series_glob)
    outurl = f's3://{bucket}/{outkey}'

    print(f'Writing zarr best time series aggregation to {outurl}')
    with fs_write.open(outurl, 'w') as ofile:
        ofile.write(ujson.dumps(aggregation))

    print(f'Successfully updated {outurl} RTOFS best time series aggregation')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from ingest_tools.rtofs import generate_rtofs_output_key, generate_rtofs_best_time_series_glob_expression, parse_rtofs_model_run_datestamp_offset, generate_rtofs_best_timeseries_key | ||
|
||
|
||
def test_generate_rtofs_output_key():
    # rtofs.YYYYMMDD/<file>.nc should flatten into rtofs.YYYYMMDD.<file>.nc.zarr
    result = generate_rtofs_output_key('rtofs.20230922/rtofs_glo_2ds_f001_diag.nc')
    assert result == 'rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr'
|
||
|
||
def test_generate_rtofs_best_time_series_glob_expression():
    # Datestamp and forecast-hour fields should be replaced with glob wildcards.
    cases = [
        ('rtofs/rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr',
         'rtofs/rtofs.*.rtofs_glo_2ds_f*_diag.nc.zarr'),
        ('rtofs/rtofs.20230925.rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc.zarr',
         'rtofs/rtofs.*.rtofs_glo_3dz_f*_6hrly_hvr_US_east.nc.zarr'),
    ]
    for input_key, expected in cases:
        assert generate_rtofs_best_time_series_glob_expression(input_key) == expected
|
||
|
||
def test_parse_rtofs_model_run_datestamp_offset():
    # The valid-time stamp is the run date plus the fNNN forecast offset.
    cases = [
        ('rtofs/rtofs.20230922.rtofs_glo_2ds_f001_diag.nc.zarr', '20230922T01', 1),
        ('rtofs/rtofs.20230925.rtofs_glo_3dz_f006_6hrly_hvr_US_east.nc.zarr', '20230925T06', 6),
    ]
    for input_key, expected_stamp, expected_offset in cases:
        stamp, offset = parse_rtofs_model_run_datestamp_offset(input_key)
        assert stamp == expected_stamp
        assert offset == expected_offset
|
||
|
||
def test_generate_best_timeseries_key():
    # Wildcards are stripped and the extension becomes .best.nc.zarr.
    cases = [
        ('rtofs/rtofs.*.rtofs_glo_2ds_f*_diag.nc.zarr',
         'rtofs/rtofs.rtofs_glo_2ds_diag.best.nc.zarr'),
        ('rtofs/rtofs.*.rtofs_glo_3dz_f*_6hrly_hvr_US_east.nc.zarr',
         'rtofs/rtofs.rtofs_glo_3dz_6hrly_hvr_US_east.best.nc.zarr'),
    ]
    for glob_expression, expected in cases:
        assert generate_rtofs_best_timeseries_key(glob_expression) == expected