Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow SZIP compression options #167

Merged
merged 2 commits into from
Oct 9, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 57 additions & 15 deletions src/hdmf/backends/hdf5/h5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import Iterable # Python 2.7
from six import binary_type, text_type
from h5py import Group, Dataset, RegionReference, Reference, special_dtype
from h5py import filters as h5py_filters
import json
import numpy as np
import warnings
Expand Down Expand Up @@ -233,7 +234,7 @@ class H5DataIO(DataIO):
'http://docs.h5py.org/en/latest/high/dataset.html#dataset-compression',
'default': None},
{'name': 'compression_opts',
'type': int,
'type': (int, tuple),
'doc': 'Parameter for compression filter',
'default': None},
{'name': 'fillvalue',
Expand Down Expand Up @@ -277,25 +278,66 @@ def __init__(self, **kwargs):
# Define the maxshape of the data if not provided by the user
if 'maxshape' not in self.__iosettings:
self.__iosettings['maxshape'] = self.data.maxshape
# Make default settings when compression set to bool (True/False)
if isinstance(self.__iosettings.get('compression', None), bool):
if self.__iosettings['compression']:
self.__iosettings['compression'] = 'gzip'
else:
self.__iosettings.pop('compression', None)
if 'compression_opts' in self.__iosettings:
warnings.warn('Compression disabled by compression=False setting. ' +
'compression_opts parameter will, therefore, be ignored.')
self.__iosettings.pop('compression_opts', None)
# Validate the compression options used
self._check_compression_options()
# Confirm that the compressor is supported by h5py
if not self.filter_available(self.__iosettings.get('compression', None)):
raise ValueError("%s compression not support by this version of h5py." %
str(self.__iosettings['compression']))
# Check possible parameter collisions
if isinstance(self.data, Dataset):
for k in self.__iosettings.keys():
warnings.warn("%s in H5DataIO will be ignored with H5DataIO.data being an HDF5 dataset" % k)

def _check_compression_options(self):
"""
Internal helper function used to check if compression options are compliant
with the compression filter used.

:raises ValueError: If incompatible options are detected
"""
if 'compression' in self.__iosettings:
if isinstance(self.__iosettings['compression'], bool):
if self.__iosettings['compression']:
self.__iosettings['compression'] = 'gzip'
else:
self.__iosettings.pop('compression', None)
if 'compression_opts' in self.__iosettings:
warnings.warn('Compression disabled by compression=False setting. ' +
'compression_opts parameter will, therefore, be ignored.')
self.__iosettings.pop('compression_opts', None)
if 'compression' in self.__iosettings:
if 'compression_opts' in self.__iosettings:
if self.__iosettings['compression'] == 'gzip':
if self.__iosettings['compression_opts'] not in range(10):
raise ValueError("GZIP compression_opts setting must be an integer from 0-9, " +
"not " + str(self.__iosettings['compression_opts']))
elif self.__iosettings['compression'] == 'lzf':
if self.__iosettings['compression_opts'] is not None:
raise ValueError("LZF compression filter accepts no compression_opts")
elif self.__iosettings['compression'] == 'szip':
if not isinstance(self.__iosettings['compression_opts'], tuple) or \
len(self.__iosettings['compression_opts']) != 2:
raise ValueError("SZIP compression filter compression_opts" +
" must be a 2-tuple ('ec'|'nn', even integer 0-32")
Comment on lines +319 to +322
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super minor, but regarding code style, I'm pretty sure the backslash and plus symbols are not necessary.

PEP8 says: "The preferred way of wrapping long lines is by using Python's implied line continuation inside parentheses, brackets and braces. Long lines can be broken over multiple lines by wrapping expressions in parentheses. These should be used in preference to using a backslash for line continuation."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need an ending parenthesis inside the last quote marker on line 322

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we check for gzip compression opts being an int from 0-9, can we also check for the szip compression opts being ('ec'|'nn', even integer 0-32)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the issue with 322 is?

szip compression opts being ('ec'|'nn', even integer 0-32)?

sure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a syntax error - but within the string, the "(" before "'ec'" needs a corresponding ")".

# Warn if compressor other than gzip is being used
if self.__iosettings['compression'] != 'gzip':
warnings.warn(str(self.__iosettings['compression']) + " compression may not be available" +
"on all installations of HDF5. Use of gzip is recommended to ensure portability of" +
"the generated HDF5 files.")
# Check possible parameter collisions
if isinstance(self.data, Dataset):
for k in self.__iosettings.keys():
warnings.warn("%s in H5DataIO will be ignored with H5DataIO.data being an HDF5 dataset" % k)

@staticmethod
def filter_available(filter):
"""
Check if a given I/O filter is available
:param filter: String with the name of the filter, e.g., gzip, szip etc.
:type filter: String
:return: bool indicating wether the given filter is available
"""
if filter is not None:
return filter in h5py_filters.encode
else:
return True

@property
def link_data(self):
Expand Down
85 changes: 78 additions & 7 deletions tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from hdmf.spec.catalog import SpecCatalog
from hdmf.container import Container
from h5py import SoftLink, HardLink, ExternalLink, File
from h5py import filters as h5py_filters

from tests.unit.test_utils import Foo, FooBucket, CORE_NAMESPACE

Expand Down Expand Up @@ -136,7 +137,7 @@ def test_write_dataset_list(self):
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a))

def test_write_dataset_list_compress(self):
def test_write_dataset_list_compress_gzip(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
compression='gzip',
compression_opts=5,
Expand All @@ -150,6 +151,35 @@ def test_write_dataset_list_compress(self):
self.assertEqual(dset.shuffle, True)
self.assertEqual(dset.fletcher32, True)

@unittest.skipIf("lzf" not in h5py_filters.encode,
"LZF compression not supported in this h5py library install")
def test_write_dataset_list_compress_lzf(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
compression='lzf',
shuffle=True,
fletcher32=True)
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={}))
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a.data))
self.assertEqual(dset.compression, 'lzf')
self.assertEqual(dset.shuffle, True)
self.assertEqual(dset.fletcher32, True)

@unittest.skipIf("szip" not in h5py_filters.encode,
"SZIP compression not supported in this h5py library install")
def test_write_dataset_list_compress_szip(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
compression='szip',
compression_opts=('ec', 16),
shuffle=True,
fletcher32=True)
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={}))
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a.data))
self.assertEqual(dset.compression, 'lzf')
oruebel marked this conversation as resolved.
Show resolved Hide resolved
self.assertEqual(dset.shuffle, True)
self.assertEqual(dset.fletcher32, True)

def test_write_dataset_list_enable_default_compress(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
compression=True)
Expand Down Expand Up @@ -411,19 +441,60 @@ def test_warning_on_non_gzip_compression(self):
compression='gzip')
self.assertEqual(len(w), 0)
self.assertEqual(dset.io_settings['compression'], 'gzip')
# Make sure no warning is issued when using szip
with warnings.catch_warnings(record=True) as w:
dset = H5DataIO(np.arange(30),
compression='szip')
self.assertEqual(len(w), 1)
self.assertEqual(dset.io_settings['compression'], 'szip')
# Make sure no warning is issued when using szip (if installed)
if "szip" in h5py_filters.encode:
with warnings.catch_warnings(record=True) as w:
dset = H5DataIO(np.arange(30),
compression='szip',
compression_opts=('ec', 16))
self.assertEqual(len(w), 1)
self.assertEqual(dset.io_settings['compression'], 'szip')
else:
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='szip', compression_opts=('ec', 16))
# Make sure no warning is issued when using lzf
with warnings.catch_warnings(record=True) as w:
dset = H5DataIO(np.arange(30),
compression='lzf')
self.assertEqual(len(w), 1)
self.assertEqual(dset.io_settings['compression'], 'lzf')

def test_error_on_unsupported_compression_filter(self):
# Make sure gzip does not raise an error
try:
H5DataIO(np.arange(30), compression='gzip', compression_opts=5)
except ValueError:
self.fail("Using gzip compression raised a ValueError when it should not")
# Make sure szip raises an error if not installed (or does not raise an error if installed)
if "szip" not in h5py_filters.encode:
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='szip', compression_opts=('ec', 16))
else:
try:
H5DataIO(np.arange(30), compression='szip', compression_opts=('ec', 16))
except ValueError:
self.fail("SZIP is installed but H5DataIO still raises an error")
# Test error on illegal (i.e., a made-up compressor)
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression="my_compressor_that_h5py_doesnt_know")

def test_value_error_on_incompatible_compression_opts(self):
# Make sure we warn when gzip with szip compression options is used
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='gzip', compression_opts=('ec', 16))
# Make sure we warn if gzip with a too high agression is used
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='gzip', compression_opts=100)
# Make sure we warn if lzf with gzip compression option is used
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='lzf', compression_opts=5)
# Make sure we warn if lzf with szip compression option is used
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='lzf', compression_opts=('ec', 16))
# Make sure we warn if szip with gzip compression option is used
with self.assertRaises(ValueError):
H5DataIO(np.arange(30), compression='szip', compression_opts=4)

def test_warning_on_linking_of_regular_array(self):
with warnings.catch_warnings(record=True) as w:
dset = H5DataIO(np.arange(30),
Expand Down