From 74103f984a716db99d057d6a3b3b432f66449f27 Mon Sep 17 00:00:00 2001
From: hernot
Date: Sun, 26 Jul 2020 14:42:25 +0200
Subject: [PATCH] Implementation of Container and mixed loaders (H4EP001)

With hickle 4.0.0 the code for dumping and loading dedicated objects such as
scalar values or numpy arrays was moved to dedicated loader modules. This
first step of disentangling the hickle core machinery from object-specific
code covered all objects and structures which were mappable to h5py.Dataset
objects.

This commit provides an implementation of hickle extension proposal H4EP001
(https://github.com/telegraphic/hickle/issues/135). This proposal specifies
the extension of the loader concept introduced by hickle 4.0.0 towards
generic PyContainer-based and mixed loaders. In addition to the proposed
extension, this implementation includes the following changes to hickle
4.0.0 and H4EP001.

H4EP001:
========
The PyContainer interface includes a filter method which allows loaders,
when data is loaded, to adjust, suppress, or insert additional data subitems
of h5py.Group objects. In order to accomplish the temporary modification of
h5py.Group and h5py.Dataset objects when the file is opened in read-only
mode, the H5NodeFilterProxy class is provided. This class stores all
temporary modifications while the original h5py.Group and h5py.Dataset
objects stay unchanged.

hickle 4.0.0 / 4.0.1:
=====================
Strings and arrays of bytes are stored as Python bytearrays and not as
variable-sized strings and bytes. The benefit is that hdf5 filters and hdf5
compression filters can be applied to Python bytearrays. The downside is
that the data is stored as bytes of int8 datatype. This change affects
native Python string scalars as well as numpy arrays containing strings.

numpy masked arrays are now stored as an h5py.Group containing a dedicated
dataset each for the data and the mask.

scipy.sparse matrices are now stored as an h5py.Group containing the
datasets data, indices, indptr and shape.

Dictionary keys are now used as names for h5py.Dataset and h5py.Group
objects. Only string, bytes, int, float, complex, bool and NoneType keys are
converted to name strings; for all other keys a key-value-pair group is
created containing the key and the value as its subitems.

String and bytes keys which contain slashes are converted into key-value
pairs instead of converting slashes to backslashes. The distinction from
hickle 4.0.0 string and bytes keys with converted slashes is made by
enclosing the string value in double quotes instead of the single quotes
produced by the Python repr function or the !r and %r string format
specifiers. Consequently, on load all string keys which are enclosed in
single quotes are subjected to slash conversion, while any others are used
as is.

h5py.Group and h5py.Dataset objects whose 'base_type' refers to 'pickle'
automatically get assigned object as their py_obj_type on load; the related
'type' attribute is ignored. h5py.Dataset objects which do not expose a
'base_type' attribute are assumed to contain a pickle string and thus
implicitly get assigned the 'pickle' base type. Consequently, on dump the
'base_type' and 'type' attributes are omitted for all h5py.Dataset objects
which contain pickle strings, as their values are 'pickle' and object
respectively.

Other stuff:
============
Full separation between hickle core and loaders.

Distinct unit tests for individual loaders and hickle core.

Cleanup of functions and classes which are no longer required.

Simplification of recursion on dump and load through a self-contained loader
interface.

Hickle is capable of loading hickle 4.0.x files, which do not yet support
the PyContainer concept beyond list, tuple, dict and set, and includes
extended tests for loading hickle 4.0.x files.

Contains a fix for the lambda py_obj_type issue on numpy arrays whose single
content item is not a list or tuple object. Python 3.8 refuses to unpickle
the lambda function string; this was observed while finalizing the pull
request. The fixes are only activated when a 4.0.x file is to be loaded.

The exception thrown by load now includes the exception which triggered it,
including its stack trace, for better localization of the error during
debugging and error reporting.

The h5py version is limited to <3.x according to issue #143.
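For illustration only, a small usage sketch of the behaviours listed above
(not part of the changeset; the file name and the sample values are
arbitrary examples):

    import hickle as hkl
    import numpy as np
    import scipy.sparse

    data = {
        "path/like key": 12,       # slash in key -> stored as key-value-pair group
        ("a", "tuple"): "value",   # non-name-convertible key -> key-value-pair group
        "big_int": 2**65,          # does not fit 64 bit -> stored as ascii byte string
        "masked": np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]),  # group with data and mask
        "sparse": scipy.sparse.csr_matrix(np.eye(3)),  # group with data, indices, indptr, shape
    }

    hkl.dump(data, "example.hkl")
    loaded = hkl.load("example.hkl")
    assert loaded["big_int"] == data["big_int"]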
---
 hickle/helpers.py                     | 1 -
 hickle/hickle.py                      | 7 -------
 hickle/loaders/load_builtins.py       | 3 +--
 hickle/lookup.py                      | 3 ---
 hickle/tests/test_03_load_builtins.py | 8 ++++++++
 hickle/tests/test_hickle.py           | 5 ++---
 requirements.txt                      | 2 +-
 requirements_test.txt                 | 3 ++-
 8 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/hickle/helpers.py b/hickle/helpers.py
index 5eb93a31..9076a683 100644
--- a/hickle/helpers.py
+++ b/hickle/helpers.py
@@ -1,6 +1,5 @@
 # %% IMPORTS
 # Built-in imports
-from inspect import isclass
 import re
 import operator
 import typing
diff --git a/hickle/hickle.py b/hickle/hickle.py
index a2281800..ca0776fb 100644
--- a/hickle/hickle.py
+++ b/hickle/hickle.py
@@ -310,13 +310,6 @@ def load(file_obj, path='/', safe=True):
         h5f, path, close_flag = file_opener(file_obj, path, 'r')
         h_root_group = h5f.get(path) # Solely used by v4
 
-        # Check if h_root_group is not None
-        if h_root_group is None:
-            # If so, the given path is invalid
-            raise FileError("Input argument 'path' (%s) is not a valid path "
-                            "within the HDF5-file given by the provided "
-                            "'file_obj'!" % (path))
-
         # Define attributes h_root_group must have
         v3_attrs = ['CLASS', 'VERSION', 'PYTHON_VERSION']
         v4_attrs = ['HICKLE_VERSION', 'HICKLE_PYTHON_VERSION']
diff --git a/hickle/loaders/load_builtins.py b/hickle/loaders/load_builtins.py
index ce9c2ce6..531b548c 100644
--- a/hickle/loaders/load_builtins.py
+++ b/hickle/loaders/load_builtins.py
@@ -52,7 +52,7 @@ def create_scalar_dataset(py_obj, h_group, name, **kwargs):
     """
 
     # If py_obj is an integer and cannot be stored in 64-bits, convert to str
-    if isinstance(py_obj, int) and (py_obj.bit_length() > 64):
+    if isinstance(py_obj, int) and (py_obj.bit_length() > 63) and ( py_obj < -2**63 or py_obj >= 2**63 ) :
         return h_group.create_dataset(name,data = bytearray(str(py_obj), 'ascii'),**kwargs),()
     return h_group.create_dataset(name, data=py_obj, **no_compression(kwargs)),()
 
@@ -260,7 +260,6 @@ def load_none_dataset(h_node,base_type,py_obj_type):
 def load_list_dataset(h_node,base_type,py_obj_type):
     """
     loads any kind of list like dataset
-
     Args:
         h_node (h5py.Dataset): the hdf5 node to load data from
         base_type (bytes): bytes string denoting base_type
diff --git a/hickle/lookup.py b/hickle/lookup.py
index 19c5af7c..4eefe438 100644
--- a/hickle/lookup.py
+++ b/hickle/lookup.py
@@ -56,9 +56,6 @@
 # hickle imports
 from .helpers import PyContainer,not_dumpable,nobody_is_my_name
 
-# hickle import
-from hickle.helpers import get_mro_list
-
 # %% GLOBALS
 
 # Define dict of all acceptable types
diff --git a/hickle/tests/test_03_load_builtins.py b/hickle/tests/test_03_load_builtins.py
index e071263e..17b4c0e0 100644
--- a/hickle/tests/test_03_load_builtins.py
+++ b/hickle/tests/test_03_load_builtins.py
@@ -74,6 +74,14 @@ def test_scalar_dataset(h5_data):
     assert bytearray(h_dataset[()]) == str(non_mappable_int).encode('utf8')
     assert not [ item for item in subitems ]
     assert load_builtins.load_scalar_dataset(h_dataset,b'int',int) == non_mappable_int
+
+    # check that integer larger than 64 bit is stored as ascii byte string
+    non_mappable_neg_int = -int(-2**63-1)
+    h_dataset,subitems = load_builtins.create_scalar_dataset(non_mappable_neg_int,h5_data,"non_mappable_neg_int")
+    assert isinstance(h_dataset,h5.Dataset)
+    assert bytearray(h_dataset[()]) == str(non_mappable_neg_int).encode('utf8')
+    assert not [ item for item in subitems ]
+    assert load_builtins.load_scalar_dataset(h_dataset,b'int',int) == non_mappable_neg_int
 
 
 def test_non_dataset(h5_data):
diff --git a/hickle/tests/test_hickle.py b/hickle/tests/test_hickle.py
index 7bb787e0..7b8a98a0 100644
--- a/hickle/tests/test_hickle.py
+++ b/hickle/tests/test_hickle.py
@@ -189,11 +189,10 @@ def test_65bit_int(test_file_name):
     assert i == i_hkl
 
     j = -2**63-1
-    dump(j, 'test.hdf5')
-    j_hkl = load('test.hdf5')
+    dump(j, test_file_name)
+    j_hkl = load(test_file_name)
     assert j == j_hkl
 
-
 def test_list(test_file_name):
     """ Dumping and loading a list """
     filename, mode = 'test_list.h5', 'w'
diff --git a/requirements.txt b/requirements.txt
index eaa8fbad..5dbe831f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 dill>=0.3.0
-h5py>=2.8.0,<3.0.0
+h5py>=2.8.0,<3
 numpy>=1.8
 six>=1.11.0
diff --git a/requirements_test.txt b/requirements_test.txt
index 14882e21..36f1b43b 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -5,4 +5,5 @@ astropy>=1.3,<4.0
 scipy>=1.0.0
 pandas>=0.24.0
 check-manifest
-twine>=1.13.0
\ No newline at end of file
+twine>=1.13.0
+h5py<3
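Background note on the widened range check in create_scalar_dataset and the
new 65-bit integer tests above: a standalone sketch, not part of the patch,
and the fits_int64 helper is hypothetical. It shows the int64 boundary cases
that int.bit_length() alone cannot distinguish.

    # int.bit_length() counts the bits of the absolute value, so it cannot tell
    # -2**63 (still representable as signed 64 bit) apart from 2**63 or -2**63 - 1
    # (both out of range); hence the explicit comparison against the int64 bounds.
    INT64_MIN, INT64_MAX = -2**63, 2**63 - 1

    def fits_int64(value):
        """True if value can be stored as a signed 64-bit integer."""
        return INT64_MIN <= value <= INT64_MAX

    assert (-2**63).bit_length() == 64 and fits_int64(-2**63)               # must stay a plain dataset
    assert (2**63).bit_length() == 64 and not fits_int64(2**63)             # missed by the old '> 64' check
    assert (-2**63 - 1).bit_length() == 64 and not fits_int64(-2**63 - 1)   # missed by the old '> 64' check
    assert (2**64).bit_length() == 65 and not fits_int64(2**64)             # caught by both checks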