diff --git a/versioned_hdf5/slicetools.pyx b/versioned_hdf5/slicetools.pyx index 496e1354..5919a99c 100644 --- a/versioned_hdf5/slicetools.pyx +++ b/versioned_hdf5/slicetools.pyx @@ -11,14 +11,6 @@ from libc.stdlib cimport malloc, free from libc.string cimport strlen, strncmp from libcpp.vector cimport vector -ctypedef enum H5S_sel_type: - H5S_SEL_ERROR = -1, #Error - H5S_SEL_NONE = 0, #Nothing selected - H5S_SEL_POINTS = 1, #Sequence of points selected - H5S_SEL_HYPERSLABS = 2, #"New-style" hyperslab selection defined - H5S_SEL_ALL = 3, #Entire extent selected - H5S_SEL_N = 4 #/*THIS MUST BE LAST - cdef extern from "hdf5.h": # HDF5 types ctypedef long int hid_t @@ -27,20 +19,23 @@ cdef extern from "hdf5.h": ctypedef long long hsize_t # virtual Dataset functions - cdef herr_t H5Pget_virtual_count(hid_t dcpl_id, size_t *count) except -1 - cdef ssize_t H5Pget_virtual_dsetname(hid_t dcpl_id, size_t index, char *name, size_t size) except -1 - cdef ssize_t H5Pget_virtual_filename(hid_t dcpl_id, size_t index, char *name, size_t size) except -1 - cdef hid_t H5Pget_virtual_vspace(hid_t dcpl_id, size_t index) except -1 - cdef hid_t H5Pget_virtual_srcspace(hid_t dcpl_id, size_t index) except -1 + cdef ssize_t H5Pget_virtual_dsetname(hid_t dcpl_id, size_t index, char *name, size_t size) + cdef ssize_t H5Pget_virtual_filename(hid_t dcpl_id, size_t index, char *name, size_t size) + cdef hid_t H5Pget_virtual_vspace(hid_t dcpl_id, size_t index) + cdef hid_t H5Pget_virtual_srcspace(hid_t dcpl_id, size_t index) + + ctypedef enum H5S_sel_type: + H5S_SEL_ERROR = -1, #Error + H5S_SEL_NONE = 0, #Nothing selected + H5S_SEL_POINTS = 1, #Sequence of points selected + H5S_SEL_HYPERSLABS = 2, #"New-style" hyperslab selection defined + H5S_SEL_ALL = 3, #Entire extent selected + H5S_SEL_N = 4 #/*THIS MUST BE LAST - # TODO: this function actually returns an H5S_sel_type enum, but compilation fails - # when that's specified and we have an "except" clause. This looks like a bug in Cython? - # https://github.com/cython/cython/issues/6275 - # cdef H5S_sel_type H5Sget_select_type(hid_t space_id) except -1 - cdef int H5Sget_select_type(hid_t space_id) except -1 + cdef H5S_sel_type H5Sget_select_type(hid_t space_id) except H5S_sel_type.H5S_SEL_ERROR - cdef int H5Sget_simple_extent_ndims(hid_t space_id) except -1 - cdef htri_t H5Sget_regular_hyperslab(hid_t spaceid, hsize_t* start, hsize_t* stride, hsize_t* count, hsize_t* block) except -1 + cdef int H5Sget_simple_extent_ndims(hid_t space_id) + cdef htri_t H5Sget_regular_hyperslab(hid_t spaceid, hsize_t* start, hsize_t* stride, hsize_t* count, hsize_t* block) def spaceid_to_slice(space) -> Tuple: """ @@ -77,7 +72,11 @@ def hyperslab_to_slice(start, stride, count, block): cdef _spaceid_to_slice(space_id: hid_t): - sel_type: H5S_sel_type = H5Sget_select_type(space_id) + """ + Helper function to read the data for `space_id` selection and + convert it to a Tuple of slices. + """ + sel_type: H5S_sel_type = H5Sget_select_type(space_id) if sel_type == H5S_sel_type.H5S_SEL_ALL: return Tuple() @@ -85,13 +84,18 @@ cdef _spaceid_to_slice(space_id: hid_t): slices: list = [] rank: cython.int = H5Sget_simple_extent_ndims(space_id) + if rank < 0: + raise ValueError('Cannot determine rank of selection.') start_array: vector[hsize_t] = vector[hsize_t](rank) stride_array: vector[hsize_t] = vector[hsize_t](rank) count_array: vector[hsize_t] = vector[hsize_t](rank) block_array: vector[hsize_t] = vector[hsize_t](rank) - H5Sget_regular_hyperslab(space_id, start_array.data(), stride_array.data(), - count_array.data(), block_array.data()) + ret: htri_t = H5Sget_regular_hyperslab(space_id, start_array.data(), stride_array.data(), + count_array.data(), block_array.data()) + if ret < 0: + raise ValueError('Cannot determine hyperslab selection.') + i: cython.int start: hsize_t end: hsize_t @@ -117,65 +121,69 @@ cdef _spaceid_to_slice(space_id: hid_t): else: raise NotImplementedError("Point selections are not yet supported") -cpdef build_data_dict(dcpl, shape: tuple, chunks: tuple, raw_data_name: str): +cpdef build_data_dict(dcpl, raw_data_name: str): + """ + Function to build the "data_dict" of a versioned virtual dataset. + + All virtual datasets created by versioned-hdf5 should have chunks in + exactly one raw dataset `raw_data_name` in the same file. This function will + check that this is the case and return a dictionary mapping the `Tuple` of + the chunk in the virtual dataset to a `Slice` in the raw dataset. + + :param dcpl: the dataset creation property list of the versioned dataset + :param raw_data_name: the name of the corresponding raw dataset + :return: a dictionary mapping the `Tuple` of the virtual dataset chunk + to a `Slice` in the raw dataset. + """ data_dict: dict = {} - is_virtual: bool = dcpl.get_layout() == h5d.VIRTUAL - - if not is_virtual: - # A dataset created with only a fillvalue will be nonvirtual, - # since create_virtual_dataset makes a nonvirtual dataset when - # there are no virtual sources. - data_dict = {} - # Same as dataset.get_virtual_sources - elif 0 in shape: - # Work around https://github.com/h5py/h5py/issues/1660 - empty_idx = Tuple().expand(shape) - data_dict = {empty_idx: Slice()} - else: - data_dict = {} + with phil: + dcpl_id: hid_t = dcpl.id - with phil: - dcpl_id: hid_t = dcpl.id + virtual_count: size_t = dcpl.get_virtual_count() + j: size_t - virtual_count: size_t = dcpl.get_virtual_count() - j: size_t + raw_data_name_bytes: bytes = raw_data_name.encode('utf8') + # this a reference to the internal buffer of raw_data_name, do not free! + raw_data_str: cython.p_char = raw_data_name_bytes - raw_data_name_bytes: bytes = raw_data_name.encode('utf8') - # this a reference to the internal buffer of raw_data_name, do not free! - raw_data_str: cython.p_char = raw_data_name_bytes + filename_buf_len: ssize_t = 2 + filename_buf: cython.p_char = malloc(filename_buf_len) + if not filename_buf: + raise MemoryError('could not allocate filename_buf') - filename_buf_len: ssize_t = 2 - filename_buf: cython.p_char = malloc(filename_buf_len) - if not filename_buf: - raise MemoryError('could not allocate filename_buf') + try: + dataset_buf_len: ssize_t = strlen(raw_data_str) + 1 + dataset_buf: cython.p_char = malloc(dataset_buf_len) + if not dataset_buf: + raise MemoryError('could not allocate dataset_buf') try: - dataset_buf_len: ssize_t = strlen(raw_data_str) + 1 - dataset_buf: cython.p_char = malloc(dataset_buf_len) - if not dataset_buf: - raise MemoryError('could not allocate dataset_buf') - - try: - for j in range(virtual_count): - H5Pget_virtual_filename(dcpl_id, j, filename_buf, filename_buf_len) - if strncmp(filename_buf, ".", filename_buf_len) != 0: - raise ValueError('Virtual dataset filename mismatch, expected "."') - - H5Pget_virtual_dsetname(dcpl_id, j, dataset_buf, dataset_buf_len) - if strncmp(dataset_buf, raw_data_str, dataset_buf_len) != 0: - raise ValueError(f'Virtual dataset name mismatch, expected {raw_data_name}') - - vspace_id: hid_t = H5Pget_virtual_vspace(dcpl_id, j) - srcspace_id: hid_t = H5Pget_virtual_srcspace(dcpl_id, j) - - vspace_slice_tuple = _spaceid_to_slice(vspace_id) - srcspace_slice_tuple = _spaceid_to_slice(srcspace_id) - # the slice into the raw_data (srcspace_slice_tuple) is only on the first axis - data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0] - finally: - free(dataset_buf) + for j in range(virtual_count): + if H5Pget_virtual_filename(dcpl_id, j, filename_buf, filename_buf_len) < 0: + raise ValueError('Could not get virtual filename') + if strncmp(filename_buf, ".", filename_buf_len) != 0: + raise ValueError('Virtual dataset filename mismatch, expected "."') + + if H5Pget_virtual_dsetname(dcpl_id, j, dataset_buf, dataset_buf_len) < 0: + raise ValueError('Could not get virtual dsetname') + if strncmp(dataset_buf, raw_data_str, dataset_buf_len) != 0: + raise ValueError(f'Virtual dataset name mismatch, expected {raw_data_name}') + + vspace_id: hid_t = H5Pget_virtual_vspace(dcpl_id, j) + if vspace_id == -1: + raise ValueError('Could not get vspace_id') + srcspace_id: hid_t = H5Pget_virtual_srcspace(dcpl_id, j) + if srcspace_id == -1: + raise ValueError('Could not get srcspace_id') + + vspace_slice_tuple = _spaceid_to_slice(vspace_id) + srcspace_slice_tuple = _spaceid_to_slice(srcspace_id) + # the slice into the raw_data (srcspace_slice_tuple) is only on the first axis + data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0] finally: - free(filename_buf) + free(dataset_buf) + finally: + free(filename_buf) return data_dict \ No newline at end of file diff --git a/versioned_hdf5/wrappers.py b/versioned_hdf5/wrappers.py index 35c4e194..27b1edac 100644 --- a/versioned_hdf5/wrappers.py +++ b/versioned_hdf5/wrappers.py @@ -1429,7 +1429,21 @@ def __init__(self, _id): def data_dict(self): if self._data_dict is None: dcpl = self.get_create_plist() - self._data_dict = build_data_dict(dcpl, self._shape, self.chunks, self.raw_data.name) + + is_virtual: bool = dcpl.get_layout() == h5d.VIRTUAL + + if not is_virtual: + # A dataset created with only a fillvalue will be nonvirtual, + # since create_virtual_dataset makes a nonvirtual dataset when + # there are no virtual sources. + self._data_dict = {} + # Same as dataset.get_virtual_sources + elif 0 in self.shape: + # Work around https://github.com/h5py/h5py/issues/1660 + empty_idx = Tuple().expand(self.shape) + self._data_dict = {empty_idx: Slice()} + else: + self._data_dict = build_data_dict(dcpl, self.raw_data.name) return self._data_dict