Skip to content

Commit

Permalink
Break build_data_dict out into separate python implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
peytondmurray committed Oct 15, 2024
1 parent 8c6fa28 commit 1ebee1d
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 47 deletions.
33 changes: 33 additions & 0 deletions versioned_hdf5/data_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from h5py import h5p
from h5py._hl.base import phil

from .slicetools import _spaceid_to_slice


def build_data_dict(dcpl: h5p.PropDCID, raw_data_name: str):
    """Build the data_dict of a versioned virtual dataset.

    All virtual datasets created by versioned-hdf5 should have chunks in
    exactly one raw dataset `raw_data_name` in the same file.
    This function blindly assumes this is the case.

    :param dcpl: the dataset creation property list of the versioned dataset
    :param raw_data_name: the name of the corresponding raw dataset
        (not consulted by this implementation; retained for interface
        compatibility)
    :return: a dictionary mapping the `Tuple` of the virtual dataset chunk
        to a `Slice` in the raw dataset.
    """
    with phil:
        # For every virtual mapping registered on the dataset creation
        # property list, convert both the virtual-side and source-side
        # dataspace selections into ndindex objects.  The selection into
        # the raw dataset is only ever along the first axis, so keep just
        # the first component of the source tuple.
        return {
            _spaceid_to_slice(dcpl.get_virtual_vspace(idx).id): _spaceid_to_slice(
                dcpl.get_virtual_srcspace(idx).id
            ).args[0]
            for idx in range(dcpl.get_virtual_count())
        }
1 change: 1 addition & 0 deletions versioned_hdf5/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ py.install_sources(
'replay.py',
'versions.py',
'wrappers.py',
'data_dict.py',
],
subdir: 'versioned_hdf5',
)
Expand Down
93 changes: 47 additions & 46 deletions versioned_hdf5/slicetools.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: linetrace=True
import sys
from functools import lru_cache

Expand Down Expand Up @@ -113,7 +114,7 @@ np_haddr_t = np.longlong
NP_GE_200 = np.lib.NumpyVersion(np.__version__) >= "2.0.0"


def spaceid_to_slice(space) -> Tuple:
def spaceid_to_slice(space: h5s.SpaceID) -> Tuple:
"""
Convert an h5py spaceid object into an ndindex index

Expand Down Expand Up @@ -146,7 +147,7 @@ def hyperslab_to_slice(start, stride, count, block):


@cython.infer_types(True)
cdef _spaceid_to_slice(space_id: hid_t):
cpdef _spaceid_to_slice(space_id: hid_t):
"""
Helper function to read the data for `space_id` selection and
convert it to a Tuple of slices.
Expand Down Expand Up @@ -193,50 +194,50 @@ cdef _spaceid_to_slice(space_id: hid_t):
raise NotImplementedError("Point selections are not yet supported")


@cython.infer_types(True)
cpdef build_data_dict(dcpl, raw_data_name: str):
"""
Function to build the "data_dict" of a versioned virtual dataset.
All virtual datasets created by versioned-hdf5 should have chunks in
exactly one raw dataset `raw_data_name` in the same file.
This function blindly assumes this is the case.
:param dcpl: the dataset creation property list of the versioned dataset
:param raw_data_name: the name of the corresponding raw dataset
:return: a dictionary mapping the `Tuple` of the virtual dataset chunk
to a `Slice` in the raw dataset.
"""
data_dict = {}

with phil:
dcpl_id: hid_t = dcpl.id
virtual_count: size_t = dcpl.get_virtual_count()

for j in range(virtual_count):
vspace_id = H5Pget_virtual_vspace(dcpl_id, j)
if vspace_id == H5I_INVALID_HID:
raise HDF5Error()
try:
vspace_slice_tuple = _spaceid_to_slice(vspace_id)
finally:
if H5Sclose(vspace_id) < 0:
raise HDF5Error()

srcspace_id = H5Pget_virtual_srcspace(dcpl_id, j)
if srcspace_id == H5I_INVALID_HID:
raise HDF5Error()
try:
srcspace_slice_tuple = _spaceid_to_slice(srcspace_id)
finally:
if H5Sclose(srcspace_id) < 0:
raise HDF5Error()

# the slice into the raw_data (srcspace_slice_tuple) is only
# on the first axis
data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0]

return data_dict
# @cython.infer_types(True)
# cpdef build_data_dict(dcpl, raw_data_name: str):
# """
# Function to build the "data_dict" of a versioned virtual dataset.
#
# All virtual datasets created by versioned-hdf5 should have chunks in
# exactly one raw dataset `raw_data_name` in the same file.
# This function blindly assumes this is the case.
#
# :param dcpl: the dataset creation property list of the versioned dataset
# :param raw_data_name: the name of the corresponding raw dataset
# :return: a dictionary mapping the `Tuple` of the virtual dataset chunk
# to a `Slice` in the raw dataset.
# """
# data_dict = {}
#
# with phil:
# dcpl_id: hid_t = dcpl.id
# virtual_count: size_t = dcpl.get_virtual_count()
#
# for j in range(virtual_count):
# vspace_id = H5Pget_virtual_vspace(dcpl_id, j)
# if vspace_id == H5I_INVALID_HID:
# raise HDF5Error()
# try:
# vspace_slice_tuple = _spaceid_to_slice(vspace_id)
# finally:
# if H5Sclose(vspace_id) < 0:
# raise HDF5Error()
#
# srcspace_id = H5Pget_virtual_srcspace(dcpl_id, j)
# if srcspace_id == H5I_INVALID_HID:
# raise HDF5Error()
# try:
# srcspace_slice_tuple = _spaceid_to_slice(srcspace_id)
# finally:
# if H5Sclose(srcspace_id) < 0:
# raise HDF5Error()
#
# # the slice into the raw_data (srcspace_slice_tuple) is only
# # on the first axis
# data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0]
#
# return data_dict


cdef Exception HDF5Error():
Expand Down
4 changes: 3 additions & 1 deletion versioned_hdf5/wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
)

from .backend import DEFAULT_CHUNK_SIZE
from .slicetools import build_data_dict

# from .slicetools import build_data_dict
from .data_dict import build_data_dict
from .subchunk_map import as_subchunk_map

_groups = WeakValueDictionary({})
Expand Down

0 comments on commit 1ebee1d

Please sign in to comment.