Skip to content

Commit

Permalink
Break build_data_dict out into separate python implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
peytondmurray committed Oct 15, 2024
1 parent 8c6fa28 commit 1ebee1d
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 47 deletions.
33 changes: 33 additions & 0 deletions versioned_hdf5/data_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from h5py import h5p
from h5py._hl.base import phil

from .slicetools import _spaceid_to_slice


def build_data_dict(dcpl: h5p.PropDCID, raw_data_name: str):
    """Build the data_dict of a versioned virtual dataset.

    All virtual datasets created by versioned-hdf5 should have chunks in
    exactly one raw dataset `raw_data_name` in the same file.
    This function blindly assumes this is the case.

    :param dcpl: the dataset creation property list of the versioned dataset
    :param raw_data_name: the name of the corresponding raw dataset
        (not consulted by this implementation; retained for interface
        compatibility)
    :return: a dictionary mapping the `Tuple` of the virtual dataset chunk
        to a `Slice` in the raw dataset.
    """
    with phil:
        # For every virtual mapping registered on the dataset creation
        # property list, convert both the virtual-side and source-side
        # dataspace selections into ndindex objects.  The selection into
        # the raw dataset is only ever along the first axis, so keep just
        # the first component of the source tuple.
        return {
            _spaceid_to_slice(dcpl.get_virtual_vspace(idx).id): _spaceid_to_slice(
                dcpl.get_virtual_srcspace(idx).id
            ).args[0]
            for idx in range(dcpl.get_virtual_count())
        }
1 change: 1 addition & 0 deletions versioned_hdf5/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ py.install_sources(
'replay.py',
'versions.py',
'wrappers.py',
'data_dict.py',
],
subdir: 'versioned_hdf5',
)
Expand Down
93 changes: 47 additions & 46 deletions versioned_hdf5/slicetools.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: linetrace=True
import sys
from functools import lru_cache

Expand Down Expand Up @@ -113,7 +114,7 @@ np_haddr_t = np.longlong
NP_GE_200 = np.lib.NumpyVersion(np.__version__) >= "2.0.0"


def spaceid_to_slice(space) -> Tuple:
def spaceid_to_slice(space: h5s.SpaceID) -> Tuple:
"""
Convert an h5py spaceid object into an ndindex index

Expand Down Expand Up @@ -146,7 +147,7 @@ def hyperslab_to_slice(start, stride, count, block):


@cython.infer_types(True)
cdef _spaceid_to_slice(space_id: hid_t):
cpdef _spaceid_to_slice(space_id: hid_t):
"""
Helper function to read the data for `space_id` selection and
convert it to a Tuple of slices.
Expand Down Expand Up @@ -193,50 +194,50 @@ cdef _spaceid_to_slice(space_id: hid_t):
raise NotImplementedError("Point selections are not yet supported")


@cython.infer_types(True)
cpdef build_data_dict(dcpl, raw_data_name: str):
"""
Function to build the "data_dict" of a versioned virtual dataset.
All virtual datasets created by versioned-hdf5 should have chunks in
exactly one raw dataset `raw_data_name` in the same file.
This function blindly assumes this is the case.
:param dcpl: the dataset creation property list of the versioned dataset
:param raw_data_name: the name of the corresponding raw dataset
:return: a dictionary mapping the `Tuple` of the virtual dataset chunk
to a `Slice` in the raw dataset.
"""
data_dict = {}

with phil:
dcpl_id: hid_t = dcpl.id
virtual_count: size_t = dcpl.get_virtual_count()

for j in range(virtual_count):
vspace_id = H5Pget_virtual_vspace(dcpl_id, j)
if vspace_id == H5I_INVALID_HID:
raise HDF5Error()
try:
vspace_slice_tuple = _spaceid_to_slice(vspace_id)
finally:
if H5Sclose(vspace_id) < 0:
raise HDF5Error()

srcspace_id = H5Pget_virtual_srcspace(dcpl_id, j)
if srcspace_id == H5I_INVALID_HID:
raise HDF5Error()
try:
srcspace_slice_tuple = _spaceid_to_slice(srcspace_id)
finally:
if H5Sclose(srcspace_id) < 0:
raise HDF5Error()

# the slice into the raw_data (srcspace_slice_tuple) is only
# on the first axis
data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0]

return data_dict
# @cython.infer_types(True)
# cpdef build_data_dict(dcpl, raw_data_name: str):
# """
# Function to build the "data_dict" of a versioned virtual dataset.
#
# All virtual datasets created by versioned-hdf5 should have chunks in
# exactly one raw dataset `raw_data_name` in the same file.
# This function blindly assumes this is the case.
#
# :param dcpl: the dataset creation property list of the versioned dataset
# :param raw_data_name: the name of the corresponding raw dataset
# :return: a dictionary mapping the `Tuple` of the virtual dataset chunk
# to a `Slice` in the raw dataset.
# """
# data_dict = {}
#
# with phil:
# dcpl_id: hid_t = dcpl.id
# virtual_count: size_t = dcpl.get_virtual_count()
#
# for j in range(virtual_count):
# vspace_id = H5Pget_virtual_vspace(dcpl_id, j)
# if vspace_id == H5I_INVALID_HID:
# raise HDF5Error()
# try:
# vspace_slice_tuple = _spaceid_to_slice(vspace_id)
# finally:
# if H5Sclose(vspace_id) < 0:
# raise HDF5Error()
#
# srcspace_id = H5Pget_virtual_srcspace(dcpl_id, j)
# if srcspace_id == H5I_INVALID_HID:
# raise HDF5Error()
# try:
# srcspace_slice_tuple = _spaceid_to_slice(srcspace_id)
# finally:
# if H5Sclose(srcspace_id) < 0:
# raise HDF5Error()
#
# # the slice into the raw_data (srcspace_slice_tuple) is only
# # on the first axis
# data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0]
#
# return data_dict


cdef Exception HDF5Error():
Expand Down
4 changes: 3 additions & 1 deletion versioned_hdf5/wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
)

from .backend import DEFAULT_CHUNK_SIZE
from .slicetools import build_data_dict

# from .slicetools import build_data_dict
from .data_dict import build_data_dict
from .subchunk_map import as_subchunk_map

_groups = WeakValueDictionary({})
Expand Down

0 comments on commit 1ebee1d

Please sign in to comment.