Handle strings in _verify_new_chunk_reuse (PyInf#11487) #338

Closed
ArvidJB opened this issue Jun 7, 2024 · 1 comment · Fixed by #348

ArvidJB (Collaborator) commented Jun 7, 2024

The check in _verify_new_chunk_reuse does not handle strings correctly:

In [1]: import h5py
   ...: import numpy as np
   ...: from versioned_hdf5 import VersionedHDF5File
   ...: # TempDirCtx is the reporter's temporary-directory helper (definition not shown)

In [3]: with TempDirCtx() as d:
   ...:     filename = d / 'data.h5'
   ...:     with h5py.File(filename, mode="w") as f:
   ...:         vf = VersionedHDF5File(f)
   ...:         with vf.stage_version("r0") as group:
   ...:             group.create_dataset(
   ...:                 "values",
   ...:                 data=np.array(['a', 'b', 'c', 'a', 'b', 'c'], dtype='O'),
   ...:                 dtype=h5py.string_dtype(length=None),
   ...:                 maxshape=(None,),
   ...:                 chunks=(3,),
   ...:             )
   ...:
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:305, in _verify_new_chunk_reuse(raw_dataset, new_data, data_hash, hashed_slice, chunk_being_written, slices_to_write, data_to_write)
    304 try:
--> 305     assert_array_equal(reused_chunk, to_be_written)
    306 except AssertionError as e:

    [... skipping hidden 1 frame]

File /opt/python/python-3.11/lib64/python3.11/contextlib.py:81, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     80 with self._recreate_cm():
---> 81     return func(*args, **kwds)

File /usr/local/python/python-3.11/std/lib64/python3.11/site-packages/numpy/testing/_private/utils.py:862, in assert_array_compare(comparison, x, y, err_msg, verbose, header, precision, equal_nan, equal_inf, strict)
    859         msg = build_err_msg([ox, oy], err_msg,
    860                             verbose=verbose, header=header,
    861                             names=('x', 'y'), precision=precision)
--> 862         raise AssertionError(msg)
    863 except ValueError:

AssertionError:
Arrays are not equal

Mismatched elements: 3 / 3 (100%)
 x: array(['a', 'b', 'c'], dtype=object)
 y: array([b'a', b'b', b'c'], dtype='|S1')

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In[3], line 5
      3 with h5py.File(filename, mode="w") as f:
      4     vf = VersionedHDF5File(f)
----> 5     with vf.stage_version("r0") as group:
      6         group.create_dataset(
      7             "values",
      8             data=np.array(['a', 'b', 'c', 'a', 'b', 'c'], dtype='O'),
   (...)
     11             chunks=(3,),
     12         )

File /opt/python/python-3.11/lib64/python3.11/contextlib.py:144, in _GeneratorContextManager.__exit__(self, typ, value, traceback)
    142 if typ is None:
    143     try:
--> 144         next(self.gen)
    145     except StopIteration:
    146         return False

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/api.py:307, in VersionedHDF5File.stage_version(self, version_name, prev_version, make_current, timestamp, verify_chunk_reuse)
    305     yield group
    306     group.close()
--> 307     commit_version(
    308         group,
    309         group.datasets(),
    310         make_current=make_current,
    311         chunks=group.chunks,
    312         compression=group.compression,
    313         compression_opts=group.compression_opts,
    314         timestamp=timestamp,
    315         verify_chunk_reuse=verify_chunk_reuse,
    316     )
    317 except:
    318     delete_version(self.f, version_name, old_current)

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/versions.py:164, in commit_version(version_group, datasets, make_current, chunks, compression, compression_opts, timestamp, verify_chunk_reuse)
    162     slices = write_dataset_chunks(f, name, data.data_dict, verify_chunk_reuse=verify_chunk_reuse)
    163 else:
--> 164     slices = write_dataset(
    165         f,
    166         name,
    167         data,
    168         chunks=chunks[name],
    169         compression=compression[name],
    170         compression_opts=compression_opts[name],
    171         fillvalue=fillvalue,
    172         verify_chunk_reuse=verify_chunk_reuse,
    173     )
    174 if shape is None:
    175     if isinstance(data, dict):

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:126, in write_dataset(f, name, data, chunks, dtype, compression, compression_opts, fillvalue, verify_chunk_reuse)
    114 def write_dataset(
    115     f,
    116     name,
   (...)
    123     verify_chunk_reuse=True,
    124 ):
    125     if name not in f["_version_data"]:
--> 126         return create_base_dataset(
    127             f,
    128             name,
    129             data=data,
    130             dtype=dtype,
    131             chunks=chunks,
    132             compression=compression,
    133             compression_opts=compression_opts,
    134             fillvalue=fillvalue,
    135         )
    137     ds = f["_version_data"][name]["raw_data"]
    138     if isinstance(chunks, int) and not isinstance(chunks, bool):

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:111, in create_base_dataset(f, name, shape, data, dtype, chunks, compression, compression_opts, fillvalue)
    100 dataset = group.create_dataset(
    101     "raw_data",
    102     shape=(0,) + chunks[1:],
   (...)
    108     fillvalue=fillvalue,
    109 )
    110 dataset.attrs["chunks"] = chunks
--> 111 return write_dataset(f, name, data, chunks=chunks)

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:193, in write_dataset(f, name, data, chunks, dtype, compression, compression_opts, fillvalue, verify_chunk_reuse)
    190     slices[data_slice] = hashed_slice
    192     if verify_chunk_reuse:
--> 193         _verify_new_chunk_reuse(
    194             raw_dataset=ds,
    195             new_data=data,
    196             data_hash=data_hash,
    197             hashed_slice=hashed_slice,
    198             chunk_being_written=data_s,
    199             slices_to_write=slices_to_write,
    200         )
    202     chunks_reused += 1
    204 else:

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:307, in _verify_new_chunk_reuse(raw_dataset, new_data, data_hash, hashed_slice, chunk_being_written, slices_to_write, data_to_write)
    305     assert_array_equal(reused_chunk, to_be_written)
    306 except AssertionError as e:
--> 307     raise ValueError(
    308         f"Hash {data_hash} of existing data chunk {reused_chunk} "
    309         f"matches the hash of new data chunk {chunk_being_written}, "
    310         "but data does not."
    311     ) from e

ValueError: Hash b'\x8b\x8a\xc9Zi\xbd\xa8\xcc\xa8\xcd:~\xf2\x1d\x81\xeb\n\xd7\xa0F9\xdb\x10\xbe\x8d[s\xc0\xf00\xb1}' of existing data chunk ['a' 'b' 'c'] matches the hash of new data chunk ['a' 'b' 'c'], but data does not.
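
The failure is reproducible with plain NumPy, independent of versioned_hdf5: the hash treats the two chunks as identical, but assert_array_equal compares elements directly, and in Python 3 a str never equals a bytes. A minimal sketch of the mismatch seen in the traceback:

    import numpy as np
    from numpy.testing import assert_array_equal

    # The reused chunk as shown in the traceback: Python strings, object dtype.
    reused = np.array(["a", "b", "c"], dtype=object)
    # The chunk about to be written, also per the traceback: fixed-width bytes.
    pending = np.array([b"a", b"b", b"c"], dtype="S1")

    # 'a' == b'a' is False in Python 3, so every element mismatches and
    # assert_array_equal raises the AssertionError that
    # _verify_new_chunk_reuse converts into the ValueError above.
    assert_array_equal(reused, pending)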

peytondmurray (Collaborator) commented Jun 21, 2024

Hmm, so it looks like the two sides of the comparison end up with different string representations: the chunk being checked for reuse holds Python strings in an object-dtype array, while the chunk about to be written holds fixed-width bytes.

 x: array(['a', 'b', 'c'], dtype=object)
 y: array([b'a', b'b', b'c'], dtype='|S1')

I think we should be able to coerce both the data to be reused and the data the user is trying to write to bytes, and that should take care of this.
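
A minimal sketch of that coercion, using a hypothetical helper (not necessarily the fix that landed in #348): encode any str elements to UTF-8 bytes on both sides before comparing.

    import numpy as np

    def _coerce_strs_to_bytes(arr):
        """Return ``arr`` with any str elements UTF-8 encoded, so that
        object/str arrays compare equal to fixed-width bytes arrays.
        Hypothetical helper illustrating the proposed coercion."""
        if arr.dtype.kind in ("O", "U"):
            flat = [s.encode("utf-8") if isinstance(s, str) else s
                    for s in arr.ravel()]
            return np.array(flat, dtype=object).reshape(arr.shape)
        return arr

    # The comparison in _verify_new_chunk_reuse would then become:
    # assert_array_equal(_coerce_strs_to_bytes(reused_chunk),
    #                    _coerce_strs_to_bytes(to_be_written))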
