Fix nondefault compression handling #358

Merged
3 changes: 2 additions & 1 deletion .github/workflows/tests.yml
@@ -54,10 +54,11 @@ jobs:
set -xe
h5cc -showconfig

-- name: Install versioned-hdf5 test packages
+- name: Install versioned-hdf5 test packages and extra h5py compression types
run: |
set -xe
pip install .[test]
pip install hdf5plugin tables

- name: Run Tests
run: |
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -96,7 +96,7 @@ ignore = [

[project.optional-dependencies]
dev = ["pre-commit>=3.6.0", 'cython', 'meson-python', 'setuptools-scm']
-test = ["pytest", "pytest-env", "hypothesis"]
+test = ["pytest", "pytest-env", "hypothesis", "packaging"]
doc = ["sphinx", "sphinx-multiversion", "myst-parser"]

[tool.setuptools_scm]
32 changes: 27 additions & 5 deletions versioned_hdf5/backend.py
@@ -1,9 +1,10 @@
import logging
import os
-from typing import Dict, Optional
+import textwrap
+from typing import Dict, Iterator, Optional

import numpy as np
-from h5py import Dataset, VirtualLayout, VirtualSource, h5s
+from h5py import Dataset, VirtualLayout, VirtualSource, h5s, h5z
from h5py._hl.filters import guess_chunk
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal
@@ -154,12 +155,18 @@ def write_dataset(

if (
compression
-and compression != ds.compression
+and compression not in ds._filters
or compression_opts
-and compression_opts != ds.compression_opts
+and compression_opts != ds._filters[ds.compression]
):
available_filters = textwrap.indent(
"\n".join(str(filter) for filter in get_available_filters()), " "
)
raise ValueError(
-"Compression options can only be specified for the first version of a dataset"
+"Compression options can only be specified for the first version of a dataset.\n"
+f"Dataset: {name}\n"
+f"Current filters: {ds._filters}\n"
+f"Available hdf5 compression types:\n{available_filters}"
)
if fillvalue is not None and fillvalue != ds.fillvalue:
dtype = ds.dtype
@@ -513,3 +520,18 @@ def create_virtual_dataset(
virtual_data.attrs["raw_data"] = raw_data.name
virtual_data.attrs["chunks"] = raw_data.chunks
return virtual_data


def get_available_filters() -> Iterator[int]:
"""Yield the IDs of all HDF5 filters available to h5py.

Returns
-------
Iterator[int]
Filter ID numbers. Each filter has a dedicated ID; see the
documentation for the particular filter for more information.
"""
for i in range(65536):
if h5z.filter_avail(i):
yield i
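
For readers unfamiliar with HDF5 filter IDs, here is a minimal usage sketch of the check the new helper performs. Which IDs appear depends entirely on the environment; importing hdf5plugin, if installed, registers extra filters such as blosc (32001):

```python
# Enumerate the HDF5 filters currently available to h5py -- the same
# h5z.filter_avail() check that get_available_filters() loops over.
from h5py import h5z

available = [i for i in range(65536) if h5z.filter_avail(i)]
print(available)  # e.g. [1, 2, 3, ..., 32000, 32001, ...] depending on plugins
```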
4 changes: 2 additions & 2 deletions versioned_hdf5/meson.build
@@ -11,7 +11,7 @@ py.install_sources(
subdir: 'versioned_hdf5',
)

-slicetools_deps = [
+compiled_deps = [
dependency('hdf5'),
dependency('mpi'),
]
@@ -23,7 +23,7 @@ py.extension_module(
],
install: true,
subdir: 'versioned_hdf5',
-dependencies: slicetools_deps,
+dependencies: compiled_deps,
cython_args: ['--cplus'],
override_options : ['cython_language=cpp'],
)
17 changes: 17 additions & 0 deletions versioned_hdf5/replay.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import gc
import logging
import posixpath
from copy import deepcopy
from typing import Any, Dict, Iterable, List, Optional, Union
@@ -35,6 +36,8 @@
_groups,
)

logger = logging.getLogger(__name__)


def recreate_dataset(f, name, newf, callback=None):
"""
@@ -88,6 +91,20 @@ def recreate_dataset(f, name, newf, callback=None):
chunks = dataset.chunks
compression = dataset.compression
compression_opts = dataset.compression_opts

if compression is None and getattr(dataset, "_filters", None):
# If the dataset uses nondefault compression, there's no way of
# knowing whether its first filter is an actual compression filter
# or some other kind of filter, so we assume it is the dataset's
# compression and warn the user about that assumption.
compression = list(dataset._filters)[0]
compression_opts = dataset._filters[compression]
logger.warning(
"No default compression detected in this dataset. "
f"Using first filter {compression} and options "
f"{compression_opts} for compression."
)

fillvalue = dataset.fillvalue
attrs = dataset.attrs
if first:
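
To make the warning above concrete, here is a hedged sketch of how a nondefault filter shows up on an h5py dataset: `compression` reads as `None`, and the filter pipeline is only visible through the private `_filters` mapping, which is why `recreate_dataset` has to guess. The file name and options tuple below are placeholders:

```python
# Sketch: a dataset written with blosc (filter ID 32001). Assumes
# hdf5plugin is installed; "example.h5" and the opts tuple are placeholders.
import h5py
import hdf5plugin  # noqa: F401  (importing registers blosc with HDF5)
import numpy as np

with h5py.File("example.h5", "w") as f:
    f.create_dataset(
        "data",
        data=np.arange(10),
        compression=32001,
        compression_opts=(0, 0, 0, 0, 7, 1, 2),
    )
    print(f["data"].compression)  # None: 32001 is not a named h5py compression
    print(f["data"]._filters)     # e.g. {'32001': (0, 0, 0, 0, 7, 1, 2)}
```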
66 changes: 66 additions & 0 deletions versioned_hdf5/tests/test_api.py
@@ -1,4 +1,5 @@
import datetime
import importlib.metadata
import itertools
import logging
import os
@@ -8,8 +9,10 @@

import h5py
import numpy as np
import pytest
from h5py._hl.filters import guess_chunk
from numpy.testing import assert_equal
from packaging.version import Version
from pytest import mark, raises

from ..api import VersionedHDF5File
@@ -2863,3 +2866,66 @@ def test_verify_string_chunk_reuse_bytes_one_dimensional(tmp_path):
f["_version_data/values/raw_data"][:].astype(object),
np.array([b"a", b"b", b"c"]).astype(object),
)


@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_other_compression_bad_value(tmp_path, library):
"""Test that an invalid compression type is rejected."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)
path = tmp_path / "tmp.h5"
with h5py.File(path, "w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as sv, pytest.raises(ValueError, match="invalid"):
sv.create_dataset(
"values",
data=np.arange(10),
compression=-1,
compression_opts=(0, 0, 0, 0, 7, 1, 2),
)


@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_other_compression_validates(tmp_path, library):
"""Test that a nondefault compression type is accepted and stored correctly."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)

path = tmp_path / "tmp.h5"
with h5py.File(path, "w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as sv:
sv.create_dataset(
"values",
data=np.arange(10),
compression=32001,
compression_opts=(0, 0, 0, 0, 7, 1, 2),
)

with h5py.File(path, "r+") as f:
assert f["_version_data/versions/r0/values"].compression is None
raw_data = f["_version_data/values/raw_data"]
assert raw_data.compression is None
assert "32001" in raw_data._filters

# First four numbers are reserved for blosc compression;
# others are actual compression options
assert raw_data._filters["32001"][4:] == (7, 1, 2)
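
As an aside on the opts tuple used in this test: hdf5plugin's `Blosc` helper builds the same `(compression, compression_opts)` pair, which makes the seven slots easier to read. A hedged sketch, assuming `(0, 0, 0, 0, 7, 1, 2)` decodes as compression level 7, byte shuffle, and compressor ID 2 (lz4hc):

```python
# Sketch: build the blosc filter pair with hdf5plugin instead of raw numbers.
import hdf5plugin

blosc = hdf5plugin.Blosc(cname="lz4hc", clevel=7, shuffle=hdf5plugin.Blosc.SHUFFLE)
print(dict(blosc))
# e.g. {'compression': 32001, 'compression_opts': (0, 0, 0, 0, 7, 1, 2)}
```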
151 changes: 150 additions & 1 deletion versioned_hdf5/tests/test_replay.py
@@ -1,3 +1,4 @@
import importlib.metadata
import pathlib
import shutil
import subprocess
@@ -6,6 +7,7 @@
import h5py
import numpy as np
import pytest
from packaging.version import Version

from versioned_hdf5 import VersionedHDF5File
from versioned_hdf5.hashtable import Hashtable
@@ -163,7 +165,7 @@ def test_modify_metadata_compression(vfile):
assert set(f["_version_data"]["group"]) == {"test_data4"}


-def test_modify_metadata_compressio2(vfile):
+def test_modify_metadata_compression2(vfile):
setup_vfile(vfile)

f = vfile.f
@@ -1043,3 +1045,150 @@ def test_delete_versions_speed(vfile):
# keeping has to go up 9 versions from its current previous version, for
# a total of 90 calls.
assert mock_get_parent.call_count == 90


@pytest.mark.parametrize(
("obj", "metadata_opts"),
[
("test_data2", {"compression": "gzip", "compression_opts": 3}),
("group/test_data4", {"compression": "gzip", "compression_opts": 3}),
],
)
def test_modify_metadata_compression_default_compression(vfile, obj, metadata_opts):
"""Test that setting compression via modify_metadata works for default compression."""
setup_vfile(vfile)

f = vfile.f

# Check that the compression is unset for every dataset
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

assert f["_version_data"][dataset]["raw_data"].compression is None
assert f["_version_data"][dataset]["raw_data"].compression_opts is None

modify_metadata(f, obj, **metadata_opts)
check_data(vfile)

# Check that the compression is set for the dataset that had its metadata modified
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
if dataset == obj:
assert (
vfile[version][dataset].compression == metadata_opts["compression"]
)
assert (
vfile[version][dataset].compression_opts
== metadata_opts["compression_opts"]
)
else:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

if dataset == obj:
assert (
f["_version_data"][dataset]["raw_data"].compression
== metadata_opts["compression"]
)
assert (
f["_version_data"][dataset]["raw_data"].compression_opts
== metadata_opts["compression_opts"]
)
else:
assert f["_version_data"][dataset]["raw_data"].compression is None
assert f["_version_data"][dataset]["raw_data"].compression_opts is None

# Make sure the tmp group has been destroyed.
assert set(f["_version_data"]) == {
"test_data",
"test_data2",
"test_data3",
"group",
"versions",
}
assert set(f["_version_data"]["group"]) == {"test_data4"}


@pytest.mark.parametrize(
("obj", "metadata_opts"),
[
(
"test_data2",
{"compression": 32001, "compression_opts": (0, 0, 0, 0, 7, 1, 2)},
),
(
"group/test_data4",
{"compression": 32001, "compression_opts": (0, 0, 0, 0, 7, 1, 2)},
),
],
)
@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_modify_metadata_compression_nondefault_compression(
vfile, obj, metadata_opts, library
):
"""Test that setting compression via modify_metadata works for nondefault compression."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)

setup_vfile(vfile)

f = vfile.f

# Check that the compression is unset for every dataset
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

raw_data = f["_version_data"][dataset]["raw_data"]
assert raw_data.compression is None
assert raw_data.compression_opts is None

modify_metadata(f, obj, **metadata_opts)
check_data(vfile)

# Check that the compression is not set for the dataset that had its metadata
# modified; the compression of a virtual dataset does not get set from its parent
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
if dataset == obj:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None
else:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

raw_data = f["_version_data"][dataset]["raw_data"]
if dataset == obj:
assert raw_data.compression is None
assert raw_data.compression_opts is None

# Ignore the first four values; for blosc (id 32001) they are reserved
assert (
raw_data._filters[str(metadata_opts["compression"])][4:]
== metadata_opts["compression_opts"][4:]
)
else:
assert raw_data.compression is None
assert raw_data.compression_opts is None

# Make sure the tmp group has been destroyed.
assert set(f["_version_data"]) == {
"test_data",
"test_data2",
"test_data3",
"group",
"versions",
}
assert set(f["_version_data"]["group"]) == {"test_data4"}
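
Finally, a minimal end-to-end sketch of the workflow these tests exercise, assuming `modify_metadata` is imported from `versioned_hdf5.replay` as in this test module; the file name is a placeholder, and hdf5plugin must be installed for filter 32001 to be available:

```python
# Sketch: stage a version, then retroactively switch the raw data to
# blosc compression via modify_metadata ("versioned.h5" is a placeholder).
import h5py
import hdf5plugin  # noqa: F401  (registers blosc, filter ID 32001)
import numpy as np

from versioned_hdf5 import VersionedHDF5File
from versioned_hdf5.replay import modify_metadata

with h5py.File("versioned.h5", "w") as f:
    vf = VersionedHDF5File(f)
    with vf.stage_version("r0") as sv:
        sv.create_dataset("values", data=np.arange(10))
    modify_metadata(
        f, "values", compression=32001, compression_opts=(0, 0, 0, 0, 7, 1, 2)
    )
```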