Fix nondefault compression handling #358

Merged
3 changes: 2 additions & 1 deletion .github/workflows/tests.yml
@@ -54,10 +54,11 @@ jobs:
set -xe
h5cc -showconfig

-- name: Install versioned-hdf5 test packages
+- name: Install versioned-hdf5 test packages and extra h5py compression types
run: |
set -xe
pip install .[test]
pip install hdf5plugin tables

- name: Run Tests
run: |
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -96,7 +96,7 @@ ignore = [

[project.optional-dependencies]
dev = ["pre-commit>=3.6.0", 'cython', 'meson-python', 'setuptools-scm']
-test = ["pytest", "pytest-env", "hypothesis"]
+test = ["pytest", "pytest-env", "hypothesis", "packaging"]
doc = ["sphinx", "sphinx-multiversion", "myst-parser"]

[tool.setuptools_scm]
32 changes: 27 additions & 5 deletions versioned_hdf5/backend.py
@@ -1,9 +1,10 @@
import logging
import os
-from typing import Dict, Optional
+import textwrap
+from typing import Dict, Iterator, Optional

import numpy as np
-from h5py import Dataset, VirtualLayout, VirtualSource, h5s
+from h5py import Dataset, VirtualLayout, VirtualSource, h5s, h5z
from h5py._hl.filters import guess_chunk
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal
@@ -154,12 +155,18 @@ def write_dataset(

if (
compression
-and compression != ds.compression
+and compression not in ds._filters
or compression_opts
-and compression_opts != ds.compression_opts
+and compression_opts != ds._filters[ds.compression]
):
available_filters = textwrap.indent(
"\n".join(str(filter) for filter in get_available_filters()), " "
)
raise ValueError(
-"Compression options can only be specified for the first version of a dataset"
+"Compression options can only be specified for the first version of a dataset.\n"
+f"Dataset: {name}\n"
+f"Current filters: {ds._filters}\n"
+f"Available hdf5 compression types:\n{available_filters}"
)
if fillvalue is not None and fillvalue != ds.fillvalue:
dtype = ds.dtype
@@ -513,3 +520,18 @@ def create_virtual_dataset(
virtual_data.attrs["raw_data"] = raw_data.name
virtual_data.attrs["chunks"] = raw_data.chunks
return virtual_data


def get_available_filters() -> Iterator[int]:
"""Yield the IDs of all HDF5 filters available to h5py.

Returns
-------
Iterator[int]
Filter ID numbers. Each filter has a dedicated ID; see the
documentation for the particular filter for more information.
"""
for i in range(65536):
if h5z.filter_avail(i):
yield i
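
For readers unfamiliar with HDF5 filter IDs, here is a minimal usage sketch of the check the new helper performs. Which IDs appear depends entirely on the environment; importing hdf5plugin, if installed, registers extra filters such as blosc (32001):

```python
# Enumerate the HDF5 filters currently available to h5py -- the same
# h5z.filter_avail() check that get_available_filters() loops over.
from h5py import h5z

available = [i for i in range(65536) if h5z.filter_avail(i)]
print(available)  # e.g. [1, 2, 3, ..., 32000, 32001, ...] depending on plugins
```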
4 changes: 2 additions & 2 deletions versioned_hdf5/meson.build
@@ -11,7 +11,7 @@ py.install_sources(
subdir: 'versioned_hdf5',
)

-slicetools_deps = [
+compiled_deps = [
dependency('hdf5'),
dependency('mpi'),
]
@@ -23,7 +23,7 @@ py.extension_module(
],
install: true,
subdir: 'versioned_hdf5',
-dependencies: slicetools_deps,
+dependencies: compiled_deps,
cython_args: ['--cplus'],
override_options : ['cython_language=cpp'],
)
17 changes: 17 additions & 0 deletions versioned_hdf5/replay.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import gc
import logging
import posixpath
from copy import deepcopy
from typing import Any, Dict, Iterable, List, Optional, Union
@@ -35,6 +36,8 @@
_groups,
)

logger = logging.getLogger(__name__)


def recreate_dataset(f, name, newf, callback=None):
"""
@@ -88,6 +91,20 @@ def recreate_dataset(f, name, newf, callback=None):
chunks = dataset.chunks
compression = dataset.compression
compression_opts = dataset.compression_opts

if compression is None and getattr(dataset, "_filters", None):
# If the dataset uses nondefault compression, there's no way of
# knowing whether its first filter is an actual compression filter
# or some other kind of filter, so we assume it is the dataset's
# compression and warn the user about that assumption.
compression = list(dataset._filters)[0]
compression_opts = dataset._filters[compression]
logger.warning(
"No default compression detected in this dataset. "
f"Using first filter {compression} and options "
f"{compression_opts} for compression."
)

fillvalue = dataset.fillvalue
attrs = dataset.attrs
if first:
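
To make the warning above concrete, here is a hedged sketch of how a nondefault filter shows up on an h5py dataset: `compression` reads as `None`, and the filter pipeline is only visible through the private `_filters` mapping, which is why `recreate_dataset` has to guess. The file name and options tuple below are placeholders:

```python
# Sketch: a dataset written with blosc (filter ID 32001). Assumes
# hdf5plugin is installed; "example.h5" and the opts tuple are placeholders.
import h5py
import hdf5plugin  # noqa: F401  (importing registers blosc with HDF5)
import numpy as np

with h5py.File("example.h5", "w") as f:
    f.create_dataset(
        "data",
        data=np.arange(10),
        compression=32001,
        compression_opts=(0, 0, 0, 0, 7, 1, 2),
    )
    print(f["data"].compression)  # None: 32001 is not a named h5py compression
    print(f["data"]._filters)     # e.g. {'32001': (0, 0, 0, 0, 7, 1, 2)}
```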
66 changes: 66 additions & 0 deletions versioned_hdf5/tests/test_api.py
@@ -1,4 +1,5 @@
import datetime
import importlib.metadata
import itertools
import logging
import os
@@ -8,8 +9,10 @@

import h5py
import numpy as np
import pytest
from h5py._hl.filters import guess_chunk
from numpy.testing import assert_equal
from packaging.version import Version
from pytest import mark, raises

from ..api import VersionedHDF5File
@@ -2863,3 +2866,66 @@ def test_verify_string_chunk_reuse_bytes_one_dimensional(tmp_path):
f["_version_data/values/raw_data"][:].astype(object),
np.array([b"a", b"b", b"c"]).astype(object),
)


@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_other_compression_bad_value(tmp_path, library):
"""Test that an invalid compression type is rejected."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)
path = tmp_path / "tmp.h5"
with h5py.File(path, "w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as sv, pytest.raises(ValueError, match="invalid"):
sv.create_dataset(
"values",
data=np.arange(10),
compression=-1,
compression_opts=(0, 0, 0, 0, 7, 1, 2),
)


@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_other_compression_validates(tmp_path, library):
"""Test that a nondefault compression type is accepted and stored correctly."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)

path = tmp_path / "tmp.h5"
with h5py.File(path, "w") as f:
vf = VersionedHDF5File(f)
with vf.stage_version("r0") as sv:
sv.create_dataset(
"values",
data=np.arange(10),
compression=32001,
compression_opts=(0, 0, 0, 0, 7, 1, 2),
)

with h5py.File(path, "r+") as f:
assert f["_version_data/versions/r0/values"].compression is None
raw_data = f["_version_data/values/raw_data"]
assert raw_data.compression is None
assert "32001" in raw_data._filters

# First four numbers are reserved for blosc compression;
# others are actual compression options
assert raw_data._filters["32001"][4:] == (7, 1, 2)
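
As an aside on the opts tuple used in this test: hdf5plugin's `Blosc` helper builds the same `(compression, compression_opts)` pair, which makes the seven slots easier to read. A hedged sketch, assuming `(0, 0, 0, 0, 7, 1, 2)` decodes as compression level 7, byte shuffle, and compressor ID 2 (lz4hc):

```python
# Sketch: build the blosc filter pair with hdf5plugin instead of raw numbers.
import hdf5plugin

blosc = hdf5plugin.Blosc(cname="lz4hc", clevel=7, shuffle=hdf5plugin.Blosc.SHUFFLE)
print(dict(blosc))
# e.g. {'compression': 32001, 'compression_opts': (0, 0, 0, 0, 7, 1, 2)}
```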
151 changes: 150 additions & 1 deletion versioned_hdf5/tests/test_replay.py
@@ -1,3 +1,4 @@
import importlib.metadata
import pathlib
import shutil
import subprocess
@@ -6,6 +7,7 @@
import h5py
import numpy as np
import pytest
from packaging.version import Version

from versioned_hdf5 import VersionedHDF5File
from versioned_hdf5.hashtable import Hashtable
@@ -163,7 +165,7 @@ def test_modify_metadata_compression(vfile):
assert set(f["_version_data"]["group"]) == {"test_data4"}


-def test_modify_metadata_compressio2(vfile):
+def test_modify_metadata_compression2(vfile):
setup_vfile(vfile)

f = vfile.f
@@ -1043,3 +1045,150 @@ def test_delete_versions_speed(vfile):
# keeping has to go up 9 versions from its current previous version, for
# a total of 90 calls.
assert mock_get_parent.call_count == 90


@pytest.mark.parametrize(
("obj", "metadata_opts"),
[
("test_data2", {"compression": "gzip", "compression_opts": 3}),
("group/test_data4", {"compression": "gzip", "compression_opts": 3}),
],
)
def test_modify_metadata_compression_default_compression(vfile, obj, metadata_opts):
"""Test that setting compression via modify_metadata works for default compression."""
setup_vfile(vfile)

f = vfile.f

# Check that the compression is unset for every dataset
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

assert f["_version_data"][dataset]["raw_data"].compression is None
assert f["_version_data"][dataset]["raw_data"].compression_opts is None

modify_metadata(f, obj, **metadata_opts)
check_data(vfile)

# Check that the compression is set for the dataset that had its metadata modified
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
if dataset == obj:
assert (
vfile[version][dataset].compression == metadata_opts["compression"]
)
assert (
vfile[version][dataset].compression_opts
== metadata_opts["compression_opts"]
)
else:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

if dataset == obj:
assert (
f["_version_data"][dataset]["raw_data"].compression
== metadata_opts["compression"]
)
assert (
f["_version_data"][dataset]["raw_data"].compression_opts
== metadata_opts["compression_opts"]
)
else:
assert f["_version_data"][dataset]["raw_data"].compression is None
assert f["_version_data"][dataset]["raw_data"].compression_opts is None

# Make sure the tmp group has been destroyed.
assert set(f["_version_data"]) == {
"test_data",
"test_data2",
"test_data3",
"group",
"versions",
}
assert set(f["_version_data"]["group"]) == {"test_data4"}


@pytest.mark.parametrize(
("obj", "metadata_opts"),
[
(
"test_data2",
{"compression": 32001, "compression_opts": (0, 0, 0, 0, 7, 1, 2)},
),
(
"group/test_data4",
{"compression": 32001, "compression_opts": (0, 0, 0, 0, 7, 1, 2)},
),
],
)
@pytest.mark.parametrize(
("library"),
[
"hdf5plugin",
"tables",
],
)
def test_modify_metadata_compression_nondefault_compression(
vfile, obj, metadata_opts, library
):
"""Test that setting compression via modify_metadata works for nondefault compression."""
if library == "tables" and Version(importlib.metadata.version("numpy")) >= Version(
"2"
):
pytest.skip("Skipping test; pytables is incompatible with numpy>=2")
pytest.importorskip(library)

setup_vfile(vfile)

f = vfile.f

# Check that the compression is unset for every dataset
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

raw_data = f["_version_data"][dataset]["raw_data"]
assert raw_data.compression is None
assert raw_data.compression_opts is None

modify_metadata(f, obj, **metadata_opts)
check_data(vfile)

# Check that the compression is not set for the dataset that had its metadata
# modified; the compression of a virtual dataset does not get set from its parent
for dataset in ["test_data", "test_data2", "group/test_data4"]:
for version in ["version1", "version2"]:
if dataset == obj:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None
else:
assert vfile[version][dataset].compression is None
assert vfile[version][dataset].compression_opts is None

raw_data = f["_version_data"][dataset]["raw_data"]
if dataset == obj:
assert raw_data.compression is None
assert raw_data.compression_opts is None

# Ignore the first four values; for blosc (id 32001) they are reserved
assert (
raw_data._filters[str(metadata_opts["compression"])][4:]
== metadata_opts["compression_opts"][4:]
)
else:
assert raw_data.compression is None
assert raw_data.compression_opts is None

# Make sure the tmp group has been destroyed.
assert set(f["_version_data"]) == {
"test_data",
"test_data2",
"test_data3",
"group",
"versions",
}
assert set(f["_version_data"]["group"]) == {"test_data4"}
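
Finally, a minimal end-to-end sketch of the workflow these tests exercise, assuming `modify_metadata` is imported from `versioned_hdf5.replay` as in this test module; the file name is a placeholder, and hdf5plugin must be installed for filter 32001 to be available:

```python
# Sketch: stage a version, then retroactively switch the raw data to
# blosc compression via modify_metadata ("versioned.h5" is a placeholder).
import h5py
import hdf5plugin  # noqa: F401  (registers blosc, filter ID 32001)
import numpy as np

from versioned_hdf5 import VersionedHDF5File
from versioned_hdf5.replay import modify_metadata

with h5py.File("versioned.h5", "w") as f:
    vf = VersionedHDF5File(f)
    with vf.stage_version("r0") as sv:
        sv.create_dataset("values", data=np.arange(10))
    modify_metadata(
        f, "values", compression=32001, compression_opts=(0, 0, 0, 0, 7, 1, 2)
    )
```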