Skip to content

Commit

Permalink
Add support for zarr dimension_separator metadata field
Browse files Browse the repository at this point in the history
Historically zarr has named chunk files using dots by default,
e.g. "1.2.3", but has also supported slashes, e.g. "1/2/3".  However,
the choice was not indicated in the .zarray metadata file, and instead
needed to be specified out-of-band.  In TensorStore, that was
accomplished using the "key_encoding" spec member.

Zarr recently added a dimension_separator field to the metadata
format:

zarr-developers/zarr-python#715

This commit adds support for that field.  Note that previously
TensorStore did not accept .zarray files that contain unknown
members, meaning that reading arrays with the dimension_separator
field would fail.

The existing "key_encoding" spec member is still supported, but
deprecated in favor of specifying "dimension_separator" as part of the
"metadata" field.  When creating a new array, TensorStore will always
write the "dimension_separator" field, in order to allow the array to
be re-opened without specifying the "dimension_separator".

Additionally, this change makes TensorStore accept (and preserve)
extra .zarray members, as the zarr spec now requires that.

PiperOrigin-RevId: 384998911
Change-Id: I485141b7786084d57c28005e3076f70462cd5b9e
  • Loading branch information
jbms authored and copybara-github committed Jul 15, 2021
1 parent dc16da6 commit b14939b
Show file tree
Hide file tree
Showing 13 changed files with 327 additions and 111 deletions.
10 changes: 10 additions & 0 deletions python/tensorstore/tensorstore_class.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ Asynchronous multi-dimensional array handle.
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -321,6 +322,7 @@ Spec that may be used to re-open or re-create the array.
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1021,6 +1023,7 @@ that reflects the result of the indexing operation. To read data, call
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1148,6 +1151,7 @@ integer or boolean array indexing terms are applied orthogonally:
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1257,6 +1261,7 @@ dimensions are unconditionally added as the first dimensions of the result
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1378,6 +1383,7 @@ Computes a virtual view using an explicit :ref:`index transform<index-transform>
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<u4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1840,6 +1846,7 @@ Opens or creates a :py:class:`TensorStore` from a :py:class:`Spec`.
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<i4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -1950,6 +1957,7 @@ properties that are left unconstrained:
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<f4',
'fill_value': 42.0,
'filters': None,
Expand Down Expand Up @@ -1996,6 +2004,7 @@ determine a matching chunk layout automatically:
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '<f4',
'fill_value': None,
'filters': None,
Expand Down Expand Up @@ -2080,6 +2089,7 @@ schema constraints:
'id': 'blosc',
'shuffle': -1,
},
'dimension_separator': '.',
'dtype': '>f4',
'fill_value': None,
'filters': None,
Expand Down
1 change: 1 addition & 0 deletions tensorstore/driver/zarr/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ tensorstore_cc_library(
"//tensorstore:schema",
"//tensorstore/index_space:index_transform_builder",
"//tensorstore/internal:json",
"//tensorstore/internal:json_metadata_matching",
"//tensorstore/util:quote_string",
"//tensorstore/util:result",
"@com_github_nlohmann_json//:nlohmann_json",
Expand Down
69 changes: 48 additions & 21 deletions tensorstore/driver/zarr/driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,18 @@ namespace internal_zarr {
namespace {
constexpr const char kZarrMetadataKey[] = ".zarray";

inline char GetChunkKeyEncodingSeparator(ChunkKeyEncoding key_encoding) {
return key_encoding == ChunkKeyEncoding::kDotSeparated ? '.' : '/';
inline char GetDimensionSeparatorChar(DimensionSeparator dimension_separator) {
return dimension_separator == DimensionSeparator::kDotSeparated ? '.' : '/';
}

// Selects the dimension separator to use for chunk keys.
//
// Precedence: the value in `metadata` if it is set; otherwise the value in
// `partial_metadata` if it is set; otherwise
// `DimensionSeparator::kDotSeparated`.
DimensionSeparator GetDimensionSeparator(
    const ZarrPartialMetadata& partial_metadata, const ZarrMetadata& metadata) {
  if (const auto& from_metadata = metadata.dimension_separator) {
    return *from_metadata;
  }
  if (const auto& from_partial = partial_metadata.dimension_separator) {
    return *from_partial;
  }
  // Neither source specifies a separator: fall back to the dot-separated form.
  return DimensionSeparator::kDotSeparated;
}

Result<ZarrMetadataPtr> ParseEncodedMetadata(std::string_view encoded_value) {
Expand Down Expand Up @@ -95,15 +105,13 @@ class ZarrDriver
template <template <typename> class MaybeBound = internal::ContextUnbound>
struct SpecT : public internal_kvs_backed_chunk_driver::SpecT<MaybeBound> {
std::string key_prefix;
ChunkKeyEncoding key_encoding;
ZarrPartialMetadata partial_metadata;
SelectedField selected_field;

constexpr static auto ApplyMembers = [](auto& x, auto f) {
return f(internal::BaseCast<
internal_kvs_backed_chunk_driver::SpecT<MaybeBound>>(x),
x.key_prefix, x.key_encoding, x.partial_metadata,
x.selected_field);
x.key_prefix, x.partial_metadata, x.selected_field);
};
};

Expand All @@ -123,12 +131,25 @@ class ZarrDriver
internal_kvs_backed_chunk_driver::SpecJsonBinder,
jb::Member("path", jb::Projection(&SpecT<>::key_prefix,
jb::DefaultInitializedValue())),
jb::Member("key_encoding",
jb::Projection(
&SpecT<>::key_encoding,
jb::DefaultInitializedValue(ChunkKeyEncodingJsonBinder))),
jb::Member("metadata", jb::Projection(&SpecT<>::partial_metadata,
jb::DefaultInitializedValue())),
// Deprecated `key_encoding` property.
jb::LoadSave(jb::OptionalMember(
"key_encoding",
jb::Compose<DimensionSeparator>(
[](auto is_loading, const auto& options, auto* obj,
DimensionSeparator* value) {
auto& sep = obj->partial_metadata.dimension_separator;
if (sep && *sep != *value) {
return absl::InvalidArgumentError(tensorstore::StrCat(
"value (", ::nlohmann::json(*value).dump(),
") does not match value in metadata (",
::nlohmann::json(*sep).dump(), ")"));
}
sep = *value;
return absl::OkStatus();
},
DimensionSeparatorJsonBinder))),
jb::Member("field",
jb::Projection(&SpecT<>::selected_field,
jb::DefaultValue<jb::kNeverIncludeDefaults>(
Expand Down Expand Up @@ -224,12 +245,12 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {

public:
explicit DataCache(Initializer initializer, std::string key_prefix,
ChunkKeyEncoding key_encoding)
DimensionSeparator dimension_separator)
: Base(initializer,
GetChunkGridSpecification(*static_cast<const ZarrMetadata*>(
initializer.metadata.get()))),
key_prefix_(std::move(key_prefix)),
key_encoding_(key_encoding) {}
dimension_separator_(dimension_separator) {}

Status ValidateMetadataCompatibility(const void* existing_metadata_ptr,
const void* new_metadata_ptr) override {
Expand Down Expand Up @@ -345,8 +366,8 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {

std::string GetChunkStorageKey(const void* metadata,
span<const Index> cell_indices) override {
return internal::JoinPath(key_prefix_,
EncodeChunkIndices(cell_indices, key_encoding_));
return internal::JoinPath(
key_prefix_, EncodeChunkIndices(cell_indices, dimension_separator_));
}

Status GetBoundSpecData(
Expand All @@ -358,7 +379,6 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {
const auto& metadata = *static_cast<const ZarrMetadata*>(metadata_ptr);
spec.key_prefix = key_prefix_;
spec.selected_field = EncodeSelectedField(component_index, metadata.dtype);
spec.key_encoding = key_encoding_;
auto& pm = spec.partial_metadata;
pm.rank = metadata.rank;
pm.zarr_format = metadata.zarr_format;
Expand All @@ -369,6 +389,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {
pm.order = metadata.order;
pm.dtype = metadata.dtype;
pm.fill_value = metadata.fill_value;
pm.dimension_separator = dimension_separator_;
return absl::OkStatus();
}

Expand All @@ -391,7 +412,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache {

private:
std::string key_prefix_;
ChunkKeyEncoding key_encoding_;
DimensionSeparator dimension_separator_;
};

class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase {
Expand Down Expand Up @@ -427,15 +448,21 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase {
std::string GetDataCacheKey(const void* metadata) override {
std::string result;
const auto& spec = this->spec();
internal::EncodeCacheKey(&result, spec.key_prefix, spec.key_encoding,
*static_cast<const ZarrMetadata*>(metadata));
const auto& zarr_metadata = *static_cast<const ZarrMetadata*>(metadata);
internal::EncodeCacheKey(
&result, spec.key_prefix,
GetDimensionSeparator(spec.partial_metadata, zarr_metadata),
zarr_metadata);
return result;
}

std::unique_ptr<internal_kvs_backed_chunk_driver::DataCache> GetDataCache(
DataCache::Initializer initializer) override {
return std::make_unique<DataCache>(std::move(initializer),
spec().key_prefix, spec().key_encoding);
const auto& metadata =
*static_cast<const ZarrMetadata*>(initializer.metadata.get());
return std::make_unique<DataCache>(
std::move(initializer), spec().key_prefix,
GetDimensionSeparator(spec().partial_metadata, metadata));
}

Result<std::size_t> GetComponentIndex(const void* metadata_ptr,
Expand All @@ -456,8 +483,8 @@ const internal::DriverRegistration<ZarrDriver> registration;
} // namespace

std::string EncodeChunkIndices(span<const Index> indices,
ChunkKeyEncoding key_encoding) {
const char separator = GetChunkKeyEncodingSeparator(key_encoding);
DimensionSeparator dimension_separator) {
const char separator = GetDimensionSeparatorChar(dimension_separator);
std::string key;
for (DimensionIndex i = 0; i < indices.size(); ++i) {
if (i != 0) {
Expand Down
2 changes: 1 addition & 1 deletion tensorstore/driver/zarr/driver_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace internal_zarr {

/// Encodes a chunk grid index vector as a storage key suffix.
std::string EncodeChunkIndices(span<const Index> indices,
ChunkKeyEncoding key_encoding);
DimensionSeparator dimension_separator);

} // namespace internal_zarr
} // namespace tensorstore
Expand Down
6 changes: 3 additions & 3 deletions tensorstore/driver/zarr/driver_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ using tensorstore::Result;
using tensorstore::span;
using tensorstore::TransactionMode;
using tensorstore::internal_kvs_backed_chunk_driver::ResizeParameters;
using tensorstore::internal_zarr::ChunkKeyEncoding;
using tensorstore::internal_zarr::DimensionSeparator;
using tensorstore::internal_zarr::ZarrMetadata;

Result<tensorstore::IndexTransform<>> ResolveBoundsFromMetadata(
Expand Down Expand Up @@ -88,12 +88,12 @@ Result<ResizeParameters> GetResizeParameters(

TEST(EncodeChunkIndicesTest, DotSeparated) {
EXPECT_EQ("1.2.3", EncodeChunkIndices(span<const Index>({1, 2, 3}),
ChunkKeyEncoding::kDotSeparated));
DimensionSeparator::kDotSeparated));
}

TEST(EncodeChunkIndicesTest, SlashSeparated) {
EXPECT_EQ("1/2/3", EncodeChunkIndices(span<const Index>({1, 2, 3}),
ChunkKeyEncoding::kSlashSeparated));
DimensionSeparator::kSlashSeparated));
}

TEST(ResolveBoundsFromMetadataTest, Basic) {
Expand Down
Loading

0 comments on commit b14939b

Please sign in to comment.