diff --git a/src/04-modality-specific-files/10-microscopy.md b/src/04-modality-specific-files/10-microscopy.md index 5515559fdc..015d44bb0c 100644 --- a/src/04-modality-specific-files/10-microscopy.md +++ b/src/04-modality-specific-files/10-microscopy.md @@ -37,7 +37,8 @@ by the [Open Microscopy Environment](https://www.openmicroscopy.org/) for whole- the [OME-TIFF file specifications](https://docs.openmicroscopy.org/ome-model/6.1.2/ome-tiff/file-structure.html). The OME-TIFF file allows for multi-page TIFF files to store multiple image planes and supports multi-resolution pyramidal tiled images. An OME-XML data block is also embedded inside the -file’s header. +file’s header. Further, OME-ZARR (sometimes referred to as OME-NGFF or NGFF) has been developed to provide improved +access and storage for large data via chunked and compressed N-dimensional arrays. The BIDS standard accepts microscopy data in a number of file formats to accommodate datasets stored in 2D image formats and whole-slide imaging formats, to accommodate lossless and lossy @@ -54,12 +55,10 @@ Microscopy raw data MUST be stored in one of the following formats: (`.ome.tif` for standard TIFF files or `.ome.btf` for [BigTIFF](https://www.awaresystems.be/imaging/tiff/bigtiff.html) files) -If different from PNG, TIFF or OME-TIFF, the original unprocessed data in the native format MAY be -stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data). +- [OME-ZARR/NGFF](https://ngff.openmicroscopy.org/latest/) (`.ome.zarr` directories) -Future versions may extend this list of supported file formats, for example with the -Next-Generation File Formats currently developed by OME ([OME-NGFF](https://ngff.openmicroscopy.org/latest/)) -as a successor to OME-TIFF for better remote sharing of large datasets. 
+If different from PNG, TIFF, OME-TIFF, or OME-ZARR, the original unprocessed data in the native format MAY be +stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data). ### Modality suffixes Microscopy data currently support the following imaging modalities: diff --git a/src/schema/objects/extensions.yaml b/src/schema/objects/extensions.yaml index f3d4d0dcf8..21aaa515ee 100644 --- a/src/schema/objects/extensions.yaml +++ b/src/schema/objects/extensions.yaml @@ -140,6 +140,13 @@ Used by KIT, Yokogawa, and Ricoh MEG systems. Successor to the `.sqd` extension for marker files. +.ome.zarr/: + name: OME Next Generation File Format + description: | + An OME-NGFF file. + + OME-NGFF is a [Zarr](https://zarr.readthedocs.io)-based format, organizing data arrays in nested directories. + This format was developed by the Open Microscopy Environment to provide data stream access to very large data. .nii: name: NIfTI description: | diff --git a/src/schema/rules/datatypes/micr.yaml b/src/schema/rules/datatypes/micr.yaml index b5caf97b21..c08072a740 100644 --- a/src/schema/rules/datatypes/micr.yaml +++ b/src/schema/rules/datatypes/micr.yaml @@ -21,6 +21,7 @@ microscopy: extensions: - .ome.tif - .ome.btf + - .ome.zarr/ - .png - .tif - .json diff --git a/tools/schemacode/schemacode/tests/test_validator.py b/tools/schemacode/schemacode/tests/test_validator.py index ac9273dcf2..e16e5d5858 100644 --- a/tools/schemacode/schemacode/tests/test_validator.py +++ b/tools/schemacode/schemacode/tests/test_validator.py @@ -229,7 +229,7 @@ def test_load_all(): os.path.abspath(os.path.dirname(__file__)), "../data/schema", ) - schema_all = load_all(schema_path) + schema_all, _ = load_all(schema_path) # Check if expected keys are present in all entries for entry in schema_all: diff --git a/tools/schemacode/schemacode/validator.py b/tools/schemacode/schemacode/validator.py index 59c19f90d9..e98ee1a29c 100644 --- a/tools/schemacode/schemacode/validator.py +++ 
b/tools/schemacode/schemacode/validator.py @@ -16,7 +16,10 @@ DIR_ENTITIES = ["subject", "session"] -def _get_paths(bids_paths): +def _get_paths( + bids_paths, + pseudofile_suffixes=[], +): """ Get all paths from a list of directories, excluding hidden subdirectories from distribution. @@ -25,6 +28,9 @@ def _get_paths(bids_paths): bids_paths : list or str Directories from which to get paths, may also contain file paths, which will remain unchanged. + pseudofile_suffixes : list of str + Directory suffixes prompting the validation of the directory name and limiting further + directory walk. Notes ----- @@ -47,9 +53,6 @@ def _get_paths(bids_paths): ".bidsignore", "dandiset.yaml", ] - # Inelegant hard-coded solution. - # Could be replaced by a maximum depth limit if BIDS root auto-detection is implemented. - treat_as_file_suffix = [".ngff"] path_list = [] for bids_path in bids_paths: @@ -57,13 +60,12 @@ def _get_paths(bids_paths): if os.path.isfile(bids_path): path_list.append(bids_path) continue - for root, dirs, file_names in os.walk(bids_path, topdown=False): - if any(root.endswith(i) for i in treat_as_file_suffix): - continue - if any(f"{i}/" in root for i in treat_as_file_suffix): - continue - if any(f"{i}\\" in root for i in treat_as_file_suffix): - continue + for root, dirs, file_names in os.walk(bids_path, topdown=True): + if any(root.endswith(i) for i in pseudofile_suffixes): + # Add the directory name to the validation paths list. + path_list.append(f"{root}/") + # Do not index the contents of the directory. + dirs[:] = [] # will break if BIDS ever puts meaningful data under `/.{dandi,datalad,git}*/` if any(exclude_subdir in root for exclude_subdir in exclude_subdirs): continue @@ -335,6 +337,8 @@ def load_all( ------- all_regex : list of dict A list of dictionaries, with keys including 'regex' and 'mandatory'. + my_schema : dict + Nested dictionaries representing the full schema. 
""" my_schema = schema.load_schema(schema_dir) @@ -346,13 +350,14 @@ def load_all( ) all_regex.extend(top_level_regex) - return all_regex + return all_regex, my_schema def validate_all( bids_paths, regex_schema, debug=False, + pseudofile_suffixes=[], ): """ Validate `bids_paths` based on a `regex_schema` dictionary list, including regexes. @@ -366,6 +371,11 @@ def validate_all( debug : tuple, optional Whether to print itemwise notices for checks on the console, and include them in the validation result. + pseudofile_suffixes : list of str, optional + Any suffixes which identify BIDS-valid directory data. + These pseudo-file suffixes will be validated based on the directory name, with the + directory contents not being indexed for validation. + By default, no pseudo-file suffixes are checked. Returns ------- @@ -384,7 +394,7 @@ def validate_all( """ tracking_schema = deepcopy(regex_schema) - paths_list = _get_paths(bids_paths) + paths_list = _get_paths(bids_paths, pseudofile_suffixes=pseudofile_suffixes) tracking_paths = deepcopy(paths_list) if debug: itemwise_results = [] @@ -658,6 +668,34 @@ def log_errors(validation_result): lgr.warning("The `%s` file was not matched by any regex schema entry.", i) +def _get_directory_suffixes(my_schema): + """Query schema for suffixes which identify directory entities. + + Parameters + ---------- + my_schema : dict + Nested dictionary as produced by `schemacode.schema.load_schema()`. + + Returns + ------- + list of str + Directory pseudofile suffixes excluding trailing slashes. + + Notes + ----- + * Yes this seems super-awkward to do explicitly, after all, the trailing slash is + already in so it should automagically work, but no: + - Subdirectory names need to be dynamically excluded from validation input. + - Backslash directory delimiters are still in use, which is regrettable. 
+ """ + pseudofile_suffixes = [] + for i in my_schema["objects"]["extensions"]: + if i.endswith("/"): + if i != "/": + pseudofile_suffixes.append(i[:-1]) + return pseudofile_suffixes + + def validate_bids( bids_paths, schema_reference_root="/usr/share/bids-schema/", @@ -716,11 +754,13 @@ def validate_bids( bids_paths = [bids_paths] bids_schema_dir = select_schema_dir(bids_paths, schema_reference_root, schema_version) - regex_schema = load_all(bids_schema_dir) + regex_schema, my_schema = load_all(bids_schema_dir) + pseudofile_suffixes = _get_directory_suffixes(my_schema) validation_result = validate_all( bids_paths, regex_schema, debug=debug, + pseudofile_suffixes=pseudofile_suffixes, ) log_errors(validation_result)