Tests for ls reinstated, underlying function fixed, support for ZARR-BIDS files added. #1164

Merged: 37 commits, Dec 12, 2022
Commits (37)
d3ec5f1
Unravelling `ls` issues
TheChymera Nov 23, 2022
dc34ab0
strange `Cannot create weak reference to PosixPath object`
TheChymera Nov 23, 2022
8aef541
Fixed weakref issue
TheChymera Nov 28, 2022
b3a4b9a
Added readme listing and removed some diagnostic print calls
TheChymera Nov 28, 2022
3272c86
Removed more debugging print calls
TheChymera Nov 28, 2022
ba99146
Removed more debugging print calls
TheChymera Nov 28, 2022
64677c1
Creating and passing digests
TheChymera Nov 28, 2022
f8a0da8
Added validation bids schema version to result object metadata
TheChymera Nov 28, 2022
430fb06
Assigning bids schema version for validation to metadata dict
TheChymera Nov 28, 2022
279ecf9
Trying to insert bids_schema_version ... somewhere.
TheChymera Nov 29, 2022
60b2408
Added (get_validation)_bids_version to BIDS assets
TheChymera Nov 29, 2022
4a4530e
Creating ZARR digest for ZARR files
TheChymera Nov 30, 2022
9df52ad
Using correct ZARR digest class (still not working)
TheChymera Dec 1, 2022
0e0ff8e
More debugging print calls trying to track down digest error
TheChymera Dec 1, 2022
cc28fbb
Removed unneeded import
TheChymera Dec 1, 2022
5d59ae8
Using zarr extensions variable from constants module
TheChymera Dec 1, 2022
5d62fcc
Reinstated fake digest after introducing ZARR conditional
TheChymera Dec 1, 2022
f0de103
Attempting to fix digest type conflict (still not working)
TheChymera Dec 1, 2022
86ae9c0
Use Zarr checksum to set metadata contentSize
jwodder Dec 1, 2022
9866d0c
Removed debugging print calls
TheChymera Dec 2, 2022
785dec0
Not requiring bids_version for ValidatorOrigin
TheChymera Dec 2, 2022
2059dd7
Typing fixes and improved variable name
TheChymera Dec 2, 2022
a2119ec
Attempting type check fix for missing attribute
TheChymera Dec 2, 2022
722e804
Removed commented imports
TheChymera Dec 2, 2022
3649b5e
Actually fix typing
jwodder Dec 2, 2022
32d0379
Debugging upload tests
TheChymera Dec 2, 2022
ba7d60c
Reinstating test
TheChymera Dec 5, 2022
e8a64e1
Associate BIDS asset errors with the actual assets
jwodder Dec 5, 2022
ccda9ec
Fixed novel README validation error
TheChymera Dec 5, 2022
44a8ab4
Checking ls output subject identifier
TheChymera Dec 5, 2022
efab43e
Added ls command tests for zarrBIDS file
TheChymera Dec 5, 2022
1ed0f5f
ZARR-appropriate fake checksum
TheChymera Dec 6, 2022
9900841
More compact code
TheChymera Dec 6, 2022
3384ef5
Dot in extension
TheChymera Dec 6, 2022
d9ccf8b
Safer assert statement
TheChymera Dec 6, 2022
375947d
Making sure the method which sets the attribute has been run
TheChymera Dec 6, 2022
11c18f5
Updated logic to better accommodate future BIDS NWB examples
TheChymera Dec 7, 2022
17 changes: 15 additions & 2 deletions dandi/cli/cmd_ls.py
@@ -5,6 +5,7 @@
import click

from .base import devel_option, lgr, map_to_click_exceptions
from ..consts import ZARR_EXTENSIONS, metadata_all_fields
from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from ..misctypes import Digest
from ..utils import is_url
@@ -92,7 +93,6 @@ def ls(
PYOUTFormatter,
YAMLFormatter,
)
from ..consts import metadata_all_fields

# TODO: avoid
from ..support.pyout import PYOUT_SHORT_NAMES_rev
@@ -358,7 +358,20 @@ def fn():
digest=Digest.dandi_etag(digest),
).json_dict()
else:
rec = get_metadata(path)
if path.endswith(tuple(ZARR_EXTENSIONS)):
if use_fake_digest:
digest = "0" * 32 + "-0--0"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="zarr-checksum")
rec = get_metadata(path, Digest.dandi_zarr(digest))
else:
if use_fake_digest:
digest = "0" * 32 + "-1"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="dandi-etag")
rec = get_metadata(path, Digest.dandi_etag(digest))
except Exception as exc:
_add_exc_error(path, rec, errors, exc)
if flatten:
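For orientation, a minimal sketch (not part of the diff) of the two placeholder digest formats that the use_fake_digest branches above construct; the zarr-checksum shape matches the pattern that add_common_metadata() parses later in this PR:

import re

# Placeholder digests built exactly as in the ls() branches above.
fake_zarr_digest = "0" * 32 + "-0--0"  # zarr-checksum: {md5}-{file count}--{total size}
fake_etag_digest = "0" * 32 + "-1"     # dandi-etag:    {md5}-{number of parts}

# The zarr-checksum shape is the one add_common_metadata() later parses
# to recover contentSize from the digest value.
zarr_checksum_re = re.compile(r"[0-9a-f]{32}-[0-9]+--[0-9]+")
assert zarr_checksum_re.fullmatch(fake_zarr_digest)
assert not zarr_checksum_re.fullmatch(fake_etag_digest)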
25 changes: 23 additions & 2 deletions dandi/cli/tests/test_ls.py → dandi/cli/tests/test_cmd_ls.py
@@ -49,16 +49,37 @@ def load(s):
assert metadata[f] == simple1_nwb_metadata[f]


def test_ls_nwb_file(simple2_nwb):
bids_file_path = "simple2.nwb"
bids_file_path = os.path.join(simple2_nwb, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1


@mark.skipif_no_network
@pytest.mark.xfail(reason="https://github.com/dandi/dandi-cli/issues/1097")
def test_ls_bids_file(bids_examples):
bids_file_path = "asl003/sub-Sub1/anat/sub-Sub1_T1w.nii.gz"
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["subject_id"] == "Sub1"
assert data[0]["identifier"] == "Sub1"


@mark.skipif_no_network
def test_ls_zarrbids_file(bids_examples):
bids_file_path = (
"micr_SEMzarr/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr"
)
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["identifier"] == "01"


@mark.skipif_no_network
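A note on the expected values in these tests: identifier is the BIDS subject label with the sub- prefix stripped, so sub-01_... yields "01" and sub-Sub1_... yields "Sub1". A rough, hypothetical illustration of that extraction (in the real code path the label comes out of bidsschematools during validation, not from filename parsing):

def subject_label(bids_filename: str) -> str:
    # Hypothetical helper for illustration only; dandi-cli gets the label
    # from bidsschematools validation results, not from this parsing.
    for entity in bids_filename.split("_"):
        key, _, value = entity.partition("-")
        if key == "sub":
            return value
    raise ValueError(f"no subject entity in {bids_filename!r}")

assert subject_label("sub-01_ses-01_sample-A_SPIM.ome.zarr") == "01"
assert subject_label("sub-Sub1_T1w.nii.gz") == "Sub1"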
44 changes: 43 additions & 1 deletion dandi/files/bids.py
@@ -12,6 +12,7 @@

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE
from ..metadata import add_common_metadata, prepare_metadata
from ..misctypes import Digest
from ..validate_types import ValidationResult
@@ -45,6 +46,12 @@ class BIDSDatasetDescriptionAsset(LocalFileAsset):
#: populated by `_validate()`
_asset_metadata: Optional[dict[str, dict[str, Any]]] = None

#: Version of BIDS used for the validation;
#: populated by `_validate()`
#: In future this might be removed and the information included in the
#: BareAsset via dandischema.
_bids_version: Optional[str] = None

#: Threading lock needed in case multiple assets are validated in parallel
#: during upload
_lock: Lock = field(init=False, default_factory=Lock, repr=False, compare=False)
@@ -65,6 +72,22 @@ def _validate(self) -> None:
bids_paths = [str(self.filepath)] + [
str(asset.filepath) for asset in self.dataset_files
]
# This is an ad-hoc fix which should be removed once bidsschematools greater than
# 0.6.0 is released.
# It won't cause any trouble afterwards, but it will no longer fulfill any
# purpose. The issue is that README* is still required and if we don't
# include it explicitly in the listing validation will implicitly fail, even
# if the file is present.
readme_extensions = ["", ".md", ".rst", ".txt"]
for ext in readme_extensions:
readme_candidate = self.bids_root / Path("README" + ext)
if (
readme_candidate.exists()
and str(readme_candidate) not in bids_paths
):
bids_paths += [str(readme_candidate)]
# end of ad-hoc fix.

results = validate_bids(*bids_paths)
self._dataset_errors: list[ValidationResult] = []
self._asset_errors: dict[str, list[ValidationResult]] = defaultdict(
@@ -74,7 +97,8 @@
for result in results:
if result.id in BIDS_ASSET_ERRORS:
assert result.path
self._asset_errors[str(result.path)].append(result)
bids_path = result.path.relative_to(self.bids_root).as_posix()
self._asset_errors[bids_path].append(result)
elif result.id in BIDS_DATASET_ERRORS:
self._dataset_errors.append(result)
elif result.id == "BIDS.MATCH":
@@ -84,6 +108,7 @@
self._asset_metadata[bids_path] = prepare_metadata(
result.metadata
)
self._bids_version = result.origin.bids_version

def get_asset_errors(self, asset: BIDSAsset) -> list[ValidationResult]:
""":meta private:"""
Expand Down Expand Up @@ -174,6 +199,11 @@ def get_metadata(
metadata["path"] = self.path
return BareAsset(**metadata)

def get_validation_bids_version(self) -> str:
self.bids_dataset_description._validate()
assert self.bids_dataset_description._bids_version is not None
return self.bids_dataset_description._bids_version


class NWBBIDSAsset(BIDSAsset, NWBAsset):
"""
@@ -219,6 +249,18 @@ def get_validation_errors(
self, schema_version, devel_debug
) + BIDSAsset.get_validation_errors(self)

def get_metadata(
self,
digest: Optional[Digest] = None,
ignore_errors: bool = True,
) -> BareAsset:
metadata = self.bids_dataset_description.get_asset_metadata(self)
start_time = end_time = datetime.now().astimezone()
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata["path"] = self.path
metadata["encodingFormat"] = ZARR_MIME_TYPE
return BareAsset(**metadata)


class GenericBIDSAsset(BIDSAsset, GenericAsset):
"""
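The README workaround in _validate() above can be read in isolation; a self-contained sketch of the same loop, assuming bids_root is the dataset root and bids_paths is the file listing passed to the validator:

from pathlib import Path

def add_readme_candidates(bids_root: Path, bids_paths: list[str]) -> list[str]:
    # Mirrors the ad-hoc fix above: explicitly list any README file so that
    # bidsschematools <= 0.6.0 does not report the (present) README as missing.
    for ext in ("", ".md", ".rst", ".txt"):
        candidate = bids_root / f"README{ext}"
        if candidate.exists() and str(candidate) not in bids_paths:
            bids_paths.append(str(candidate))
    return bids_paths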
135 changes: 88 additions & 47 deletions dandi/metadata.py
@@ -28,6 +28,7 @@
import tenacity

from . import __version__, get_logger
from .consts import ZARR_EXTENSIONS, metadata_all_fields
from .dandiset import Dandiset
from .misctypes import Digest
from .pynwb_utils import (
@@ -38,14 +39,22 @@
metadata_cache,
nwb_has_external_links,
)
from .utils import ensure_datetime, get_mime_type, get_utcnow_datetime
from .utils import (
ensure_datetime,
find_parent_directory_containing,
get_mime_type,
get_utcnow_datetime,
)

lgr = get_logger()


# Disable this for clean hacking
@metadata_cache.memoize_path
def get_metadata(path: Union[str, Path]) -> Optional[dict]:
def get_metadata(
path: Union[str, Path],
digest: Optional[Digest] = None,
) -> Optional[dict]:
"""
Get "flatdata" from a .nwb file or a Dandiset directory

@@ -59,64 +68,90 @@ def get_metadata(path: Union[str, Path]) -> Optional[dict]:
-------
dict
"""

from .files import bids, dandi_file, find_bids_dataset_description

# when we run in parallel, these annoying warnings appear
ignore_benign_pynwb_warnings()
path = str(path) # for Path
path = os.path.abspath(str(path)) # for Path
meta = dict()

if op.isdir(path):
if op.isdir(path) and not path.endswith(tuple(ZARR_EXTENSIONS)):
try:
dandiset = Dandiset(path)
return cast(dict, dandiset.metadata)
except ValueError as exc:
lgr.debug("Failed to get metadata for %s: %s", path, exc)
return None

if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
# Is the data BIDS (as defined by the presence of a BIDS dataset descriptor)
bids_dataset_description = find_bids_dataset_description(path)
if bids_dataset_description:
dandiset_path = find_parent_directory_containing("dandiset.yaml", path)
bids_dataset_description = find_bids_dataset_description(path)
df = dandi_file(
Path(path),
dandiset_path,
bids_dataset_description=bids_dataset_description,
)
path_metadata = df.get_metadata(digest=digest)
assert isinstance(df, bids.BIDSAsset)
meta["bids_version"] = df.get_validation_bids_version()
# there might be a more elegant way to do this:
for key in metadata_all_fields:
try:
value = getattr(path_metadata.wasAttributedTo[0], key)
except AttributeError:
pass
else:
meta[key] = value
elif path.endswith((".NWB", ".nwb")):
Review comment (Member):

Suggested change (replace the first line with the second):
elif path.endswith((".NWB", ".nwb")):
if path.endswith((".NWB", ".nwb")):

In relation to prior comment #1164 (comment) -- the point is to validate .nwb files regardless of whether they are part of BIDS or not. BIDS validation does not do any validation of .nwb files that could be within a BIDS dataset.

if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)
else:
raise RuntimeError("Unable to get metadata from non-BIDS, non-NWB asset.")
return meta


@@ -949,6 +984,12 @@ def add_common_metadata(
"mtime %s of %s is in the future", metadata["blobDateModified"], path
)
metadata["contentSize"] = os.path.getsize(path)
if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum:
m = re.fullmatch(
r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest.value
)
if m:
metadata["contentSize"] = int(m["size"])
metadata.setdefault("wasGeneratedBy", []).append(
get_generator(start_time, end_time)
)
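The contentSize override at the end of this file can be checked in isolation; a worked example with a hypothetical zarr-checksum value of the form {md5}-{file count}--{total size in bytes}:

import re

digest_value = "d41d8cd98f00b204e9800998ecf8427e-42--123456"  # hypothetical value
m = re.fullmatch(
    r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest_value
)
assert m is not None
assert int(m["size"]) == 123456  # what add_common_metadata() stores as contentSize
assert int(m["files"]) == 42     # number of files in the Zarr, unused here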
2 changes: 1 addition & 1 deletion dandi/tests/test_upload.py
@@ -220,7 +220,7 @@ def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> None:
# Check existence of assets:
dandiset = bids_dandiset.dandiset
# file we created?
dandiset.get_asset_by_path("CHANGES")
dandiset.get_asset_by_path("README")
# BIDS descriptor file?
dandiset.get_asset_by_path("dataset_description.json")
# actual data file?
1 change: 1 addition & 0 deletions dandi/validate.py
@@ -68,6 +68,7 @@ def validate_bids(
origin = ValidationOrigin(
name="bidsschematools",
version=bidsschematools.__version__,
bids_version=validation_result["bids_version"],
)

# Storing variable to not re-compute set paths for each individual file.
1 change: 1 addition & 0 deletions dandi/validate_types.py
@@ -8,6 +8,7 @@
class ValidationOrigin:
name: str
version: str
bids_version: Optional[str] = None


class Severity(Enum):
Expand Down