Tests for ls reinstated, underlying function fixed, support for ZARR-BIDS files added. #1164

Merged: 37 commits, Dec 12, 2022
Commits (37)
d3ec5f1
Unravelling `ls` issues
TheChymera Nov 23, 2022
dc34ab0
strange `Cannot create weak reference to PosixPath object`
TheChymera Nov 23, 2022
8aef541
Fixed weakref issue
TheChymera Nov 28, 2022
b3a4b9a
Added readme listing and removed some diagnostic print calls
TheChymera Nov 28, 2022
3272c86
Removed more debugging print calls
TheChymera Nov 28, 2022
ba99146
Removed more debugging print calls
TheChymera Nov 28, 2022
64677c1
Creating and passing digests
TheChymera Nov 28, 2022
f8a0da8
Added validation bids schema version to result object metadata
TheChymera Nov 28, 2022
430fb06
Assigning bids schema version for validation to metadata dict
TheChymera Nov 28, 2022
279ecf9
Trying to insert bids_schema_version ... somewhere.
TheChymera Nov 29, 2022
60b2408
Added (get_validation)_bids_version to BIDS assets
TheChymera Nov 29, 2022
4a4530e
Creating ZARR digest for ZARR files
TheChymera Nov 30, 2022
9df52ad
Using correct ZARR digest class (still not working)
TheChymera Dec 1, 2022
0e0ff8e
More debugging print calls trying to track down digest error
TheChymera Dec 1, 2022
cc28fbb
Removed unneeded import
TheChymera Dec 1, 2022
5d59ae8
Using zarr extensions variable from constants module
TheChymera Dec 1, 2022
5d62fcc
Reinstated fake digest after introducing ZARR conditional
TheChymera Dec 1, 2022
f0de103
Attempting to fix digest type conflict (still not working)
TheChymera Dec 1, 2022
86ae9c0
Use Zarr checksum to set metadata contentSize
jwodder Dec 1, 2022
9866d0c
Removed debugging print calls
TheChymera Dec 2, 2022
785dec0
Not requiring bids_version for ValidatorOrigin
TheChymera Dec 2, 2022
2059dd7
Typing fixes and improved variable name
TheChymera Dec 2, 2022
a2119ec
Attempting type check fix for missing attribute
TheChymera Dec 2, 2022
722e804
Removed commented imports
TheChymera Dec 2, 2022
3649b5e
Actually fix typing
jwodder Dec 2, 2022
32d0379
Debugging upload tests
TheChymera Dec 2, 2022
ba7d60c
Reinstating test
TheChymera Dec 5, 2022
e8a64e1
Associate BIDS asset errors with the actual assets
jwodder Dec 5, 2022
ccda9ec
Fixed novel README validation error
TheChymera Dec 5, 2022
44a8ab4
Checking ls output subject identifier
TheChymera Dec 5, 2022
efab43e
Added ls command tests for zarrBIDS file
TheChymera Dec 5, 2022
1ed0f5f
ZARR-appropriate fake checksum
TheChymera Dec 6, 2022
9900841
More compact code
TheChymera Dec 6, 2022
3384ef5
Dot in extension
TheChymera Dec 6, 2022
d9ccf8b
Safer assert statement
TheChymera Dec 6, 2022
375947d
Making sure the method which sets the attribute has been run
TheChymera Dec 6, 2022
11c18f5
Updated logic to better accommodate future BIDS NWB examples
TheChymera Dec 7, 2022
17 changes: 15 additions & 2 deletions dandi/cli/cmd_ls.py
@@ -5,6 +5,7 @@
import click

from .base import devel_option, lgr, map_to_click_exceptions
from ..consts import ZARR_EXTENSIONS, metadata_all_fields
from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from ..misctypes import Digest
from ..utils import is_url
@@ -92,7 +93,6 @@ def ls(
PYOUTFormatter,
YAMLFormatter,
)
from ..consts import metadata_all_fields

# TODO: avoid
from ..support.pyout import PYOUT_SHORT_NAMES_rev
@@ -358,7 +358,20 @@ def fn():
digest=Digest.dandi_etag(digest),
).json_dict()
else:
rec = get_metadata(path)
if path.endswith(tuple(ZARR_EXTENSIONS)):
if use_fake_digest:
digest = "0" * 32 + "-0--0"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="zarr-checksum")
rec = get_metadata(path, Digest.dandi_zarr(digest))
else:
if use_fake_digest:
digest = "0" * 32 + "-1"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="dandi-etag")
rec = get_metadata(path, Digest.dandi_etag(digest))
except Exception as exc:
_add_exc_error(path, rec, errors, exc)
if flatten:
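For orientation, a minimal sketch (not part of the diff) of the two placeholder digest formats that the use_fake_digest branches above construct; the zarr-checksum shape matches the pattern that add_common_metadata() parses later in this PR:

import re

# Placeholder digests built exactly as in the ls() branches above.
fake_zarr_digest = "0" * 32 + "-0--0"  # zarr-checksum: {md5}-{file count}--{total size}
fake_etag_digest = "0" * 32 + "-1"     # dandi-etag:    {md5}-{number of parts}

# The zarr-checksum shape is the one add_common_metadata() later parses
# to recover contentSize from the digest value.
zarr_checksum_re = re.compile(r"[0-9a-f]{32}-[0-9]+--[0-9]+")
assert zarr_checksum_re.fullmatch(fake_zarr_digest)
assert not zarr_checksum_re.fullmatch(fake_etag_digest)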
25 changes: 23 additions & 2 deletions dandi/cli/tests/test_ls.py → dandi/cli/tests/test_cmd_ls.py
@@ -49,16 +49,37 @@ def load(s):
assert metadata[f] == simple1_nwb_metadata[f]


def test_ls_nwb_file(simple2_nwb):
bids_file_path = "simple2.nwb"
bids_file_path = os.path.join(simple2_nwb, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1


@mark.skipif_no_network
@pytest.mark.xfail(reason="https://github.com/dandi/dandi-cli/issues/1097")
def test_ls_bids_file(bids_examples):
bids_file_path = "asl003/sub-Sub1/anat/sub-Sub1_T1w.nii.gz"
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["subject_id"] == "Sub1"
assert data[0]["identifier"] == "Sub1"


@mark.skipif_no_network
def test_ls_zarrbids_file(bids_examples):
bids_file_path = (
"micr_SEMzarr/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr"
)
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["identifier"] == "01"


@mark.skipif_no_network
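A note on the expected values in these tests: identifier is the BIDS subject label with the sub- prefix stripped, so sub-01_... yields "01" and sub-Sub1_... yields "Sub1". A rough, hypothetical illustration of that extraction (in the real code path the label comes out of bidsschematools during validation, not from filename parsing):

def subject_label(bids_filename: str) -> str:
    # Hypothetical helper for illustration only; dandi-cli gets the label
    # from bidsschematools validation results, not from this parsing.
    for entity in bids_filename.split("_"):
        key, _, value = entity.partition("-")
        if key == "sub":
            return value
    raise ValueError(f"no subject entity in {bids_filename!r}")

assert subject_label("sub-01_ses-01_sample-A_SPIM.ome.zarr") == "01"
assert subject_label("sub-Sub1_T1w.nii.gz") == "Sub1"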
44 changes: 43 additions & 1 deletion dandi/files/bids.py
@@ -12,6 +12,7 @@

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE
from ..metadata import add_common_metadata, prepare_metadata
from ..misctypes import Digest
from ..validate_types import ValidationResult
@@ -45,6 +46,12 @@ class BIDSDatasetDescriptionAsset(LocalFileAsset):
#: populated by `_validate()`
_asset_metadata: Optional[dict[str, dict[str, Any]]] = None

#: Version of BIDS used for the validation;
#: populated by `_validate()`
#: In future this might be removed and the information included in the
#: BareAsset via dandischema.
_bids_version: Optional[str] = None

#: Threading lock needed in case multiple assets are validated in parallel
#: during upload
_lock: Lock = field(init=False, default_factory=Lock, repr=False, compare=False)
@@ -65,6 +72,22 @@ def _validate(self) -> None:
bids_paths = [str(self.filepath)] + [
str(asset.filepath) for asset in self.dataset_files
]
# This is an ad-hoc fix which should be removed once bidsschematools greater than
# 0.6.0 is released.
# It won't cause any trouble afterwards, but it will no longer fulfill any
# purpose. The issue is that README* is still required and if we don't
# include it explicitly in the listing validation will implicitly fail, even
# if the file is present.
readme_extensions = ["", ".md", ".rst", ".txt"]
for ext in readme_extensions:
readme_candidate = self.bids_root / Path("README" + ext)
if (
readme_candidate.exists()
and str(readme_candidate) not in bids_paths
):
bids_paths += [str(readme_candidate)]
# end of ad-hoc fix.

results = validate_bids(*bids_paths)
self._dataset_errors: list[ValidationResult] = []
self._asset_errors: dict[str, list[ValidationResult]] = defaultdict(
@@ -74,7 +97,8 @@
for result in results:
if result.id in BIDS_ASSET_ERRORS:
assert result.path
self._asset_errors[str(result.path)].append(result)
bids_path = result.path.relative_to(self.bids_root).as_posix()
self._asset_errors[bids_path].append(result)
elif result.id in BIDS_DATASET_ERRORS:
self._dataset_errors.append(result)
elif result.id == "BIDS.MATCH":
@@ -84,6 +108,7 @@
self._asset_metadata[bids_path] = prepare_metadata(
result.metadata
)
self._bids_version = result.origin.bids_version

def get_asset_errors(self, asset: BIDSAsset) -> list[ValidationResult]:
""":meta private:"""
Expand Down Expand Up @@ -174,6 +199,11 @@ def get_metadata(
metadata["path"] = self.path
return BareAsset(**metadata)

def get_validation_bids_version(self) -> str:
self.bids_dataset_description._validate()
assert self.bids_dataset_description._bids_version is not None
return self.bids_dataset_description._bids_version


class NWBBIDSAsset(BIDSAsset, NWBAsset):
"""
@@ -219,6 +249,18 @@ def get_validation_errors(
self, schema_version, devel_debug
) + BIDSAsset.get_validation_errors(self)

def get_metadata(
self,
digest: Optional[Digest] = None,
ignore_errors: bool = True,
) -> BareAsset:
metadata = self.bids_dataset_description.get_asset_metadata(self)
start_time = end_time = datetime.now().astimezone()
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata["path"] = self.path
metadata["encodingFormat"] = ZARR_MIME_TYPE
return BareAsset(**metadata)


class GenericBIDSAsset(BIDSAsset, GenericAsset):
"""
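The README workaround in _validate() above can be read in isolation; a self-contained sketch of the same loop, assuming bids_root is the dataset root and bids_paths is the file listing passed to the validator:

from pathlib import Path

def add_readme_candidates(bids_root: Path, bids_paths: list[str]) -> list[str]:
    # Mirrors the ad-hoc fix above: explicitly list any README file so that
    # bidsschematools <= 0.6.0 does not report the (present) README as missing.
    for ext in ("", ".md", ".rst", ".txt"):
        candidate = bids_root / f"README{ext}"
        if candidate.exists() and str(candidate) not in bids_paths:
            bids_paths.append(str(candidate))
    return bids_paths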
135 changes: 88 additions & 47 deletions dandi/metadata.py
@@ -28,6 +28,7 @@
import tenacity

from . import __version__, get_logger
from .consts import ZARR_EXTENSIONS, metadata_all_fields
from .dandiset import Dandiset
from .misctypes import Digest
from .pynwb_utils import (
@@ -38,14 +39,22 @@
metadata_cache,
nwb_has_external_links,
)
from .utils import ensure_datetime, get_mime_type, get_utcnow_datetime
from .utils import (
ensure_datetime,
find_parent_directory_containing,
get_mime_type,
get_utcnow_datetime,
)

lgr = get_logger()


# Disable this for clean hacking
@metadata_cache.memoize_path
def get_metadata(path: Union[str, Path]) -> Optional[dict]:
def get_metadata(
path: Union[str, Path],
digest: Optional[Digest] = None,
) -> Optional[dict]:
"""
Get "flatdata" from a .nwb file or a Dandiset directory

@@ -59,64 +68,90 @@ def get_metadata(path: Union[str, Path]) -> Optional[dict]:
-------
dict
"""

from .files import bids, dandi_file, find_bids_dataset_description

# when we run in parallel, these annoying warnings appear
ignore_benign_pynwb_warnings()
path = str(path) # for Path
path = os.path.abspath(str(path)) # for Path
meta = dict()

if op.isdir(path):
if op.isdir(path) and not path.endswith(tuple(ZARR_EXTENSIONS)):
try:
dandiset = Dandiset(path)
return cast(dict, dandiset.metadata)
except ValueError as exc:
lgr.debug("Failed to get metadata for %s: %s", path, exc)
return None

if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
# Is the data BIDS (as defined by the presence of a BIDS dataset descriptor)
bids_dataset_description = find_bids_dataset_description(path)
if bids_dataset_description:
dandiset_path = find_parent_directory_containing("dandiset.yaml", path)
bids_dataset_description = find_bids_dataset_description(path)
df = dandi_file(
Path(path),
dandiset_path,
bids_dataset_description=bids_dataset_description,
)
path_metadata = df.get_metadata(digest=digest)
assert isinstance(df, bids.BIDSAsset)
meta["bids_version"] = df.get_validation_bids_version()
# there might be a more elegant way to do this:
for key in metadata_all_fields:
try:
value = getattr(path_metadata.wasAttributedTo[0], key)
except AttributeError:
pass
else:
meta[key] = value
elif path.endswith((".NWB", ".nwb")):
Review comment (Member):

Suggested change (replace the first line with the second):
elif path.endswith((".NWB", ".nwb")):
if path.endswith((".NWB", ".nwb")):

In relation to prior comment #1164 (comment) -- the point is to validate .nwb files regardless of whether they are part of BIDS or not. BIDS validation does not do any validation of .nwb files that could be within a BIDS dataset.

if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)
else:
raise RuntimeError("Unable to get metadata from non-BIDS, non-NWB asset.")
return meta


@@ -949,6 +984,12 @@ def add_common_metadata(
"mtime %s of %s is in the future", metadata["blobDateModified"], path
)
metadata["contentSize"] = os.path.getsize(path)
if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum:
m = re.fullmatch(
r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest.value
)
if m:
metadata["contentSize"] = int(m["size"])
metadata.setdefault("wasGeneratedBy", []).append(
get_generator(start_time, end_time)
)
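The contentSize override at the end of this file can be checked in isolation; a worked example with a hypothetical zarr-checksum value of the form {md5}-{file count}--{total size in bytes}:

import re

digest_value = "d41d8cd98f00b204e9800998ecf8427e-42--123456"  # hypothetical value
m = re.fullmatch(
    r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest_value
)
assert m is not None
assert int(m["size"]) == 123456  # what add_common_metadata() stores as contentSize
assert int(m["files"]) == 42     # number of files in the Zarr, unused here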
2 changes: 1 addition & 1 deletion dandi/tests/test_upload.py
@@ -220,7 +220,7 @@ def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> None:
# Check existence of assets:
dandiset = bids_dandiset.dandiset
# file we created?
dandiset.get_asset_by_path("CHANGES")
dandiset.get_asset_by_path("README")
# BIDS descriptor file?
dandiset.get_asset_by_path("dataset_description.json")
# actual data file?
1 change: 1 addition & 0 deletions dandi/validate.py
@@ -68,6 +68,7 @@ def validate_bids(
origin = ValidationOrigin(
name="bidsschematools",
version=bidsschematools.__version__,
bids_version=validation_result["bids_version"],
)

# Storing variable to not re-compute set paths for each individual file.
1 change: 1 addition & 0 deletions dandi/validate_types.py
@@ -8,6 +8,7 @@
class ValidationOrigin:
name: str
version: str
bids_version: Optional[str] = None


class Severity(Enum):
Expand Down