Add validation of filepaths for non-BIDS NWB assets #1173

Merged · 2 commits · Dec 14, 2022
14 changes: 7 additions & 7 deletions dandi/cli/tests/test_cmd_validate.py
@@ -1,5 +1,6 @@
import json
import os
from pathlib import Path

from click.testing import CliRunner
import pytest
@@ -50,17 +51,16 @@ def test_validate_bids_grouping_error(bids_error_examples, dataset="invalid_asl0
assert dataset in r.output


def test_validate_nwb_path_grouping(simple4_nwb):
def test_validate_nwb_path_grouping(organized_nwb_dir3: Path) -> None:
"""
This is currently a placeholder test, and should be updated once we have paths with
multiple errors for which grouping functionality can actually be tested.
This is currently a placeholder test and should be updated once we have
paths with multiple errors for which grouping functionality can actually be
tested.
"""

r = CliRunner().invoke(validate, ["--grouping=path", simple4_nwb])
r = CliRunner().invoke(validate, ["--grouping=path", str(organized_nwb_dir3)])
assert r.exit_code == 0

# Does it give required warnings for required path?
assert simple4_nwb in r.output
assert str(organized_nwb_dir3 / "sub-mouse001" / "sub-mouse001.nwb") in r.output
assert "NWBI.check_data_orientation" in r.output


4 changes: 2 additions & 2 deletions dandi/cli/tests/test_command.py
@@ -10,9 +10,9 @@


@pytest.mark.parametrize("command", (ls, validate))
def test_smoke(simple2_nwb, command):
def test_smoke(organized_nwb_dir, command):
runner = CliRunner()
r = runner.invoke(command, [simple2_nwb])
r = runner.invoke(command, [str(organized_nwb_dir)])
assert r.exit_code == 0, f"Exited abnormally. out={r.stdout}"
assert r.stdout, "There were no output whatsoever"

5 changes: 3 additions & 2 deletions dandi/files/__init__.py
@@ -175,18 +175,19 @@ def dandi_file(
"""
filepath = Path(filepath)
if dandiset_path is not None:
dandiset_path = Path(dandiset_path)
path = filepath.relative_to(dandiset_path).as_posix()
if path == ".":
raise ValueError("Dandi file path cannot equal Dandiset path")
else:
path = filepath.name
if filepath.is_file() and path == dandiset_metadata_file:
return DandisetMetadataFile(filepath=filepath)
return DandisetMetadataFile(filepath=filepath, dandiset_path=dandiset_path)
if bids_dataset_description is None:
factory = DandiFileFactory()
else:
factory = BIDSFileFactory(bids_dataset_description)
return factory(filepath, path)
return factory(filepath, path, dandiset_path)


def find_bids_dataset_description(
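
A minimal sketch of how the updated dandi_file() entry point behaves with this change (not part of the PR; the Dandiset root and file below are hypothetical): the relative asset path is computed as before, but the Dandiset root is now kept on the returned object instead of being discarded.

from pathlib import Path
from dandi.files import dandi_file

root = Path("/data/000123")  # hypothetical local Dandiset root
asset = dandi_file(root / "sub-mouse001" / "sub-mouse001.nwb", dandiset_path=root)
# asset.path is the POSIX-style relative path "sub-mouse001/sub-mouse001.nwb";
# asset.dandiset_path now records root, for use by validation later on.
# Passing the root itself as the file path still raises
# ValueError("Dandi file path cannot equal Dandiset path").
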
19 changes: 13 additions & 6 deletions dandi/files/_private.py
@@ -4,7 +4,7 @@
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import ClassVar
from typing import ClassVar, Optional
import weakref

from dandi.consts import (
@@ -65,9 +65,11 @@ class DandiFileFactory:
DandiFileType.BIDS_DATASET_DESCRIPTION: BIDSDatasetDescriptionAsset,
}

def __call__(self, filepath: Path, path: str) -> DandiFile:
def __call__(
self, filepath: Path, path: str, dandiset_path: Optional[Path]
) -> DandiFile:
return self.CLASSES[DandiFileType.classify(filepath)](
filepath=filepath, path=path
filepath=filepath, path=path, dandiset_path=dandiset_path
)


@@ -84,16 +86,21 @@ class BIDSFileFactory(DandiFileFactory):
DandiFileType.GENERIC: GenericBIDSAsset,
}

def __call__(self, filepath: Path, path: str) -> DandiFile:
def __call__(
self, filepath: Path, path: str, dandiset_path: Optional[Path]
) -> DandiFile:
ftype = DandiFileType.classify(filepath)
if ftype is DandiFileType.BIDS_DATASET_DESCRIPTION:
if filepath == self.bids_dataset_description.filepath:
return self.bids_dataset_description
else:
return BIDSDatasetDescriptionAsset(filepath=filepath, path=path)
return BIDSDatasetDescriptionAsset(
filepath=filepath, path=path, dandiset_path=dandiset_path
)
df = self.CLASSES[ftype](
filepath=filepath,
path=path,
dandiset_path=dandiset_path,
bids_dataset_description_ref=weakref.ref(self.bids_dataset_description),
)
self.bids_dataset_description.dataset_files.append(df)
@@ -102,5 +109,5 @@ def __call__(self, filepath: Path, path: str) -> DandiFile:

def is_empty_zarr(path: Path) -> bool:
""":meta private:"""
zf = ZarrAsset(filepath=path, path=path.name)
zf = ZarrAsset(filepath=path, path=path.name, dandiset_path=None)
return not any(zf.iterfiles())
31 changes: 29 additions & 2 deletions dandi/files/bases.py
@@ -29,6 +29,7 @@
from dandi.dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient
from dandi.metadata import get_default_metadata, nwb2asset
from dandi.misctypes import DUMMY_DIGEST, Digest, P
from dandi.organize import validate_organized_path

Check notice (Code scanning / CodeQL): Cyclic import: import of module dandi.organize begins an import cycle.
from dandi.pynwb_utils import validate as pynwb_validate
from dandi.support.digests import get_dandietag, get_digest
from dandi.utils import yaml_load
@@ -56,6 +57,9 @@ class DandiFile(ABC):
#: The path to the actual file or directory on disk
filepath: Path

#: The path to the root of the Dandiset, if there is one
dandiset_path: Optional[Path]

@property
def size(self) -> int:
"""The size of the file"""
@@ -190,7 +194,7 @@ def get_validation_errors(
path=self.filepath, # note that it is not relative .path
message=str(e),
# TODO? dataset_path=dataset_path,
# TODO? dandiset_path=dandiset_path,
dandiset_path=self.dandiset_path,
)
]
except Exception as e:
@@ -215,7 +219,7 @@ def get_validation_errors(
path=self.filepath, # note that it is not relative .path
message=f"Failed to read metadata: {e}",
# TODO? dataset_path=dataset_path,
# TODO? dandiset_path=dandiset_path,
dandiset_path=self.dandiset_path,
)
]
return []
@@ -537,6 +541,29 @@ def get_validation_errors(
raise
# TODO: might reraise instead of making it into an error
return _pydantic_errors_to_validation_results([e], str(self.filepath))

from .bids import NWBBIDSAsset

Check notice (Code scanning / CodeQL): Cyclic import: import of module dandi.files.bids begins an import cycle.

if not isinstance(self, NWBBIDSAsset):
if self.dandiset_path is None:
errors.append(
ValidationResult(
id="DANDI.NO_DANDISET_FOUND",
origin=ValidationOrigin(
name="dandi", version=dandi.__version__
),
severity=Severity.ERROR,
scope=Scope.FILE,
path=self.filepath,
message="File is not inside a Dandiset",
)
)
else:
errors.extend(
validate_organized_path(
self.path, self.filepath, self.dandiset_path
)
)
return errors


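
Rough illustration of the new branch in get_validation_errors() for non-BIDS NWB assets (a sketch, not code from the PR; the paths are hypothetical and the files are assumed to be otherwise readable):

from pathlib import Path
from dandi.files import dandi_file

# NWB file outside of any Dandiset: dandiset_path stays None, so validation
# now also reports "DANDI.NO_DANDISET_FOUND".
stray = dandi_file(Path("/tmp/recording1.nwb"))

# NWB file inside a Dandiset but not laid out as sub-<label>/sub-<label>_*.nwb:
# validate_organized_path() contributes "DANDI.NON_DANDI_FILENAME" and/or
# "DANDI.NON_DANDI_FOLDERNAME" to the results of get_validation_errors().
root = Path("/data/000123")
misplaced = dandi_file(root / "recording1.nwb", dandiset_path=root)
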
83 changes: 81 additions & 2 deletions dandi/organize.py
@@ -2,19 +2,21 @@
ATM primarily a sandbox for some functionality for dandi organize
"""

from __future__ import annotations

import binascii
from collections import Counter
from copy import deepcopy
import os
import os.path as op
from pathlib import Path
from pathlib import Path, PurePosixPath
import re
from typing import List
import uuid

import numpy as np

from . import get_logger
from . import __version__, get_logger
from .dandiset import Dandiset
from .exceptions import OrganizeImpossibleError
from .metadata import get_metadata
@@ -36,6 +38,7 @@
move_file,
yaml_load,
)
from .validate_types import Scope, Severity, ValidationOrigin, ValidationResult

lgr = get_logger()

@@ -1025,3 +1028,79 @@ def msg_(msg, n, cond=None):
msg_(" %d invalid not considered.", skip_invalid),
dandiset_path.rstrip("/"),
)


LABELREGEX = r"[^_*\\/<>:|\"'?%@;.]+"
ORGANIZED_FILENAME_REGEX = (
rf"sub-{LABELREGEX}"
rf"(_ses-{LABELREGEX})?"
rf"(_(tis|slice|cell|probe|obj)-{LABELREGEX})*"
r"(_[a-z]+(\+[a-z]+)*)?"
r"\.nwb"
)
ORGANIZED_FOLDER_REGEX = rf"sub-{LABELREGEX}"


def validate_organized_path(
asset_path: str, filepath: Path, dandiset_path: Path
) -> list[ValidationResult]:
"""
:param str asset_path:
The forward-slash-separated path to the asset within its local Dandiset
(i.e., relative to the Dandiset's root)
:param pathlib.Path filepath:
The actual filesystem path of the asset (used to construct
`ValidationResult` objects)
:param pathlib.Path dandiset_path:
The path to the root of the Dandiset (used to construct
`ValidationResult` objects)
"""
path = PurePosixPath(asset_path)
if path.suffix != ".nwb":
return []
errors = []
if not re.fullmatch(ORGANIZED_FILENAME_REGEX, path.name):
errors.append(
ValidationResult(
id="DANDI.NON_DANDI_FILENAME",
origin=ValidationOrigin(name="dandi", version=__version__),
severity=Severity.ERROR,
scope=Scope.FILE,
path=filepath,
message="Filename does not conform to Dandi standard",
path_regex=ORGANIZED_FILENAME_REGEX,
dandiset_path=dandiset_path,
)
)
if not (
len(path.parent.parts) == 1
and re.fullmatch(ORGANIZED_FOLDER_REGEX, str(path.parent))
):
errors.append(
ValidationResult(
id="DANDI.NON_DANDI_FOLDERNAME",
origin=ValidationOrigin(name="dandi", version=__version__),
severity=Severity.ERROR,
scope=Scope.FOLDER,
path=filepath,
message="File is not in folder at root with subject name",
path_regex=ORGANIZED_FOLDER_REGEX,
dandiset_path=dandiset_path,
)
)
if not errors:
m = re.match(ORGANIZED_FOLDER_REGEX, path.name)
assert m
if str(path.parent) != m[0]:
errors.append(
ValidationResult(
id="DANDI.METADATA_MISMATCH_SUBJECT",
origin=ValidationOrigin(name="dandi", version=__version__),
severity=Severity.ERROR,
scope=Scope.FILE,
path=filepath,
message="Filename subject does not match folder name subject",
dandiset_path=dandiset_path,
)
)
return errors
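
Quick illustration of what the new regexes accept and reject (not part of the PR; assumes a dandi checkout that includes this change so the names can be imported):

import re
from dandi.organize import ORGANIZED_FILENAME_REGEX, ORGANIZED_FOLDER_REGEX

# An organize-style name with subject, session, and a modality suffix conforms:
assert re.fullmatch(ORGANIZED_FILENAME_REGEX, "sub-mouse001_ses-day1_ecephys.nwb")
# A free-form name does not (no leading "sub-<label>"):
assert re.fullmatch(ORGANIZED_FILENAME_REGEX, "recording1.nwb") is None
# Folders must be a single top-level "sub-<label>" directory:
assert re.fullmatch(ORGANIZED_FOLDER_REGEX, "sub-mouse001")
assert re.fullmatch(ORGANIZED_FOLDER_REGEX, "ses-day1") is None
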
19 changes: 15 additions & 4 deletions dandi/tests/fixtures.py
@@ -150,11 +150,9 @@ def simple3_nwb(


@pytest.fixture(scope="session")
def simple4_nwb(
simple1_nwb_metadata: Dict[str, Any], tmp_path_factory: pytest.TempPathFactory
) -> str:
def simple4_nwb(tmp_path_factory: pytest.TempPathFactory) -> str:
"""
With, subject, subject_id, species, but including data orientation ambiguity,
With subject, subject_id, species, but including data orientation ambiguity,
the only currently non-critical issue in the dandi schema for nwbinspector validation:
NWBI.check_data_orientation
https://github.com/NeurodataWithoutBorders/nwbinspector/blob/
@@ -229,6 +227,19 @@ def organized_nwb_dir2(
return tmp_path


@pytest.fixture(scope="session")
def organized_nwb_dir3(
simple4_nwb: str, tmp_path_factory: pytest.TempPathFactory
) -> Path:
tmp_path = tmp_path_factory.mktemp("organized_nwb_dir")
(tmp_path / dandiset_metadata_file).write_text("{}\n")
r = CliRunner().invoke(
organize, ["-f", "copy", "--dandiset-path", str(tmp_path), str(simple4_nwb)]
)
assert r.exit_code == 0, r.stdout
return tmp_path


if TYPE_CHECKING:
from ..support.typing import Literal
