Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datalad sensitive marking fixes #739

Merged
merged 4 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions heudiconv/external/dlad.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,25 @@ def add_to_datalad(
message="Added gitattributes to place all .heudiconv content"
" under annex",
)
ds.save(
save_res = ds.save(
".",
recursive=True
# not in effect! ?
# annex_add_opts=['--include-dotfiles']
)
annexed_files = [sr["path"] for sr in save_res if sr.get("key", None)]

# TODO: filter for only changed files?
# Provide metadata for sensitive information
mark_sensitive(ds, "sourcedata")
mark_sensitive(ds, "*_scans.tsv") # top level
mark_sensitive(ds, "*/*_scans.tsv") # within subj
mark_sensitive(ds, "*/*/*_scans.tsv") # within sess/subj
mark_sensitive(ds, "*/anat") # within subj
mark_sensitive(ds, "*/*/anat") # within ses/subj
sensitive_patterns = [
"sourcedata",
"*_scans.tsv", # top level
"*/*_scans.tsv", # within subj
"*/*/*_scans.tsv", # within sess/subj
"*/anat", # within subj
"*/*/anat", # within ses/subj
]
for sp in sensitive_patterns:
mark_sensitive(ds, sp, annexed_files)
if dsh_path:
mark_sensitive(ds, ".heudiconv") # entire .heudiconv!
superds.save(path=ds.path, message=msg, recursive=True)
Expand All @@ -178,26 +182,30 @@ def add_to_datalad(
"""


def mark_sensitive(ds: Dataset, path_glob: str) -> None:
def mark_sensitive(ds: Dataset, path_glob: str, files: list[str] | None = None) -> None:
"""

Parameters
----------
ds : Dataset to operate on
path_glob : str
glob of the paths within dataset to work on
files : list[str]
subset of files to mark

Returns
-------
None
"""
paths = glob(op.join(ds.path, path_glob))
if files:
paths = [p for p in paths if p in files]
if not paths:
return
lgr.debug("Marking %d files with distribution-restrictions field", len(paths))
# set_metadata can be a bloody generator
res = ds.repo.set_metadata(
paths, init=dict([("distribution-restrictions", "sensitive")]), recursive=True
paths, add=dict([("distribution-restrictions", "sensitive")]), recursive=True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I was worried that, in the case of --overwrite, we might keep adding multiple distribution-restrictions=sensitive values... but it seems that the same value does not get duplicated; only a new (different) value would be added:

❯ git annex metadata --set distribution-restrictions+=sensitive sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-47
  lastchanged=2024-02-24@00-12-47
ok
(recording state in git...)
❯ git annex metadata --set distribution-restrictions+=sensitive sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-49
  lastchanged=2024-02-24@00-12-49
ok
(recording state in git...)
❯ git annex metadata  sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb
metadata sub-YutaMouse20_ses-YutaMouse20-140321_behavior+ecephys.nwb 
  distribution-restrictions=sensitive
  distribution-restrictions-lastchanged=2024-02-24@00-12-49
  lastchanged=2024-02-24@00-12-49

Only the timestamp would be changed ... and since we are filtering for only the saved files, I think that should be fine.

)
if inspect.isgenerator(res):
res = list(res)
21 changes: 21 additions & 0 deletions heudiconv/external/tests/test_dlad.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,24 @@ def test_mark_sensitive(tmp_path: Path) -> None:
# g2 since the same content
assert not all_meta.pop("g1", None) # nothing or empty record
assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec}


def test_mark_sensitive_subset(tmp_path: Path) -> None:
    """Check that ``mark_sensitive`` honours an explicit subset of files.

    Four files are created in a fresh dataset; ``f1`` and ``g2`` carry the
    same content ("d1").  The glob ``"f*"`` matches both ``f1`` and ``f2``,
    but only ``f1`` is listed in the ``files`` argument, so ``f2`` is
    expected to stay unmarked.
    """
    dataset = dl.Dataset(tmp_path).create(force=True)
    tree_content = {
        "f1": "d1",
        "f2": "d2",
        "g1": "d3",
        "g2": "d1",
    }
    create_tree(str(tmp_path), tree_content)
    dataset.save(".")

    # Restrict marking to f1 even though the glob would also match f2.
    only_f1 = [str(tmp_path / "f1")]
    mark_sensitive(dataset, "f*", only_f1)

    metadata_by_file = dict(dataset.repo.get_metadata("."))
    sensitive_rec = {"distribution-restrictions": ["sensitive"]}

    # g1 (not matched by the glob) and f2 (matched by the glob but not in
    # the subset) must have nothing or an empty record.
    for unmarked in ("g1", "f2"):
        assert not metadata_by_file.pop(unmarked, None)

    # g2 shows up as marked too since it has the same content as f1.
    assert metadata_by_file == {"f1": sensitive_rec, "g2": sensitive_rec}
Loading