Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add datalad task #594

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/testpydra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,25 @@ jobs:
- name: Update pip
run: python -m pip install --upgrade pip

- name: Set-up Git-annex
run: |
if [ "${{ runner.os }}" == "Linux" ]; then
wget -O- http://neuro.debian.net/lists/jammy.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list
sudo apt-key adv --recv-keys --keyserver hkps://keyserver.ubuntu.com 0xA5D32F012649A5A9
sudo apt-get update
sudo apt-get install git-annex-standalone
elif [ "${{ runner.os }}" == "macOS" ]; then
brew install git-annex
elif [ "${{ runner.os }}" == "Windows" ]; then
pip install datalad-installer
datalad-installer git-annex -m datalad/packages
fi

- name: Set-up git credentials
run: |
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"

- name: Determine installation target
run: |
if [[ "$INSTALL" = "sdist" ]]; then
Expand Down
13 changes: 13 additions & 0 deletions pydra/engine/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@
reason="sge not available",
)

need_gitannex = pytest.mark.skipif(
not (shutil.which("git-annex"))
or bool(
float(
sp.check_output(["git-annex", "version", "--raw"], universal_newlines=True)[
:6
]
)
< 8.20200309
),
reason="git-annex is not installed or version is less than 8.20200309",
)


def result_no_submitter(shell_task, plugin=None):
"""helper function to return result when running without submitter"""
Expand Down
184 changes: 184 additions & 0 deletions pydra/tasks/datalad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""A :obj:`~nipype.interfaces.utility.base.IdentityInterface` with a grafted Datalad getter."""
import os
import logging
import typing as ty
from pathlib import Path
from ..engine.specs import (
File,
Directory,
SpecInfo,
BaseSpec,
)
from ..engine.core import TaskBase
from ..engine.helpers import output_from_inputfields
from ..utils.messenger import AuditFlag

logger = logging.getLogger("pydra.tasks.datalad")

input_fields = [
(
"in_file",
str,
{
"help_string": "Path to the data to be downloaded through datalad",
"mandatory": True,
},
),
(
"dataset_path",
Directory,
{
"help_string": "Path to the dataset that will be used to get data",
"mandatory": True,
},
),
(
"dataset_url",
str,
{
"help_string": "URL to the dataset that will be used to get data",
},
),
]


output_fields = [
(
"out_file",
File,
{
"help_string": "file downloaded through datalad",
"requires": ["in_file"],
"output_file_template": "{in_file}",
},
)
]

# define a TaskBase calss
class DataladInterface(TaskBase):
"""A :obj:`~nipype.interfaces.utility.base.IdentityInterface` with a grafted Datalad getter."""

def __init__(
self,
name: str,
audit_flags: AuditFlag = AuditFlag.NONE,
cache_dir=None,
cache_locations=None,
input_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None,
output_spec: ty.Optional[ty.Union[SpecInfo, BaseSpec]] = None,
cont_dim=None,
messenger_args=None,
messengers=None,
rerun=False,
**kwargs,
):
"""Initialize a DataladInterface instance."""

self.input_spec = input_spec or SpecInfo(
name="Inputs", fields=input_fields, bases=(BaseSpec,)
)
self.output_spec = output_spec or SpecInfo(
name="Output", fields=output_fields, bases=(BaseSpec,)
)
self.output_spec = output_from_inputfields(self.output_spec, self.input_spec)
super().__init__(
name=name,
inputs=kwargs,
audit_flags=audit_flags,
cache_dir=cache_dir,
cache_locations=cache_locations,
cont_dim=cont_dim,
messenger_args=messenger_args,
messengers=messengers,
rerun=rerun,
)

def _run_task(self):
in_file = self.inputs.in_file
dataset_path = self.inputs.dataset_path

_dl_found = False
try:
import datalad.api as dl

_dl_found = True
except:
raise ImportError("Datalad is not installed.")

# checking if the dataset is already downloaded

if not (Path(dataset_path) / ".datalad").exists():
logger.info("Datalad interface without dataset path defined.")
try:
dataset_url = self.inputs.dataset_url
os.makedirs(dataset_path, exist_ok=True)
dl.install(source=dataset_url, path=dataset_path)
except Exception as e:
logger.error(e)
else:
ds = dl.Dataset(self.inputs.dataset_path)

# getting the file
ds.get(self.inputs.in_file)

# checking if the file was downloaded
if not Path(dataset_path, in_file).exists():
raise FileNotFoundError(f"File {in_file} not found in {dataset_path}")

_pth = Path(in_file)
if not _pth.is_absolute():
_pth = dataset_path / _pth

_datalad_candidate = _pth.is_symlink() and not _pth.exists()
if not _datalad_candidate:
logger.warning("datalad was required but not found")

if _datalad_candidate:
try:
result = dl.get(_pth, dataset=dataset_path)
except Exception as exc:
logger.warning(f"datalad get on {_pth} failed.")
## discussed with @djarecka, we keep it commented here for now
## do we still need it for pydra?
# if (
# config.environment.exec_env == "docker"
# and ("This repository is not initialized for use by git-annex, "
# "but .git/annex/objects/ exists") in f"{exc}"
# ):
# logger.warning(
# "Execution seems containerirzed with Docker, please make sure "
# "you are not running as root. To do so, please add the argument "
# "``-u $(id -u):$(id -g)`` to your command line."
# )
# else:
# logger.warning(str(exc))
else:
if result[0]["status"] == "error":
logger.warning(f"datalad get failed: {result}")

self.output_ = None
output = os.path.abspath(
os.path.join(self.inputs.dataset_path, self.inputs.in_file)
)
output_names = [el[0] for el in self.output_spec.fields]
if output is None:
self.output_ = {nm: None for nm in output_names}
elif len(output_names) == 1:
# if only one element in the fields, everything should be returned together
self.output_ = {output_names[0]: output}
elif isinstance(output, tuple) and len(output_names) == len(output):
self.output_ = dict(zip(output_names, output))
else:
raise RuntimeError(
f"expected {len(self.output_spec.fields)} elements, "
f"but {output} were returned"
)
# outputs = self.output_spec().get()
# outputs["out_file"] = os.path.abspath(os.path.join(self.inputs.dataset_path, self.inputs.in_file))

def _list_outputs(self):
outputs = self.output_spec().get()
outputs["out_file"] = os.path.abspath(
os.path.join(self.inputs.dataset_path, self.inputs.in_file)
)
return outputs
Empty file added pydra/tasks/tests/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions pydra/tasks/tests/test_datalad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import typing as ty
from pathlib import Path
import os, sys
import attr
import pytest


from ...tasks.datalad import DataladInterface
from ...engine.core import Workflow
from ...engine.submitter import Submitter
from ...engine.helpers import hash_value
from ...engine.tests.utils import need_gitannex


@need_gitannex
def test_datalad_interface(tmpdir):
"""
Testing datalad interface
"""
import datalad.api as dl

# change PosixPath to str
tmpdir = str(tmpdir)
# creating a dataset
ds = dl.Dataset(tmpdir).create()
ds.save()
ds_path = ds.pathobj

# creating a file to download
file_path = ds_path / "file.txt"
file_path.write_text("test")
ds.save()

tmpdir = Path(tmpdir)

# install the dataset to a new location
ds2 = dl.install(source=tmpdir, path=tmpdir / "ds2")
ds2_path = ds2.pathobj

# use datalad interface to download the file
dl_interface = DataladInterface(
name="dl_interface", in_file="file.txt", dataset_path=ds2_path
)
# running the task
res = dl_interface()

assert os.path.exists(res.output.out_file)
assert os.path.basename(res.output.out_file) == "file.txt"


# Path: pydra/tasks/tests/test_datalad.py
# Compare this snippet from pydra/tasks/datalad.py:
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ test = [
"tornado",
"boutiques",
"pympler",
"datalad",
]
# Aliases
tests = ["pydra[test]"]
Expand Down