Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repository.copy_tree: omit subdirectories from path when copying #5648

Merged
merged 2 commits into from
Sep 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion aiida/engine/daemon/execmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,10 @@ def upload_calculation(
if data_node.base.repository.get_object(filename_source).file_type == FileType.DIRECTORY:
# If the source object is a directory, we copy its entire contents
data_node.base.repository.copy_tree(filepath_target, filename_source)
provenance_exclude_list.extend(data_node.base.repository.list_object_names(filename_source))
sources = data_node.base.repository.list_object_names(filename_source)
if filename_target:
sources = [str(pathlib.Path(filename_target) / subpath) for subpath in sources]
provenance_exclude_list.extend(sources)
else:
# Otherwise, simply copy the file
with folder.open(target, 'wb') as handle:
Expand Down
8 changes: 6 additions & 2 deletions aiida/repository/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,10 @@ def walk(self, path: FilePath = None) -> Iterable[Tuple[pathlib.PurePosixPath, L
def copy_tree(self, target: Union[str, pathlib.Path], path: FilePath = None) -> None:
"""Copy the contents of the entire node repository to another location on the local file system.

.. note:: If ``path`` is specified, only its contents are copied, and the relative path with respect to the
root is discarded. For example, if ``path`` is ``relative/sub``, the contents of ``sub`` will be copied
directly to the target, without the ``relative/sub`` directory.

:param target: absolute path of the directory where to copy the contents to.
:param path: optional relative path whose contents to copy.
:raises TypeError: if ``target`` is of incorrect type or not absolute.
Expand All @@ -509,11 +513,11 @@ def copy_tree(self, target: Union[str, pathlib.Path], path: FilePath = None) ->

for root, dirnames, filenames in self.walk(path):
for dirname in dirnames:
dirpath = target / root / dirname
dirpath = target / (root / dirname).relative_to(path)
dirpath.mkdir(parents=True, exist_ok=True)

for filename in filenames:
dirpath = target / root
dirpath = target / root.relative_to(path)
filepath = dirpath / filename

dirpath.mkdir(parents=True, exist_ok=True)
Expand Down
86 changes: 71 additions & 15 deletions tests/engine/daemon/test_execmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

import pytest

from aiida.common.datastructures import CalcInfo, CodeInfo
from aiida.engine.daemon import execmanager
from aiida.orm import CalcJobNode, FolderData, SinglefileData
from aiida.transports.plugins.local import LocalTransport


Expand Down Expand Up @@ -75,6 +77,34 @@ def file_hierarchy():
}


@pytest.fixture
def file_hierarchy_simple():
"""Return a simple nested file hierarchy."""
return {
'sub': {
'b': 'file_b',
},
'a': 'file_a',
}


@pytest.fixture
def node_and_calc_info(aiida_localhost, aiida_local_code_factory):
"""Return a ``CalcJobNode`` and associated ``CalcInfo`` instance."""
node = CalcJobNode(computer=aiida_localhost)
node.store()

code = aiida_local_code_factory('core.arithmetic.add', '/bin/bash').store()
code_info = CodeInfo()
code_info.code_uuid = code.uuid

calc_info = CalcInfo()
calc_info.uuid = node.uuid
calc_info.codes_info = [code_info]

return node, calc_info


def test_hierarchy_utility(file_hierarchy, tmp_path):
"""Test that the ``create_file_hierarchy`` and ``serialize_file_hierarchy`` function as intended.

Expand All @@ -85,7 +115,7 @@ def test_hierarchy_utility(file_hierarchy, tmp_path):


# yapf: disable
@pytest.mark.usefixtures('aiida_profile_clean')
@pytest.mark.usefixtures('aiida_profile')
@pytest.mark.parametrize('retrieve_list, expected_hierarchy', (
# Single file or folder, either toplevel or nested
(['file_a.txt'], {'file_a.txt': 'file_a'}),
Expand Down Expand Up @@ -130,15 +160,49 @@ def test_retrieve_files_from_list(
assert serialize_file_hierarchy(target) == expected_hierarchy


@pytest.mark.usefixtures('aiida_profile_clean')
def test_upload_local_copy_list(fixture_sandbox, aiida_localhost, aiida_local_code_factory, file_hierarchy, tmp_path):
# yapf: disable
@pytest.mark.usefixtures('aiida_profile')
@pytest.mark.parametrize(('local_copy_list', 'expected_hierarchy'), (
([None, None], {'sub': {'b': 'file_b'}, 'a': 'file_a'}),
(['.', None], {'sub': {'b': 'file_b'}, 'a': 'file_a'}),
([None, '.'], {'sub': {'b': 'file_b'}, 'a': 'file_a'}),
(['.', '.'], {'sub': {'b': 'file_b'}, 'a': 'file_a'}),
([None, ''], {'sub': {'b': 'file_b'}, 'a': 'file_a'}),
(['sub', None], {'b': 'file_b'}),
([None, 'target'], {'target': {'sub': {'b': 'file_b'}, 'a': 'file_a'}}),
(['sub', 'target'], {'target': {'b': 'file_b'}}),
))
# yapf: enable
def test_upload_local_copy_list(
fixture_sandbox, node_and_calc_info, file_hierarchy_simple, tmp_path, local_copy_list, expected_hierarchy
):
"""Test the ``local_copy_list`` functionality in ``upload_calculation``."""
create_file_hierarchy(file_hierarchy_simple, tmp_path)
folder = FolderData()
folder.base.repository.put_object_from_tree(tmp_path)
folder.store()

node, calc_info = node_and_calc_info
calc_info.local_copy_list = [[folder.uuid] + local_copy_list]

with LocalTransport() as transport:
execmanager.upload_calculation(node, transport, calc_info, fixture_sandbox)

# Check that none of the files were written to the repository of the calculation node, since they were communicated
# through the ``local_copy_list``.
assert node.base.repository.list_object_names() == []

# Now check that all contents were successfully written to the sandbox
written_hierarchy = serialize_file_hierarchy(pathlib.Path(fixture_sandbox.abspath))
assert written_hierarchy == expected_hierarchy


@pytest.mark.usefixtures('aiida_profile')
def test_upload_local_copy_list_files_folders(fixture_sandbox, node_and_calc_info, file_hierarchy, tmp_path):
"""Test the ``local_copy_list`` functionality in ``upload_calculation``.

Specifically, verify that files in the ``local_copy_list`` do not end up in the repository of the node.
"""
from aiida.common.datastructures import CalcInfo, CodeInfo
from aiida.orm import CalcJobNode, FolderData, SinglefileData

create_file_hierarchy(file_hierarchy, tmp_path)
folder = FolderData()
folder.base.repository.put_object_from_tree(tmp_path)
Expand All @@ -149,16 +213,8 @@ def test_upload_local_copy_list(fixture_sandbox, aiida_localhost, aiida_local_co
'folder': folder.store(),
}

node = CalcJobNode(computer=aiida_localhost)
node.store()

code = aiida_local_code_factory('core.arithmetic.add', '/bin/bash').store()
code_info = CodeInfo()
code_info.code_uuid = code.uuid
node, calc_info = node_and_calc_info

calc_info = CalcInfo()
calc_info.uuid = node.uuid
calc_info.codes_info = [code_info]
calc_info.local_copy_list = [
(inputs['file_x'].uuid, inputs['file_x'].filename, './files/file_x'),
(inputs['file_y'].uuid, inputs['file_y'].filename, './files/file_y'),
Expand Down
48 changes: 36 additions & 12 deletions tests/repository/test_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,31 @@ def tmp_path_parametrized(request, tmp_path_factory) -> t.Union[str, pathlib.Pat
yield tmp_path


def serialize_file_hierarchy(dirpath: pathlib.Path):
"""Serialize the file hierarchy at ``dirpath``.

.. note:: empty directories are ignored.

:param dirpath: the base path.
:return: a mapping representing the file hierarchy, where keys are filenames. The leafs correspond to files and the
values are the text contents.
"""
import os
serialized = {}

for root, _, files in os.walk(dirpath):
for filepath in files:

relpath = pathlib.Path(root).relative_to(dirpath)
subdir = serialized
if relpath.parts:
for part in relpath.parts:
subdir = subdir.setdefault(part, {})
subdir[filepath] = (pathlib.Path(root) / filepath).read_bytes()

return serialized


def test_uuid(repository_uninitialised):
"""Test the ``uuid`` property."""
repository = repository_uninitialised
Expand Down Expand Up @@ -561,21 +586,20 @@ def test_walk(repository, generate_directory):
]


@pytest.mark.parametrize('path', ('.', 'relative'))
def test_copy_tree(repository, generate_directory, tmp_path_parametrized, path):
# yapf: disable
@pytest.mark.parametrize(('path', 'expected_hierarchy'), (
(None, {'file_a': b'a', 'relative': {'file_b': b'b', 'sub': {'file_c': b'c'}}}),
('.', {'file_a': b'a', 'relative': {'file_b': b'b', 'sub': {'file_c': b'c'}}}),
('relative', {'file_b': b'b', 'sub': {'file_c': b'c'}}),
('relative/sub', {'file_c': b'c'}),
Copy link
Member

@unkcpz unkcpz Sep 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused about this, I think the last test case should be like this below? But this is what are you going to fix?

('relative/sub', {'sub': {'file_c': b'c'}}),

I made a test in my command line:

╭─jyu at theospc38 in /tmp
╰─○ mkdir target_d
╭─jyu at theospc38 in /tmp
╰─○ mkdir -p relative/sub
╭─jyu at theospc38 in /tmp
╰─○ touch relative/b
╭─jyu at theospc38 in /tmp
╰─○ touch relative/sub/c
╭─jyu at theospc38 in /tmp
╰─○ cp -r relative/sub target_d
╭─jyu at theospc38 in /tmp
╰─○ ls target_d 
sub
╭─jyu at theospc38 in /tmp
╰─○ ls target_d/sub 
c

My cp version is cp (GNU coreutils) 8.30.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, the behavior of cp is different. Note that tat is not just for the last test case but also the penultimate. Essentially, the last directory in the path is kept. One would have to do cp -r relative/sub/* target_d to achieve what is currently suggested by the test.

I think the docstring of copytree is consistent with the implemented behavior. It is just that my commit message regarding the parallel with cp is incorrect. I am wondering if that should be the behavior though. I am actually mirroring the behavior of shutil.copytree. If you run shutil.copytree('relative/sub', 'target_d'), it would simply create target_d and copy the file c into it. Which is what Repository.copy_tree currently does.

Also, if we were to emulate the cp behavior, then I am not sure what someone would have to do if they didn't want to keep the sub directory. In cp you would a glob, but that is not supported in the Python API.

So in short, I think we should keep the current implementation but I correct the commit message saying that it mirrors shutil.copytree and remove the mention of cp. Would that work?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me. I'll continue to review along this line. Thanks for the explanation!

))
# yapf: enable
def test_copy_tree(repository, generate_directory, tmp_path_parametrized, path, expected_hierarchy):
"""Test the ``Repository.copy_tree`` method."""
directory = generate_directory({'file_a': None, 'relative': {'file_b': None, 'sub': {'file_c': None}}})
directory = generate_directory({'file_a': b'a', 'relative': {'file_b': b'b', 'sub': {'file_c': b'c'}}})
repository.put_object_from_tree(str(directory))

repository.copy_tree(tmp_path_parametrized, path=path)
for root, dirnames, filenames in repository.walk(path):
for dirname in dirnames:
assert pathlib.Path(tmp_path_parametrized / root / dirname).is_dir()
for filename in filenames:
filepath = pathlib.Path(tmp_path_parametrized / root / filename)
assert filepath.is_file()
with repository.open(root / filename) as handle:
assert filepath.read_bytes() == handle.read()
assert serialize_file_hierarchy(tmp_path_parametrized) == expected_hierarchy


@pytest.mark.parametrize(('argument', 'value', 'exception', 'match'), (
Expand Down