Skip to content

Commit

Permalink
Add provenance_exclude_list attribute to CalcInfo data structure (#…
Browse files Browse the repository at this point in the history
…3720)

This new attribute takes a flat list of relative filepaths, which
correspond to files in the `folder` sandbox passed to the
`prepare_for_submission` call of the `CalcJob`, that should not be
copied to the repository of the `CalcJobNode`. This functionality is
useful to avoid the content of input files, that should be copied to the
working directory of the calculation, to also be stored permanently in
the file repository. Example use cases are for very large input files or
files whose content is proprietary. Both use cases could already be
implemented using the `local_copy_list` but only in the case of files of
an input node in its entirety. The syntax of the `local_copy_list` does
not support the exclusion of arbitrary files that are written by the
calculation plugin to the sandbox folder.

Before the addition of this new feature, the contents of the sandbox
folder were added to the repository of the calculation node simply by
moving the contents of the sandbox entirely to the repository. This was
changed to an explicit loop over the contents and only copying those
files that do not appear in the `provenance_exclude_list` list.

The advantage of recursively looping over the contents of the sandbox
folder and *copying* the contents to the repository as long as it is not
part of `provenance_exclude_list`, over deleting those excluded files
from the sandbox before *moving* the remaining content to the
repository, is that in the former there is a better guarantee that the
excluded files do not accidentally end up in the repository due to an
unnoticed problem in the deletion from the sandbox.

The moving method is of course a lot more efficient than copying files
one by one. However, this moving approach is only possible while the
repository is still implemented on the same filesystem as the sandbox.
Once the new repository interface is fully implemented, where non file
system repositories are also possible, moving the sandbox folder to the
repository will no longer be possible anyway, so it is acceptable to
already make this change now, since it will have to be done at some
point anyway.
  • Loading branch information
sphuber authored Feb 10, 2020
1 parent 48573df commit 53bbc74
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 3 deletions.
7 changes: 7 additions & 0 deletions aiida/common/datastructures.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ class CalcInfo(DefaultFieldsAttributeDict):
* local_copy_list: a list of tuples with format ('node_uuid', 'filename', relativedestpath')
* remote_copy_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
* remote_symlink_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
* provenance_exclude_list: a sequence of relative paths of files in the sandbox folder of a `CalcJob` instance that
should not be stored permanently in the repository folder of the corresponding `CalcJobNode` that will be
created, but should only be copied to the remote working directory on the target computer. This is useful for
input files that should be copied to the working directory but should not also be copied to the repository,
for example, because they contain proprietary information or because they are big and their content is
already indirectly present in the repository through one of the data nodes passed as input to the calculation.
* codes_info: a list of dictionaries used to pass the info of the execution of a code
* codes_run_mode: a string used to specify the order in which multi codes can be executed
"""
Expand All @@ -91,6 +97,7 @@ class CalcInfo(DefaultFieldsAttributeDict):
'local_copy_list',
'remote_copy_list',
'remote_symlink_list',
'provenance_exclude_list',
'codes_info',
'codes_run_mode'
)
Expand Down
20 changes: 17 additions & 3 deletions aiida/engine/daemon/execmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ def upload_calculation(node, transport, calc_info, folder, inputs=None, dry_run=
raise ValueError('Cannot submit calculation {} because it has cached input links! If you just want to test the '
'submission, set `metadata.dry_run` to True in the inputs.'.format(node.pk))

# After this call, no modifications to the folder should be done
node.put_object_from_tree(folder.abspath, force=True)

# If we are performing a dry-run, the working directory should actually be a local folder that should already exist
if dry_run:
workdir = transport.getcwd()
Expand Down Expand Up @@ -244,6 +241,23 @@ def find_data_node(inputs, uuid):
raise IOError('It is not possible to create a symlink between two different machines for '
'calculation {}'.format(node.pk))

provenance_exclude_list = calc_info.provenance_exclude_list or []

# Loop recursively over content of the sandbox folder copying all that are not in `provenance_exclude_list`. Note
# that directories are not created explicitly. The `node.put_object_from_filelike` call will create intermediate
# directories for nested files automatically when needed. This means though that empty folders in the sandbox or
# folders that would be empty when considering the `provenance_exclude_list` will *not* be copied to the repo. The
# advantage of this explicit copying instead of deleting the files from `provenance_exclude_list` from the sandbox
# first before moving the entire remaining content to the node's repository, is that in this way we are guaranteed
# not to accidentally move files to the repository that should not go there at all cost.
for root, dirnames, filenames in os.walk(folder.abspath):
for filename in filenames:
filepath = os.path.join(root, filename)
relpath = os.path.relpath(filepath, folder.abspath)
if relpath not in provenance_exclude_list:
with open(filepath, 'rb') as handle:
node.put_object_from_filelike(handle, relpath, 'wb', force=True)

if not dry_run:
# Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
# chance of making this method idempotent. That is to say, if a runner gets interrupted during this action, it
Expand Down
50 changes: 50 additions & 0 deletions docs/source/working/calculations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,15 @@ The implementation of the ``ArithmeticAddCalculation`` that we are considering i
Before we go into the code line-by-line, let's describe the big picture of what is happening here.
The goal of this method is to help the engine accomplish the three steps required for preparing the submission of a calculation job, as described above.
The raw input files that are required can be written to a sandbox folder that is passed in as the ``folder`` argument.
.. note::
The ``folder`` argument points to a temporary sandbox folder on the local file system that can be used to write the input files to.
After the ``prepare_for_submission`` method returns, the engine will take those contents and copy them to the working directory where the calculation will be run.
On top of that, these files will also be written to the file repository of the node that represents the calculation as an additional measure of provenance.
Even though the information written there should be a derivation of the contents of the nodes that were passed as input nodes, since it is a derived form we store this explicitly nonetheless.
Sometimes, this behavior is undesirable, for example for efficiency or data privacy reasons, so it can be controlled with various lists such as :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` and :ref:`provenance_exclude_list <working_calcjobs_file_lists_provenance_exclude>`.
All the other required information, such as the directives of which files to copy and what command line options to use are defined through the :py:class:`~aiida.common.datastructures.CalcInfo` datastructure, which should be returned from the method as the only value.
In principle, this is what one **should do** in the ``prepare_for_submission`` method:
Expand Down Expand Up @@ -244,6 +253,47 @@ If instead, you need to transfer a specific file from a ``FolderData``, you can
Note that the filenames in the relative source and target path need not be the same.
This depends fully on how the files are stored in the node's repository and what files need to be written to the working directory.
One might wonder what the purpose of the list is, when one could just as easily use the normal API to write the file to the ``folder`` sandbox folder.
It is true, that in this way the file will be copied to the working directory, however, then it will *also* be copied into the repository of the calculation node.
Since in this case it is merely a direct one-to-one copy of the file that is already part of one of the input nodes (in an unaltered form), this duplication is unnecessary and adds useless weight to the file repository.
Using the ``local_copy_list`` prevents this unnecessary duplication of file content.
It can also be used if the content of a particular input node is privacy sensitive and cannot be duplicated in the repository.
.. _working_calcjobs_file_lists_provenance_exclude:
Provenance exclude list
~~~~~~~~~~~~~~~~~~~~~~~
The :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` allows one to instruct the engine to write files from the input files to the working directory, without them *also* being copied to the file repository of the calculation node.
As discussed in the corresponding section, this is useful in order to avoid duplication or in cases where the data of the nodes is proprietary or privacy sensitive and cannot be duplicated arbitrarily everywhere in the file repository.
However, the limitation of the ``local_copy_list`` is that it can only target single files in their entirety and cannot be used for arbitrary files that are written to the ``folder`` sandbox folder.
To provide full control over what files from the ``folder`` are stored permanently in the calculation node file repository, the ``provenance_exclude_list`` is introduced.
This :py:class:`~aiida.common.datastructures.CalcInfo` attribute is a list of filepaths, relative to the base path of the ``folder`` sandbox folder, which *are not stored* in the file repository.
Consider the following file structure as written by an implementation of ``prepare_for_submission`` to the ``folder`` sandbox:
.. code:: bash
├─ sub
│ ├─ file_b.txt
│ └─ personal.dat
├─ file_a.txt
└─ secret.key
Clearly, we do not want the ``personal.dat`` and ``secret.key`` files to end up permanently in the file repository.
This can be achieved by defining:
.. code:: python
calc_info.provenance_exclude_list = ['sub/personal.dat', 'secret.key']
With this specification, the final contents of the repository of the calculation node will contain:
.. code:: bash
├─ sub
│ └─ file_b.txt
└─ file_a.txt
.. _working_calcjobs_file_lists_remote_copy:
Remote copy list
Expand Down
84 changes: 84 additions & 0 deletions tests/engine/test_calc_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,43 @@ def raise_exception(exception):
raise exception()


class FileCalcJob(CalcJob):
    """`CalcJob` used to exercise the `provenance_exclude_list` attribute of `CalcInfo`.

    Every node in the `files` input namespace is written to the `folder` sandbox, and the value stored under the
    `provenance_exclude_list` attribute of the `settings` input is forwarded verbatim to the returned `CalcInfo`.
    The engine is then expected to copy the listed files to the remote working directory without storing them in
    the repository of the `CalcJobNode`.
    """

    @classmethod
    def define(cls, spec):
        """Extend the base specification with the `settings` input and the dynamic `files` input namespace."""
        super().define(spec)
        spec.input('settings', valid_type=orm.Dict)
        spec.input_namespace('files', valid_type=orm.SinglefileData, dynamic=True)

    def prepare_for_submission(self, folder):
        """Write the input files to the sandbox and return the `CalcInfo` carrying the exclude list.

        :param folder: sandbox folder to which the content of each node in `self.inputs.files` is written
        :return: `CalcInfo` with a single `CodeInfo` and the `provenance_exclude_list` taken from the settings
        """
        from aiida.common.datastructures import CalcInfo, CodeInfo

        for key, node in self.inputs.files.items():
            # Underscores in the namespace key act as path separators, yielding a nested path in the sandbox
            relative_path = key.replace('_', os.sep)
            parent, name = os.path.split(relative_path)
            with node.open(mode='rb') as source:
                # Files without a parent directory go directly into the sandbox root
                target = folder.get_subfolder(parent, create=True) if parent else folder
                target.create_file_from_filelike(source, name)

        code_info = CodeInfo()
        code_info.code_uuid = self.inputs.code.uuid

        calc_info = CalcInfo()
        calc_info.codes_info = [code_info]
        calc_info.provenance_exclude_list = self.inputs.settings.get_attribute('provenance_exclude_list')
        return calc_info


class TestCalcJob(AiidaTestCase):
"""Test for the `CalcJob` process sub class."""

Expand Down Expand Up @@ -231,3 +268,50 @@ def test_run_local_code(self):
# Since the repository will only contain files on the top-level due to `Code.set_files` we only check those
for filename in self.local_code.list_object_names():
self.assertTrue(filename in uploaded_files)

def test_provenance_exclude_list(self):
    """Test the functionality of the `CalcInfo.provenance_exclude_list` attribute.

    Two files are written to nested locations of the sandbox by `FileCalcJob`, one of which is listed in the
    `provenance_exclude_list`. After a dry run, the repository of the node should contain only the file that was
    not excluded.
    """
    import tempfile

    code = orm.Code(input_plugin_name='arithmetic.add', remote_computer_exec=[self.computer, '/bin/true']).store()

    with tempfile.NamedTemporaryFile('w+') as handle:
        handle.write('dummy_content')
        handle.flush()
        file_one = orm.SinglefileData(file=handle.name)

    with tempfile.NamedTemporaryFile('w+') as handle:
        handle.write('dummy_content')
        handle.flush()
        file_two = orm.SinglefileData(file=handle.name)

    # `FileCalcJob` builds the sandbox paths with `os.sep`, so the excluded path is built with `os.path.join`
    # instead of hard-coding forward slashes, which keeps both sides consistent on every platform.
    excluded_path = os.path.join('base', 'a', 'sub', 'one')

    inputs = {
        'code': code,
        'files': {
            # Note the `FileCalcJob` will turn underscores in the key into path separators making a nested hierarchy
            'base_a_sub_one': file_one,
            'base_b_two': file_two,
        },
        'settings': orm.Dict(dict={'provenance_exclude_list': [excluded_path]}),
        'metadata': {
            'dry_run': True,
            'options': {
                'resources': {
                    'num_machines': 1,
                    'num_mpiprocs_per_machine': 1
                }
            }
        }
    }

    # We perform a `dry_run` because the calculation cannot actually run, however, the contents will still be
    # written to the node's repository so we can check it contains the expected contents.
    _, node = launch.run_get_node(FileCalcJob, **inputs)

    self.assertIn('folder', node.dry_run_info)

    # Verify that the node's repository does not contain the excluded input file. Note that the directories that
    # only held excluded files are expected to be absent as well: `base` should contain only `b`, not `a`.
    self.assertIn('base', node.list_object_names())
    self.assertEqual(['b'], sorted(node.list_object_names('base')))
    self.assertEqual(['two'], node.list_object_names(os.path.join('base', 'b')))

0 comments on commit 53bbc74

Please sign in to comment.