Add provenance_exclude_list attribute to CalcInfo data structure (#…

…3720) This new attribute takes a flat list of relative filepaths, which correspond to files in the `folder` sandbox passed to the `prepare_for_submission` call of the `CalcJob`, that should not be copied to the repository of the `CalcJobNode`. This functionality is useful to prevent the content of input files, that should be copied to the working directory of the calculation, from also being stored permanently in the file repository. Example use cases are for very large input files or files whose content is proprietary. Both use cases could already be implemented using the `local_copy_list` but only in the case of files of an input node in its entirety. The syntax of the `local_copy_list` does not support the exclusion of arbitrary files that are written by the calculation plugin to the sandbox folder. Before the addition of this new feature, the contents of the sandbox folder were added to the repository of the calculation node simply by moving the contents of the sandbox entirely to the repository. This was changed to an explicit loop over the contents and only copying those files that do not appear in the `provenance_exclude_list` list. The advantage of recursively looping over the contents of the sandbox folder and *copying* the contents to the repository as long as it is not part of `provenance_exclude_list`, over deleting those excluded files from the sandbox before *moving* the remaining content to the repository, is that in the former there is a better guarantee that the excluded files do not accidentally end up in the repository due to an unnoticed problem in the deletion from the sandbox. The moving method is of course a lot more efficient than copying files one by one. However, this moving approach is only possible as long as the repository is implemented on the same filesystem as the sandbox. 
Once the new repository interface is fully implemented, where non file system repositories are also possible, moving the sandbox folder to the repository will no longer be possible anyway, so it is acceptable to already make this change now, since it will have to be done at some point anyway.
aiidateam · Feb 10, 2020 · 53bbc74 · 53bbc74
1 parent 48573df
commit 53bbc74
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 3 deletions.
diff --git a/aiida/common/datastructures.py b/aiida/common/datastructures.py
@@ -67,6 +67,12 @@ class CalcInfo(DefaultFieldsAttributeDict):
     * local_copy_list: a list of tuples with format ('node_uuid', 'filename', relativedestpath')
     * remote_copy_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
     * remote_symlink_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
+    * provenance_exclude_list: a sequence of relative paths of files in the sandbox folder of a `CalcJob` instance that
+        should not be stored permanently in the repository folder of the corresponding `CalcJobNode` that will be
+        created, but should only be copied to the remote working directory on the target computer. This is useful for
+        input files that should be copied to the working directory but should not also be copied to the repository,
+        for example, because they contain proprietary information or because they are big and their content is
+        already indirectly present in the repository through one of the data nodes passed as input to the calculation.
     * codes_info: a list of dictionaries used to pass the info of the execution of a code
     * codes_run_mode: a string used to specify the order in which multi codes can be executed
     """
@@ -91,6 +97,7 @@ class CalcInfo(DefaultFieldsAttributeDict):
         'local_copy_list',
         'remote_copy_list',
         'remote_symlink_list',
+        'provenance_exclude_list',
         'codes_info',
         'codes_run_mode'
     )

diff --git a/aiida/engine/daemon/execmanager.py b/aiida/engine/daemon/execmanager.py
@@ -62,9 +62,6 @@ def upload_calculation(node, transport, calc_info, folder, inputs=None, dry_run=
         raise ValueError('Cannot submit calculation {} because it has cached input links! If you just want to test the '
                          'submission, set `metadata.dry_run` to True in the inputs.'.format(node.pk))
 
-    # After this call, no modifications to the folder should be done
-    node.put_object_from_tree(folder.abspath, force=True)
-
     # If we are performing a dry-run, the working directory should actually be a local folder that should already exist
     if dry_run:
         workdir = transport.getcwd()
@@ -244,6 +241,23 @@ def find_data_node(inputs, uuid):
                 raise IOError('It is not possible to create a symlink between two different machines for '
                               'calculation {}'.format(node.pk))
 
+    provenance_exclude_list = calc_info.provenance_exclude_list or []
+
+    # Loop recursively over the content of the sandbox folder, copying all files that are not in the
+    # `provenance_exclude_list`. Note that directories are not created explicitly. The `node.put_object_from_filelike`
+    # call will create intermediate directories for nested files automatically when needed. This means though that
+    # empty folders in the sandbox or folders that would be empty when considering the `provenance_exclude_list` will
+    # *not* be copied to the repo. The advantage of this explicit copying instead of deleting the files from
+    # `provenance_exclude_list` from the sandbox first before moving the entire remaining content to the node's
+    # repository, is that in this way we are guaranteed not to accidentally move files to the repository that should
+    # not end up there under any circumstances.
+    for root, dirnames, filenames in os.walk(folder.abspath):
+        for filename in filenames:
+            filepath = os.path.join(root, filename)
+            relpath = os.path.relpath(filepath, folder.abspath)
+            if relpath not in provenance_exclude_list:
+                with open(filepath, 'rb') as handle:
+                    node.put_object_from_filelike(handle, relpath, 'wb', force=True)
+
     if not dry_run:
         # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
         # chance of making this method idempotent. That is to say, if a runner gets interrupted during this action, it

diff --git a/docs/source/working/calculations.rst b/docs/source/working/calculations.rst
@@ -141,6 +141,15 @@ The implementation of the ``ArithmeticAddCalculation`` that we are considering i
 Before we go into the code line-by-line, let's describe the big picture of what is happening here.
 The goal of this method is to help the engine accomplish the three steps required for preparing the submission a calculation job, as described above.
 The raw input files that are required can be written to a sandbox folder that is passed in as the ``folder`` argument.
+
+.. note::
+
+    The ``folder`` argument points to a temporary sandbox folder on the local file system that can be used to write the input files to.
+    After the ``prepare_for_submission`` method returns, the engine will take those contents and copy them to the working directory where the calculation will be run.
+    On top of that, these files will also be written to the file repository of the node that represents the calculation as an additional measure of provenance.
+    Even though the information written there should be a derivation of the contents of the nodes that were passed as input nodes, since it is a derived form we store this explicitly nonetheless.
+    Sometimes, this behavior is undesirable, for example for efficiency or data privacy reasons, so it can be controlled with various lists such as :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` and :ref:`provenance_exclude_list <working_calcjobs_file_lists_provenance_exclude>`.
+
 All the other required information, such as the directives of which files to copy and what command line options to use are defined through the :py:class:`~aiida.common.datastructures.CalcInfo` datastructure, which should be returned from the method as the only value.
 In principle, this is what one **should do** in the ``prepare_for_submission`` method:
 
@@ -244,6 +253,47 @@ If instead, you need to transfer a specific file from a ``FolderData``, you can
 Note that the filenames in the relative source and target path need not be the same.
 This depends fully on how the files are stored in the node's repository and what files need to be written to the working directory.
 
+One might wonder what the purpose of the list is, when one could just as easily use the normal API to write the file to the ``folder`` sandbox folder.
+It is true that in this way the file will be copied to the working directory, however, then it will *also* be copied into the repository of the calculation node.
+Since in this case it is merely a direct one-to-one copy of the file that is already part of one of the input nodes (in an unaltered form), this duplication is unnecessary and adds useless weight to the file repository.
+Using the ``local_copy_list`` prevents this unnecessary duplication of file content.
+It can also be used if the content of a particular input node is privacy sensitive and cannot be duplicated in the repository.
+
+.. _working_calcjobs_file_lists_provenance_exclude:
+
+Provenance exclude list
+~~~~~~~~~~~~~~~~~~~~~~~
+The :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` allows one to instruct the engine to write files from the input files to the working directory, without them *also* being copied to the file repository of the calculation node.
+As discussed in the corresponding section, this is useful in order to avoid duplication or in cases where the data of the nodes is proprietary or privacy sensitive and cannot be duplicated arbitrarily everywhere in the file repository.
+However, the limitation of the ``local_copy_list`` is that it can only target single files in their entirety and cannot be used for arbitrary files that are written to the ``folder`` sandbox folder.
+To provide full control over what files from the ``folder`` are stored permanently in the calculation node file repository, the ``provenance_exclude_list`` is introduced.
+This :py:class:`~aiida.common.datastructures.CalcInfo` attribute is a list of filepaths, relative to the base path of the ``folder`` sandbox folder, which *are not stored* in the file repository.
+
+Consider the following file structure as written by an implementation of ``prepare_for_submission`` to the ``folder`` sandbox:
+
+.. code:: bash
+
+    ├─ sub
+    │  ├─ file_b.txt
+    │  └─ personal.dat
+    ├─ file_a.txt
+    └─ secret.key
+
+Clearly, we do not want the ``personal.dat`` and ``secret.key`` files to end up permanently in the file repository.
+This can be achieved by defining:
+
+.. code:: python
+
+    calc_info.provenance_exclude_list = ['sub/personal.dat', 'secret.key']
+
+With this specification, the final contents of the repository of the calculation node will contain:
+
+.. code:: bash
+
+    ├─ sub
+    │  └─ file_b.txt
+    └─ file_a.txt
+
 .. _working_calcjobs_file_lists_remote_copy:
 
 Remote copy list

diff --git a/tests/engine/test_calc_job.py b/tests/engine/test_calc_job.py
@@ -33,6 +33,43 @@ def raise_exception(exception):
     raise exception()
 
 
+class FileCalcJob(CalcJob):
+    """Example `CalcJob` implementation to test the `provenance_exclude_list` functionality.
+
+    The content of the input `files` will be copied to the `folder` sandbox. The value stored under the
+    `provenance_exclude_list` attribute of the `settings` input is set as the `provenance_exclude_list` of the
+    `CalcInfo`, which should instruct the engine to copy the listed files to the remote work directory but NOT to the
+    repository of the `CalcJobNode`.
+    """
+
+    @classmethod
+    def define(cls, spec):
+        super().define(spec)
+        spec.input('settings', valid_type=orm.Dict)
+        spec.input_namespace('files', valid_type=orm.SinglefileData, dynamic=True)
+
+    def prepare_for_submission(self, folder):
+        from aiida.common.datastructures import CalcInfo, CodeInfo
+
+        for key, node in self.inputs.files.items():
+            filepath = key.replace('_', os.sep)
+            dirname = os.path.dirname(filepath)
+            basename = os.path.basename(filepath)
+            with node.open(mode='rb') as source:
+                if dirname:
+                    subfolder = folder.get_subfolder(dirname, create=True)
+                    subfolder.create_file_from_filelike(source, basename)
+                else:
+                    folder.create_file_from_filelike(source, filepath)
+
+        codeinfo = CodeInfo()
+        codeinfo.code_uuid = self.inputs.code.uuid
+
+        calcinfo = CalcInfo()
+        calcinfo.codes_info = [codeinfo]
+        calcinfo.provenance_exclude_list = self.inputs.settings.get_attribute('provenance_exclude_list')
+        return calcinfo
+
+
 class TestCalcJob(AiidaTestCase):
     """Test for the `CalcJob` process sub class."""
 
@@ -231,3 +268,50 @@ def test_run_local_code(self):
         # Since the repository will only contain files on the top-level due to `Code.set_files` we only check those
         for filename in self.local_code.list_object_names():
             self.assertTrue(filename in uploaded_files)
+
+    def test_provenance_exclude_list(self):
+        """Test the functionality of the `CalcInfo.provenance_exclude_list` attribute."""
+        import tempfile
+
+        code = orm.Code(input_plugin_name='arithmetic.add', remote_computer_exec=[self.computer, '/bin/true']).store()
+
+        with tempfile.NamedTemporaryFile('w+') as handle:
+            handle.write('dummy_content')
+            handle.flush()
+            file_one = orm.SinglefileData(file=handle.name)
+
+        with tempfile.NamedTemporaryFile('w+') as handle:
+            handle.write('dummy_content')
+            handle.flush()
+            file_two = orm.SinglefileData(file=handle.name)
+
+        inputs = {
+            'code': code,
+            'files': {
+                # Note the `FileCalcJob` will turn underscores in the key into path separators making a nested hierarchy
+                'base_a_sub_one': file_one,
+                'base_b_two': file_two,
+            },
+            'settings': orm.Dict(dict={'provenance_exclude_list': ['base/a/sub/one']}),
+            'metadata': {
+                'dry_run': True,
+                'options': {
+                    'resources': {
+                        'num_machines': 1,
+                        'num_mpiprocs_per_machine': 1
+                    }
+                }
+            }
+        }
+
+        # We perform a `dry_run` because the calculation cannot actually run, however, the contents will still be
+        # written to the node's repository so we can check it contains the expected contents.
+        _, node = launch.run_get_node(FileCalcJob, **inputs)
+
+        self.assertIn('folder', node.dry_run_info)
+
+        # Verify that the folder (representing the node's repository) indeed does not contain the excluded file. Note
+        # that directories left empty by the exclusion (here `base/a`) are not present in the repository either.
+        self.assertIn('base', node.list_object_names())
+        self.assertEqual(sorted(['b']), sorted(node.list_object_names(os.path.join('base'))))
+        self.assertEqual(['two'], node.list_object_names(os.path.join('base', 'b')))