From 53bbc74ada86d76f78bfb0b5d7bf0e3b73046944 Mon Sep 17 00:00:00 2001
From: Sebastiaan Huber <mail@sphuber.net>
Date: Mon, 10 Feb 2020 18:10:31 +0100
Subject: [PATCH] Add `provenance_exclude_list` attribute to `CalcInfo` data
 structure (#3720)

This new attribute takes a flat list of relative filepaths, which
correspond to files in the `folder` sandbox passed to the
`prepare_for_submission` call of the `CalcJob`, that should not be
copied to the repository of the `CalcJobNode`. This functionality is
useful to avoid the content of input files, that should be copied to the
working directory of the calculation, to also be stored permanently in
the file repository. Example use cases are for very large input files or
files whose content is proprietary. Both use cases could already be
implemented using the `local_copy_list` but only in the case of files of
an input node in its entirety. The syntax of the `local_copy_list` does
not support the exclusion of arbitrary files that are written by the
calculation plugin to the sandbox folder.

Before the addition of this new feature, the contents of the sandbox
folder were added to the repository of the calculation node simply by
moving the contents of the sandbox entirely to the repository. This was
changed to an explicit loop over the contents and only copying those
files that do not appear in the `provenance_exclude_list` list.

The advantage of recursively looping over the contents of the sandbox
folder and *copying* the contents to the repository as long as it is not
part of `provenance_exclude_list`, over deleting those excluded files
from the sandbox before *moving* the remaining content to the
repository, is that in the former there is a better guarantee that the
excluded files do not accidentally end up in the repository due to an
unnoticed problem in the deletion from the sandbox.

The moving method is of course a lot more efficient than copying files
one by one. However, this moving approach is only possible as long as the
repository is still implemented on the same filesystem as the sandbox.
Once the new repository interface is fully implemented, where non file
system repositories are also possible, moving the sandbox folder to the
repository will no longer be possible anyway, so it is acceptable to
already make this change now, since it will have to be done at some
point anyway.
---
 aiida/common/datastructures.py       |  7 +++
 aiida/engine/daemon/execmanager.py   | 20 ++++++-
 docs/source/working/calculations.rst | 50 +++++++++++++++++
 tests/engine/test_calc_job.py        | 84 ++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/aiida/common/datastructures.py b/aiida/common/datastructures.py
index 282fc8cee0..61bd147e52 100644
--- a/aiida/common/datastructures.py
+++ b/aiida/common/datastructures.py
@@ -67,6 +67,12 @@ class CalcInfo(DefaultFieldsAttributeDict):
     * local_copy_list: a list of tuples with format ('node_uuid', 'filename', relativedestpath')
     * remote_copy_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
     * remote_symlink_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath')
+    * provenance_exclude_list: a sequence of relative paths of files in the sandbox folder of a `CalcJob` instance that
+        should not be stored permanently in the repository folder of the corresponding `CalcJobNode` that will be
+        created, but should only be copied to the remote working directory on the target computer. This is useful for
+        input files that should be copied to the working directory but should not also be copied to the repository,
+        for example, because they contain proprietary information or because they are big and their content is
+        already indirectly present in the repository through one of the data nodes passed as input to the calculation.
     * codes_info: a list of dictionaries used to pass the info of the execution of a code
     * codes_run_mode: a string used to specify the order in which multi codes can be executed
     """
@@ -91,6 +97,7 @@ class CalcInfo(DefaultFieldsAttributeDict):
         'local_copy_list',
         'remote_copy_list',
         'remote_symlink_list',
+        'provenance_exclude_list',
         'codes_info',
         'codes_run_mode'
     )
diff --git a/aiida/engine/daemon/execmanager.py b/aiida/engine/daemon/execmanager.py
index 740737d108..cd51517103 100644
--- a/aiida/engine/daemon/execmanager.py
+++ b/aiida/engine/daemon/execmanager.py
@@ -62,9 +62,6 @@ def upload_calculation(node, transport, calc_info, folder, inputs=None, dry_run=
         raise ValueError('Cannot submit calculation {} because it has cached input links! If you just want to test the '
                          'submission, set `metadata.dry_run` to True in the inputs.'.format(node.pk))
 
-    # After this call, no modifications to the folder should be done
-    node.put_object_from_tree(folder.abspath, force=True)
-
     # If we are performing a dry-run, the working directory should actually be a local folder that should already exist
     if dry_run:
         workdir = transport.getcwd()
@@ -244,6 +241,23 @@ def find_data_node(inputs, uuid):
                 raise IOError('It is not possible to create a symlink between two different machines for '
                               'calculation {}'.format(node.pk))
 
+    provenance_exclude_list = calc_info.provenance_exclude_list or []
+
+    # Loop recursively over content of the sandbox folder copying all that are not in `provenance_exclude_list`. Note
+    # that directories are not created explicitly. The `node.put_object_from_filelike` call will create intermediate
+    # directories for nested files automatically when needed. This means though that empty folders in the sandbox or
+    # folders that would be empty when considering the `provenance_exclude_list` will *not* be copied to the repo. The
+    # advantage of this explicit copying instead of deleting the files from `provenance_exclude_list` from the sandbox
+    # first before moving the entire remaining content to the node's repository, is that in this way we are guaranteed
+    # not to accidentally move files to the repository that should never end up there.
+    for root, dirnames, filenames in os.walk(folder.abspath):
+        for filename in filenames:
+            filepath = os.path.join(root, filename)
+            relpath = os.path.relpath(filepath, folder.abspath)
+            if relpath not in provenance_exclude_list:
+                with open(filepath, 'rb') as handle:
+                    node.put_object_from_filelike(handle, relpath, 'wb', force=True)
+
     if not dry_run:
         # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
         # chance of making this method idempotent. That is to say, if a runner gets interrupted during this action, it
diff --git a/docs/source/working/calculations.rst b/docs/source/working/calculations.rst
index 8fc5beb60b..4fae92b4ba 100644
--- a/docs/source/working/calculations.rst
+++ b/docs/source/working/calculations.rst
@@ -141,6 +141,15 @@ The implementation of the ``ArithmeticAddCalculation`` that we are considering i
 Before we go into the code line-by-line, let's describe the big picture of what is happening here.
 The goal of this method is to help the engine accomplish the three steps required for preparing the submission a calculation job, as described above.
 The raw input files that are required can be written to a sandbox folder that is passed in as the ``folder`` argument.
+
+.. note::
+
+    The ``folder`` argument points to a temporary sandbox folder on the local file system that can be used to write the input files to.
+    After the ``prepare_for_submission`` method returns, the engine will take those contents and copy them to the working directory where the calculation will be run.
+    On top of that, these files will also be written to the file repository of the node that represents the calculation as an additional measure of provenance.
+    Even though the information written there should be a derivation of the contents of the nodes that were passed as input nodes, since it is a derived form we store this explicitly nonetheless.
+    Sometimes, this behavior is undesirable, for example for efficiency or data privacy reasons, so it can be controlled with various lists such as :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` and :ref:`provenance_exclude_list <working_calcjobs_file_lists_provenance_exclude>`.
+
 All the other required information, such as the directives of which files to copy and what command line options to use are defined through the :py:class:`~aiida.common.datastructures.CalcInfo` datastructure, which should be returned from the method as the only value.
 In principle, this is what one **should do** in the ``prepare_for_submission`` method:
 
@@ -244,6 +253,47 @@ If instead, you need to transfer a specific file from a ``FolderData``, you can
 Note that the filenames in the relative source and target path need not be the same.
 This depends fully on how the files are stored in the node's repository and what files need to be written to the working directory.
 
+One might wonder what the purpose of the list is, when one could just as easily use the normal API to write the file to the ``folder`` sandbox folder.
+It is true that in this way the file will be copied to the working directory; however, it will then *also* be copied into the repository of the calculation node.
+Since in this case it is merely a direct one-to-one copy of the file that is already part of one of the input nodes (in an unaltered form), this duplication is unnecessary and adds useless weight to the file repository.
+Using the ``local_copy_list`` prevents this unnecessary duplication of file content.
+It can also be used if the content of a particular input node is privacy sensitive and cannot be duplicated in the repository.
+
+.. _working_calcjobs_file_lists_provenance_exclude:
+
+Provenance exclude list
+~~~~~~~~~~~~~~~~~~~~~~~
+The :ref:`local_copy_list <working_calcjobs_file_lists_local_copy>` allows one to instruct the engine to write files from the input nodes to the working directory, without them *also* being copied to the file repository of the calculation node.
+As discussed in the corresponding section, this is useful in order to avoid duplication or in cases where the data of the nodes is proprietary or privacy sensitive and cannot be duplicated arbitrarily everywhere in the file repository.
+However, the limitation of the ``local_copy_list`` is that it can only target single files in their entirety and cannot be used for arbitrary files that are written to the ``folder`` sandbox folder.
+To provide full control over what files from the ``folder`` are stored permanently in the calculation node file repository, the ``provenance_exclude_list`` is introduced.
+This :py:class:`~aiida.common.datastructures.CalcInfo` attribute is a list of filepaths, relative to the base path of the ``folder`` sandbox folder, which *are not stored* in the file repository.
+
+Consider the following file structure as written by an implementation of ``prepare_for_submission`` to the ``folder`` sandbox:
+
+.. code:: bash
+
+    ├─ sub
+    │  ├─ file_b.txt
+    │  └─ personal.dat
+    ├─ file_a.txt
+    └─ secret.key
+
+Clearly, we do not want the ``personal.dat`` and ``secret.key`` files to end up permanently in the file repository.
+This can be achieved by defining:
+
+.. code:: python
+
+    calc_info.provenance_exclude_list = ['sub/personal.dat', 'secret.key']
+
+With this specification, the final contents of the repository of the calculation node will contain:
+
+.. code:: bash
+
+    ├─ sub
+    │  └─ file_b.txt
+    └─ file_a.txt
+
 .. _working_calcjobs_file_lists_remote_copy:
 
 Remote copy list
diff --git a/tests/engine/test_calc_job.py b/tests/engine/test_calc_job.py
index 16b3ef58f9..62092e35c1 100644
--- a/tests/engine/test_calc_job.py
+++ b/tests/engine/test_calc_job.py
@@ -33,6 +33,43 @@ def raise_exception(exception):
     raise exception()
 
 
+class FileCalcJob(CalcJob):
+    """Example `CalcJob` implementation to test the `provenance_exclude_list` functionality.
+
+    The content of the input `files` will be copied to the `folder` sandbox, but also added to the attribute
+    `provenance_exclude_list` of the `CalcInfo` which should instruct the engine to copy the files to the remote work
+    directory but NOT to the repository of the `CalcJobNode`.
+    """
+
+    @classmethod
+    def define(cls, spec):
+        super().define(spec)
+        spec.input('settings', valid_type=orm.Dict)
+        spec.input_namespace('files', valid_type=orm.SinglefileData, dynamic=True)
+
+    def prepare_for_submission(self, folder):
+        from aiida.common.datastructures import CalcInfo, CodeInfo
+
+        for key, node in self.inputs.files.items():
+            filepath = key.replace('_', os.sep)
+            dirname = os.path.dirname(filepath)
+            basename = os.path.basename(filepath)
+            with node.open(mode='rb') as source:
+                if dirname:
+                    subfolder = folder.get_subfolder(dirname, create=True)
+                    subfolder.create_file_from_filelike(source, basename)
+                else:
+                    folder.create_file_from_filelike(source, filepath)
+
+        codeinfo = CodeInfo()
+        codeinfo.code_uuid = self.inputs.code.uuid
+
+        calcinfo = CalcInfo()
+        calcinfo.codes_info = [codeinfo]
+        calcinfo.provenance_exclude_list = self.inputs.settings.get_attribute('provenance_exclude_list')
+        return calcinfo
+
+
 class TestCalcJob(AiidaTestCase):
     """Test for the `CalcJob` process sub class."""
 
@@ -231,3 +268,50 @@ def test_run_local_code(self):
         # Since the repository will only contain files on the top-level due to `Code.set_files` we only check those
         for filename in self.local_code.list_object_names():
             self.assertTrue(filename in uploaded_files)
+
+    def test_provenance_exclude_list(self):
+        """Test the functionality of the `CalcInfo.provenance_exclude_list` attribute."""
+        import tempfile
+
+        code = orm.Code(input_plugin_name='arithmetic.add', remote_computer_exec=[self.computer, '/bin/true']).store()
+
+        with tempfile.NamedTemporaryFile('w+') as handle:
+            handle.write('dummy_content')
+            handle.flush()
+            file_one = orm.SinglefileData(file=handle.name)
+
+        with tempfile.NamedTemporaryFile('w+') as handle:
+            handle.write('dummy_content')
+            handle.flush()
+            file_two = orm.SinglefileData(file=handle.name)
+
+        inputs = {
+            'code': code,
+            'files': {
+                # Note the `FileCalcJob` will turn underscores in the key into path separators making a nested hierarchy
+                'base_a_sub_one': file_one,
+                'base_b_two': file_two,
+            },
+            'settings': orm.Dict(dict={'provenance_exclude_list': ['base/a/sub/one']}),
+            'metadata': {
+                'dry_run': True,
+                'options': {
+                    'resources': {
+                        'num_machines': 1,
+                        'num_mpiprocs_per_machine': 1
+                    }
+                }
+            }
+        }
+
+        # We perform a `dry_run` because the calculation cannot actually run, however, the contents will still be
+        # written to the node's repository so we can check it contains the expected contents.
+        _, node = launch.run_get_node(FileCalcJob, **inputs)
+
+        self.assertIn('folder', node.dry_run_info)
+
+        # Verify that the folder (representing the node's repository) indeed does not contain the excluded input file.
+        # Note that directories that would be empty after exclusion, such as `base/a`, are not created at all
+        self.assertIn('base', node.list_object_names())
+        self.assertEqual(sorted(['b']), sorted(node.list_object_names(os.path.join('base'))))
+        self.assertEqual(['two'], node.list_object_names(os.path.join('base', 'b')))