Uploading spreadsheet for updates #15

Closed · 27 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -109,6 +109,7 @@ venv.bak/
# directory for anything specific only to environment
_local/
venv*/
.venv*/

test/*/*.xlsx

13 changes: 12 additions & 1 deletion .gitlab-ci.yml
@@ -1,2 +1,13 @@
include:
remote: 'https://raw.githubusercontent.com/ebi-ait/gitlab-ci-templates/master/build-release-deploy.yml'
remote: 'https://raw.githubusercontent.com/ebi-ait/gitlab-ci-templates/master/build-release-deploy.yml'

Unit Test:
image: quay.io/ebi-ait/ingest-base-images:python_3.6-slim
variables:
INGEST_API : https://api.ingest.dev.archive.data.humancellatlas.org/
script:
- apt-get update
- apt-get install -y git
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- nosetests
34 changes: 24 additions & 10 deletions broker/service/spreadsheet_storage/spreadsheet_storage_service.py
@@ -9,7 +9,7 @@ def __init__(self, storage_dir, storage_manifest_name="storage_manifest.json"):
self.storage_dir = storage_dir
self.storage_manifest_name = storage_manifest_name

def store(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
def store_submission_spreadsheet(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
"""
Stores a given spreadsheet at path <submission_uuid>/<spreadsheetname>, local to
the storage directory
@@ -18,20 +18,31 @@ def store(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
:param spreadsheet_blob:
:return:
"""
submission_dir = f'{self.storage_dir}/{submission_uuid}'
submission_dir = self.get_submission_dir(submission_uuid)
try:
os.mkdir(submission_dir)
submission_spreadsheet_path = f'{submission_dir}/{spreadsheet_name}'
storage_manifest_path = f'{submission_dir}/{self.storage_manifest_name}'
with open(submission_spreadsheet_path, "wb") as spreadsheet_file:
spreadsheet_file.write(spreadsheet_blob)
with open(storage_manifest_path, "w") as storage_manfiest:
json.dump({"name": spreadsheet_name, "location": submission_spreadsheet_path}, storage_manfiest)
return submission_spreadsheet_path
submission_spreadsheet_path = self.store_binary_file(submission_dir, spreadsheet_name, spreadsheet_blob)
manifest_content = {"name": spreadsheet_name, "location": submission_spreadsheet_path}
self.store_json_file(submission_dir, self.storage_manifest_name, manifest_content)
return submission_spreadsheet_path
except FileExistsError:
raise SubmissionSpreadsheetAlreadyExists()

def retrieve(self, submission_uuid):
def store_binary_file(self, directory: str, filename: str, blob):
os.makedirs(directory, exist_ok=True)
path = f'{directory}/{filename}'
with open(path, "wb") as file:
file.write(blob)
return path

def store_json_file(self, directory: str, filename: str, data: dict):
os.makedirs(directory, exist_ok=True)
path = f'{directory}/{filename}'
with open(path, "w") as json_file:
json.dump(data, json_file)
return path

def retrieve_submission_spreadsheet(self, submission_uuid):
try:
spreadsheet_location = self.get_spreadsheet_location(submission_uuid)
spreadsheet_name = spreadsheet_location["name"]
@@ -59,3 +70,6 @@ def get_spreadsheet_location(self, submission_uuid):
raise SubmissionSpreadsheetDoesntExist()
else:
return {"name": spreadsheet_name, "path": spreadsheet_path}

def get_submission_dir(self, submission_uuid):
return f'{self.storage_dir}/{submission_uuid}'
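A minimal usage sketch of the renamed storage API, assuming a local storage directory and illustrative uuid/file values that are not taken from the PR:

from broker.service.spreadsheet_storage.spreadsheet_storage_service import SpreadsheetStorageService

storage = SpreadsheetStorageService("/data/spreadsheets")

# Store the submitted spreadsheet under <storage_dir>/<submission_uuid>/ and write its manifest.
with open("dataset.xlsx", "rb") as f:
    path = storage.store_submission_spreadsheet("1234-abcd", "dataset.xlsx", f.read())

# Retrieve it later by submission uuid; the result carries the spreadsheet name and raw bytes.
spreadsheet = storage.retrieve_submission_spreadsheet("1234-abcd")
print(spreadsheet["name"], len(spreadsheet["blob"]))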
57 changes: 43 additions & 14 deletions broker/service/spreadsheet_upload_service.py
@@ -1,5 +1,7 @@
import logging
import os
import threading
import time

from ingest.api.ingestapi import IngestApi
from ingest.importer.importer import XlsImporter
@@ -16,28 +18,55 @@ def __init__(self, ingest_api: IngestApi, storage_service: SpreadsheetStorageSer
self.storage_service = storage_service
self.importer = importer

def async_upload(self, token, request_file, is_update, project_uuid=None):
if token is None:
raise SpreadsheetUploadError(401, "An authentication token must be supplied when uploading a spreadsheet")

self.ingest_api.set_token(token)
submission_resource = self.ingest_api.create_submission(update_submission=is_update)
def async_upload(self, token, request_file, is_update, project_uuid=None, submission_uuid=None):
self._validate_token_exists(token)
submission_resource = self._create_or_get_submission(submission_uuid)

submission_uuid = submission_resource["uuid"]["uuid"]
submission_url = submission_resource["_links"]["self"]["href"]
filename = secure_filename(request_file.filename)

submission_uuid = submission_resource["uuid"]["uuid"]
path = self.storage_service.store(submission_uuid, filename, request_file.read())
if is_update:
path = self._store_spreadsheet_updates(filename, request_file, submission_uuid)
thread = threading.Thread(target=self.upload_updates, args=(submission_url, path))
else:
path = self.storage_service.store_submission_spreadsheet(submission_uuid, filename, request_file.read())
thread = threading.Thread(target=self.upload, args=(submission_url, path, project_uuid))

thread = threading.Thread(target=self._upload,
args=(submission_resource, path, is_update, project_uuid))
thread.start()

return submission_resource

def _upload(self, submission_resource, path, is_update, project_uuid=None):
def _store_spreadsheet_updates(self, filename, request_file, submission_uuid):
submission_directory = self.storage_service.get_submission_dir(submission_uuid)
timestamp = time.strftime("%Y%m%d-%H%M%S")
filename_with_timestamp = f'{timestamp}_{filename}'
# TODO This spreadsheet containing the updates is not downloadable anywhere yet
path = self.storage_service.store_binary_file(submission_directory, filename_with_timestamp,
request_file.read())
return path

def _create_or_get_submission(self, submission_uuid):
if submission_uuid:
submission_resource = self.ingest_api.get_submission_by_uuid(submission_uuid)
else:
submission_resource = self.ingest_api.create_submission()
return submission_resource

def _validate_token_exists(self, token):
if token is None:
raise SpreadsheetUploadError(401, "An authentication token must be supplied when uploading a spreadsheet")
self.ingest_api.set_token(token)

def upload(self, submission_url, path, project_uuid=None):
Review comment (Contributor): Is upload for a full project rather than just updates?

Reply (Contributor Author): The upload function is for uploading spreadsheets with new entities. Sometimes the project is created before submission, so we supply it if it already exists.

_LOGGER.info('Spreadsheet started!')
submission, template_manager = self.importer.import_file(path, submission_url, project_uuid=project_uuid)
self.importer.update_spreadsheet_with_uuids(submission, template_manager, path)
_LOGGER.info('Spreadsheet upload done!')

def upload_updates(self, submission_url, path):
_LOGGER.info('Spreadsheet started!')
submission_url = submission_resource["_links"]["self"]["href"].rsplit("{")[0]
submission, template_manager = self.importer.import_file(path, submission_url, is_update, project_uuid)
self.importer.create_update_spreadsheet(submission, template_manager, path)
self.importer.import_file(path, submission_url, is_update=True)
_LOGGER.info('Spreadsheet upload done!')
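Following the reviewer's question above: upload handles spreadsheets that introduce new entities (optionally attached to a project that already exists), while upload_updates re-imports a spreadsheet of updated entities into an existing submission. A hedged sketch of how a caller reaches each path through async_upload, using the service instance configured in broker_app.py and illustrative uuids:

# New entities: a submission is created; a pre-existing project uuid may be passed through.
spreadsheet_upload_svc.async_upload(token, request_file, is_update=False, project_uuid="existing-project-uuid")

# Updates: the existing submission is looked up by uuid and the update spreadsheet is imported into it.
spreadsheet_upload_svc.async_upload(token, request_file, is_update=True, submission_uuid="existing-submission-uuid")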


4 changes: 2 additions & 2 deletions broker/service/summary_service.py
@@ -102,7 +102,7 @@ def add_specific_submission_information(self, submission_summary, submission_ent
return submission_summary

def get_entities_in_submission(self, submission_uri, entity_type) -> Generator[dict, None, None]:
yield from self.ingestapi.getEntities(submission_uri, entity_type, 1000)
yield from self.ingestapi.get_entities(submission_uri, entity_type)

def get_all_entities_in_submission(self, submission_uri):
return SubmissionEntities(list(self.get_entities_in_submission(submission_uri, 'biomaterials')),
@@ -112,7 +112,7 @@ def get_all_entities_in_submission(self, submission_uri):
list(self.get_entities_in_submission(submission_uri, 'files')))

def get_submissions_in_project(self, project_resource) -> Generator[dict, None, None]:
yield from self.ingestapi.getRelatedEntities('submissionEnvelopes', project_resource, 'submissionEnvelopes')
yield from self.ingestapi.get_related_entities('submissionEnvelopes', project_resource, 'submissionEnvelopes')

@staticmethod
def generate_summary_for_entity(entities) -> EntitySummary:
36 changes: 24 additions & 12 deletions broker_app.py
@@ -1,16 +1,18 @@
#!/usr/bin/env python

import io
import json
import jsonpickle
import logging
import os
import sys
import traceback
from http import HTTPStatus

import jsonpickle
from flask import Flask, request, redirect, send_file, jsonify
from flask import json
from flask import Flask, request, redirect, send_file, g
from flask_cors import CORS, cross_origin

from http import HTTPStatus

from ingest.api.ingestapi import IngestApi
from ingest.importer.importer import XlsImporter

@@ -46,9 +48,17 @@

SPREADSHEET_UPLOAD_MESSAGE_ERROR = "We experienced a problem while uploading your spreadsheet"

ingest_api = IngestApi()
spreadsheet_generator = SpreadsheetGenerator(ingest_api)
spreadsheet_job_manager = SpreadsheetJobManager(spreadsheet_generator, SPREADSHEET_STORAGE_DIR)

def setup():
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

global ingest_api
global spreadsheet_generator
global spreadsheet_job_manager

ingest_api = IngestApi()
spreadsheet_generator = SpreadsheetGenerator(ingest_api)
spreadsheet_job_manager = SpreadsheetJobManager(spreadsheet_generator, SPREADSHEET_STORAGE_DIR)


@app.route('/', methods=['GET'])
@@ -78,7 +88,7 @@ def upload_update_spreadsheet():
@app.route('/submissions/<submission_uuid>/spreadsheet', methods=['GET'])
def get_submission_spreadsheet(submission_uuid):
try:
spreadsheet = SpreadsheetStorageService(SPREADSHEET_STORAGE_DIR).retrieve(submission_uuid)
spreadsheet = SpreadsheetStorageService(SPREADSHEET_STORAGE_DIR).retrieve_submission_spreadsheet(submission_uuid)
spreadsheet_name = spreadsheet["name"]
spreadsheet_blob = spreadsheet["blob"]

@@ -122,6 +132,8 @@ def submission_summary(submission_uuid):
@cross_origin()
@app.route('/spreadsheets', methods=['POST'])
def create_spreadsheet():
spreadsheet_job_manager = SpreadsheetJobManager(g.spreadsheet_generator, SPREADSHEET_STORAGE_DIR)

request_json = json.loads(request.data)
filename = request_json["filename"]
spreadsheet_spec = SpreadsheetSpec.from_dict(request_json["spec"])
@@ -239,10 +251,11 @@ def _upload_spreadsheet(is_update=False):
token = request.headers.get('Authorization')
request_file = request.files['file']
project_uuid = request.form.get('projectUuid')
submission_uuid = request.form.get('submissionUuid')

try:
logger.info('Uploading spreadsheet!')
submission_resource = spreadsheet_upload_svc.async_upload(token, request_file, is_update, project_uuid)
submission_resource = spreadsheet_upload_svc.async_upload(token, request_file, is_update, project_uuid, submission_uuid)
Review comment (Contributor): The is_update param is just passed through here, and there is subsequent branching logic further down the execution sequence. This smells of unclean design. Maybe there would be room here for the "strategy" pattern.

Reply (Contributor Author, @aaclan-ebi, Jul 9, 2021): I'm afraid I don't have time to implement that for this sprint.

logger.info(f'Created Submission: {submission_resource["_links"]["self"]["href"]}')
except SpreadsheetUploadError as error:
return response_json(error.http_code, {
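The strategy pattern suggested in the thread above is not implemented in this PR; a minimal sketch of what it might look like, reusing the existing upload, upload_updates, and storage calls on the upload service (class names and wiring below are hypothetical):

import threading

class NewSubmissionStrategy:
    # Hypothetical strategy for spreadsheets that introduce new entities.
    def store(self, svc, submission_uuid, filename, request_file):
        return svc.storage_service.store_submission_spreadsheet(submission_uuid, filename, request_file.read())

    def start(self, svc, submission_url, path, project_uuid):
        threading.Thread(target=svc.upload, args=(submission_url, path, project_uuid)).start()

class UpdateSubmissionStrategy:
    # Hypothetical strategy for spreadsheets that carry updates to an existing submission.
    def store(self, svc, submission_uuid, filename, request_file):
        return svc._store_spreadsheet_updates(filename, request_file, submission_uuid)

    def start(self, svc, submission_url, path, project_uuid):
        threading.Thread(target=svc.upload_updates, args=(submission_url, path)).start()

# async_upload would then select a strategy instead of branching on is_update:
# strategy = UpdateSubmissionStrategy() if is_update else NewSubmissionStrategy()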
@@ -287,6 +300,5 @@ def response_json(status_code, data):


if __name__ == '__main__':
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

app.run(host='0.0.0.0', port=5000)
setup()
app.run(host='0.0.0.0', port=5000)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -8,7 +8,7 @@ itsdangerous==0.24
jdcal==1.4
MarkupSafe==1.1
werkzeug>=0.15.3
-e git+https://github.com/ebi-ait/ingest-client.git@7f04ab9a#egg=hca_ingest
-e git+https://github.com/ebi-ait/ingest-client.git@7038dcb#egg=hca_ingest
Review comment (Contributor): Why are we not using a normal version number? (x.y.z)

Reply (Contributor Author): We could.

Reply (Contributor Author): This was done this way in the early development of the ingest-client Python lib; it hasn't been changed yet. We should also set up a release build for the ingest-client repo so that it releases the module to PyPI on every merge to master.

Reply (Contributor Author): I filed a ticket for the ingest-client CI/CD setup: ebi-ait/dcp-ingest-central#385

jsonpickle~=1.2
expiringdict==1.1.4
jsonpath-rw
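As the thread above notes, the ingest-client dependency is pinned to a commit hash; once the package is published to PyPI, the requirement could be pinned to a normal release instead (package name and version below are placeholders, not an actual release):

hca_ingest==x.y.z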
File renamed without changes.
@@ -1,5 +1,6 @@
import os
from unittest import TestCase, skip
from broker.service.spreadsheet_generation.spreadsheet_generator import SpreadsheetGenerator, SpreadsheetSpec, TypeSpec,\
from broker.service.spreadsheet_generation.spreadsheet_generator import SpreadsheetGenerator, SpreadsheetSpec, TypeSpec, \
LinkSpec, IncludeSomeModules, IncludeAllModules, TemplateTab, TemplateYaml
from ingest.api.ingestapi import IngestApi
from ingest.template.schema_template import SchemaTemplate
@@ -12,7 +13,11 @@
import tempfile
import yaml


class TestSpreadsheetGenerator(TestCase):
def setUp(self) -> None:
self.temp_dir = tempfile.TemporaryDirectory()
self.test_file = f"{self.temp_dir.name}/ss2.xlsx"

def test_link_column_generation(self):
cell_suspension_spec = TypeSpec("cell_suspension", IncludeAllModules(), False, LinkSpec(["donor_organism"], []))
@@ -93,7 +98,8 @@ def test_template_tabs_from_parsed_tabs(self):
self.assertTrue("project" in [tab.schema_name for tab in template_tabs])
self.assertTrue("contributors" in [tab.schema_name for tab in template_tabs])
self.assertTrue("Project" in [tab.display_name for tab in template_tabs])
self.assertTrue(any("specimen_from_organism.biomaterial_core.biomaterial_id" in cols for cols in [tab.columns for tab in template_tabs]))
self.assertTrue(any("specimen_from_organism.biomaterial_core.biomaterial_id" in cols for cols in
[tab.columns for tab in template_tabs]))

def test_user_friendly_names(self):
ingest_url = "https://api.ingest.dev.archive.data.humancellatlas.org"
@@ -195,39 +201,49 @@ def test_generate(self):
"sequencing_protocol"]))])

# check the spreadsheet exists
output_filename = spreadsheet_generator.generate(test_spreadsheet_spec, "ss2.xlsx")
self.assertTrue("ss2.xlsx" in output_filename)
output_filename = spreadsheet_generator.generate(test_spreadsheet_spec, self.test_file)
self.assertTrue(self.test_file in output_filename)

# check the actual tab names equal the expected tab names
xls = pd.ExcelFile("ss2.xlsx")
actual_tab_names = xls.sheet_names
df = pd.read_excel(self.test_file, engine='openpyxl', sheet_name=None)
actual_tab_names = list(df.keys())

# Note: "Project - Funding source(s)" is technically wrong, because the 'user friendly' name needs to be updated in the schema
# from "Funding source(s)" to "Funders" here: https://schema.dev.archive.data.humancellatlas.org/type/project/14.1.0/project

expected_tab_names1 = ["Project", "Project - Contributors", "Project - Publications", "Project - Funding source(s)",
"Donor organism", "Collection protocol", "Specimen from organism", "Organoid", "Cell line",
expected_tab_names1 = ["Project", "Project - Contributors", "Project - Publications",
"Project - Funding source(s)",
"Donor organism", "Collection protocol", "Specimen from organism", "Organoid",
"Cell line",
"Imaged specimen", "Dissociation protocol", "Aggregate generation protocol",
"Differentiation protocol", "Ipsc induction protocol", "Cell suspension",
"Imaging protocol", "Imaging protocol - Channel","Imaging protocol - Probe",
"Imaging protocol", "Imaging protocol - Channel", "Imaging protocol - Probe",
"Imaging preparation protocol", "Image file", "Library preparation protocol",
"Sequencing protocol", "Supplementary file", "Sequence file", "Schemas"]

self.assertEqual(actual_tab_names, expected_tab_names1)

# check the row headers are present and correct.
# note: it is difficult to extend this test to multiple tabs (too many expected row headers to list here). Any ideas/thoughts about how to extend it welcome.
xls = pd.ExcelFile("ss2.xlsx")
df = pd.read_excel(xls, "Project")

df = pd.read_excel(self.test_file, sheet_name='Project', engine='openpyxl')
self.assertEqual(df.columns[0], "PROJECT LABEL (Required)")
self.assertEqual(df.iloc[0,0], "A short name for the project.")
self.assertEqual(df.iloc[2,0], "project.project_core.project_short_name")
self.assertEqual(df.iloc[0, 0], "A short name for the project.")
self.assertEqual(df.iloc[2, 0], "project.project_core.project_short_name")

# note: it is difficult to extend this test to multiple tabs (too many expected column names to list here). Any ideas/thoughts about how to extend it welcome.
expected_col_names = ["project.project_core.project_short_name", "project.project_core.project_title",
"project.project_core.project_description",
"project.estimated_cell_count",
"project.data_use_restrictions.text",
"project.data_use_restrictions.ontology",
"project.data_use_restrictions.ontology_label",
"project.supplementary_links", "project.insdc_project_accessions",
"project.ega_accessions", "project.dbgap_accessions",
"project.geo_series_accessions", "project.array_express_accessions",
"project.insdc_study_accessions", "project.biostudies_accessions"]
actual_col_names = list(df.iloc[2])
self.assertEqual(expected_col_names, actual_col_names)
self.assertEqual(expected_col_names, actual_col_names)

def tearDown(self) -> None:
self.temp_dir.cleanup()