Uploading spreadsheet for updates #15

Closed · 27 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -109,6 +109,7 @@ venv.bak/
# directory for anything specific only to environment
_local/
venv*/
.venv*/

test/*/*.xlsx

13 changes: 12 additions & 1 deletion .gitlab-ci.yml
@@ -1,2 +1,13 @@
include:
remote: 'https://raw.githubusercontent.com/ebi-ait/gitlab-ci-templates/master/build-release-deploy.yml'
remote: 'https://raw.githubusercontent.com/ebi-ait/gitlab-ci-templates/master/build-release-deploy.yml'

Unit Test:
image: quay.io/ebi-ait/ingest-base-images:python_3.6-slim
variables:
INGEST_API : https://api.ingest.dev.archive.data.humancellatlas.org/
script:
- apt-get update
- apt-get install -y git
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- nosetests
34 changes: 24 additions & 10 deletions broker/service/spreadsheet_storage/spreadsheet_storage_service.py
@@ -9,7 +9,7 @@ def __init__(self, storage_dir, storage_manifest_name="storage_manifest.json"):
self.storage_dir = storage_dir
self.storage_manifest_name = storage_manifest_name

def store(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
def store_submission_spreadsheet(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
"""
Stores a given spreadsheet at path <submission_uuid>/<spreadsheetname>, local to
the storage directory
@@ -18,20 +18,31 @@ def store(self, submission_uuid, spreadsheet_name, spreadsheet_blob):
:param spreadsheet_blob:
:return:
"""
submission_dir = f'{self.storage_dir}/{submission_uuid}'
submission_dir = self.get_submission_dir(submission_uuid)
try:
os.mkdir(submission_dir)
submission_spreadsheet_path = f'{submission_dir}/{spreadsheet_name}'
storage_manifest_path = f'{submission_dir}/{self.storage_manifest_name}'
with open(submission_spreadsheet_path, "wb") as spreadsheet_file:
spreadsheet_file.write(spreadsheet_blob)
with open(storage_manifest_path, "w") as storage_manfiest:
json.dump({"name": spreadsheet_name, "location": submission_spreadsheet_path}, storage_manfiest)
return submission_spreadsheet_path
submission_spreadsheet_path = self.store_binary_file(submission_dir, spreadsheet_name, spreadsheet_blob)
manifest_content = {"name": spreadsheet_name, "location": submission_spreadsheet_path}
self.store_json_file(submission_dir, self.storage_manifest_name, manifest_content)
return submission_spreadsheet_path
except FileExistsError:
raise SubmissionSpreadsheetAlreadyExists()

def retrieve(self, submission_uuid):
def store_binary_file(self, directory: str, filename: str, blob):
os.makedirs(directory, exist_ok=True)
path = f'{directory}/{filename}'
with open(path, "wb") as file:
file.write(blob)
return path

def store_json_file(self, directory: str, filename: str, data: dict):
os.makedirs(directory, exist_ok=True)
path = f'{directory}/{filename}'
with open(path, "w") as json_file:
json.dump(data, json_file)
return path

def retrieve_submission_spreadsheet(self, submission_uuid):
try:
spreadsheet_location = self.get_spreadsheet_location(submission_uuid)
spreadsheet_name = spreadsheet_location["name"]
@@ -59,3 +70,6 @@ def get_spreadsheet_location(self, submission_uuid):
raise SubmissionSpreadsheetDoesntExist()
else:
return {"name": spreadsheet_name, "path": spreadsheet_path}

def get_submission_dir(self, submission_uuid):
return f'{self.storage_dir}/{submission_uuid}'
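A minimal usage sketch of the renamed storage API, assuming a local storage directory and illustrative uuid/file values that are not taken from the PR:

from broker.service.spreadsheet_storage.spreadsheet_storage_service import SpreadsheetStorageService

storage = SpreadsheetStorageService("/data/spreadsheets")

# Store the submitted spreadsheet under <storage_dir>/<submission_uuid>/ and write its manifest.
with open("dataset.xlsx", "rb") as f:
    path = storage.store_submission_spreadsheet("1234-abcd", "dataset.xlsx", f.read())

# Retrieve it later by submission uuid; the result carries the spreadsheet name and raw bytes.
spreadsheet = storage.retrieve_submission_spreadsheet("1234-abcd")
print(spreadsheet["name"], len(spreadsheet["blob"]))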
57 changes: 43 additions & 14 deletions broker/service/spreadsheet_upload_service.py
@@ -1,5 +1,7 @@
import logging
import os
import threading
import time

from ingest.api.ingestapi import IngestApi
from ingest.importer.importer import XlsImporter
@@ -16,28 +18,55 @@ def __init__(self, ingest_api: IngestApi, storage_service: SpreadsheetStorageSer
self.storage_service = storage_service
self.importer = importer

def async_upload(self, token, request_file, is_update, project_uuid=None):
if token is None:
raise SpreadsheetUploadError(401, "An authentication token must be supplied when uploading a spreadsheet")

self.ingest_api.set_token(token)
submission_resource = self.ingest_api.create_submission(update_submission=is_update)
def async_upload(self, token, request_file, is_update, project_uuid=None, submission_uuid=None):
self._validate_token_exists(token)
submission_resource = self._create_or_get_submission(submission_uuid)

submission_uuid = submission_resource["uuid"]["uuid"]
submission_url = submission_resource["_links"]["self"]["href"]
filename = secure_filename(request_file.filename)

submission_uuid = submission_resource["uuid"]["uuid"]
path = self.storage_service.store(submission_uuid, filename, request_file.read())
if is_update:
path = self._store_spreadsheet_updates(filename, request_file, submission_uuid)
thread = threading.Thread(target=self.upload_updates, args=(submission_url, path))
else:
path = self.storage_service.store_submission_spreadsheet(submission_uuid, filename, request_file.read())
thread = threading.Thread(target=self.upload, args=(submission_url, path, project_uuid))

thread = threading.Thread(target=self._upload,
args=(submission_resource, path, is_update, project_uuid))
thread.start()

return submission_resource

def _upload(self, submission_resource, path, is_update, project_uuid=None):
def _store_spreadsheet_updates(self, filename, request_file, submission_uuid):
submission_directory = self.storage_service.get_submission_dir(submission_uuid)
timestamp = time.strftime("%Y%m%d-%H%M%S")
filename_with_timestamp = f'{timestamp}_{filename}'
# TODO This spreadsheet containing the updates is not downloadable anywhere yet
path = self.storage_service.store_binary_file(submission_directory, filename_with_timestamp,
request_file.read())
return path

def _create_or_get_submission(self, submission_uuid):
if submission_uuid:
submission_resource = self.ingest_api.get_submission_by_uuid(submission_uuid)
else:
submission_resource = self.ingest_api.create_submission()
return submission_resource

def _validate_token_exists(self, token):
if token is None:
raise SpreadsheetUploadError(401, "An authentication token must be supplied when uploading a spreadsheet")
self.ingest_api.set_token(token)

def upload(self, submission_url, path, project_uuid=None):
Review comment (Contributor): Is upload for a full project rather than just updates?

Reply (Contributor Author): The upload function is for uploading spreadsheets with new entities. Sometimes the project is created before submission, so we supply it if it already exists.

_LOGGER.info('Spreadsheet started!')
submission, template_manager = self.importer.import_file(path, submission_url, project_uuid=project_uuid)
self.importer.update_spreadsheet_with_uuids(submission, template_manager, path)
_LOGGER.info('Spreadsheet upload done!')

def upload_updates(self, submission_url, path):
_LOGGER.info('Spreadsheet started!')
submission_url = submission_resource["_links"]["self"]["href"].rsplit("{")[0]
submission, template_manager = self.importer.import_file(path, submission_url, is_update, project_uuid)
self.importer.create_update_spreadsheet(submission, template_manager, path)
self.importer.import_file(path, submission_url, is_update=True)
_LOGGER.info('Spreadsheet upload done!')
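Following the reviewer's question above: upload handles spreadsheets that introduce new entities (optionally attached to a project that already exists), while upload_updates re-imports a spreadsheet of updated entities into an existing submission. A hedged sketch of how a caller reaches each path through async_upload, using the service instance configured in broker_app.py and illustrative uuids:

# New entities: a submission is created; a pre-existing project uuid may be passed through.
spreadsheet_upload_svc.async_upload(token, request_file, is_update=False, project_uuid="existing-project-uuid")

# Updates: the existing submission is looked up by uuid and the update spreadsheet is imported into it.
spreadsheet_upload_svc.async_upload(token, request_file, is_update=True, submission_uuid="existing-submission-uuid")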


4 changes: 2 additions & 2 deletions broker/service/summary_service.py
@@ -102,7 +102,7 @@ def add_specific_submission_information(self, submission_summary, submission_ent
return submission_summary

def get_entities_in_submission(self, submission_uri, entity_type) -> Generator[dict, None, None]:
yield from self.ingestapi.getEntities(submission_uri, entity_type, 1000)
yield from self.ingestapi.get_entities(submission_uri, entity_type)

def get_all_entities_in_submission(self, submission_uri):
return SubmissionEntities(list(self.get_entities_in_submission(submission_uri, 'biomaterials')),
@@ -112,7 +112,7 @@ def get_all_entities_in_submission(self, submission_uri):
list(self.get_entities_in_submission(submission_uri, 'files')))

def get_submissions_in_project(self, project_resource) -> Generator[dict, None, None]:
yield from self.ingestapi.getRelatedEntities('submissionEnvelopes', project_resource, 'submissionEnvelopes')
yield from self.ingestapi.get_related_entities('submissionEnvelopes', project_resource, 'submissionEnvelopes')

@staticmethod
def generate_summary_for_entity(entities) -> EntitySummary:
36 changes: 24 additions & 12 deletions broker_app.py
@@ -1,16 +1,18 @@
#!/usr/bin/env python

import io
import json
import jsonpickle
import logging
import os
import sys
import traceback
from http import HTTPStatus

import jsonpickle
from flask import Flask, request, redirect, send_file, jsonify
from flask import json
from flask import Flask, request, redirect, send_file, g
from flask_cors import CORS, cross_origin

from http import HTTPStatus

from ingest.api.ingestapi import IngestApi
from ingest.importer.importer import XlsImporter

@@ -46,9 +48,17 @@

SPREADSHEET_UPLOAD_MESSAGE_ERROR = "We experienced a problem while uploading your spreadsheet"

ingest_api = IngestApi()
spreadsheet_generator = SpreadsheetGenerator(ingest_api)
spreadsheet_job_manager = SpreadsheetJobManager(spreadsheet_generator, SPREADSHEET_STORAGE_DIR)

def setup():
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

global ingest_api
global spreadsheet_generator
global spreadsheet_job_manager

ingest_api = IngestApi()
spreadsheet_generator = SpreadsheetGenerator(ingest_api)
spreadsheet_job_manager = SpreadsheetJobManager(spreadsheet_generator, SPREADSHEET_STORAGE_DIR)


@app.route('/', methods=['GET'])
@@ -78,7 +88,7 @@ def upload_update_spreadsheet():
@app.route('/submissions/<submission_uuid>/spreadsheet', methods=['GET'])
def get_submission_spreadsheet(submission_uuid):
try:
spreadsheet = SpreadsheetStorageService(SPREADSHEET_STORAGE_DIR).retrieve(submission_uuid)
spreadsheet = SpreadsheetStorageService(SPREADSHEET_STORAGE_DIR).retrieve_submission_spreadsheet(submission_uuid)
spreadsheet_name = spreadsheet["name"]
spreadsheet_blob = spreadsheet["blob"]

@@ -122,6 +132,8 @@ def submission_summary(submission_uuid):
@cross_origin()
@app.route('/spreadsheets', methods=['POST'])
def create_spreadsheet():
spreadsheet_job_manager = SpreadsheetJobManager(g.spreadsheet_generator, SPREADSHEET_STORAGE_DIR)

request_json = json.loads(request.data)
filename = request_json["filename"]
spreadsheet_spec = SpreadsheetSpec.from_dict(request_json["spec"])
@@ -239,10 +251,11 @@ def _upload_spreadsheet(is_update=False):
token = request.headers.get('Authorization')
request_file = request.files['file']
project_uuid = request.form.get('projectUuid')
submission_uuid = request.form.get('submissionUuid')

try:
logger.info('Uploading spreadsheet!')
submission_resource = spreadsheet_upload_svc.async_upload(token, request_file, is_update, project_uuid)
submission_resource = spreadsheet_upload_svc.async_upload(token, request_file, is_update, project_uuid, submission_uuid)
Review comment (Contributor): The is_update param is just passed through here, and there is subsequent branching logic further down the execution sequence. This smells of unclean design. Maybe there would be room here for the "strategy" pattern.

Reply (Contributor Author, @aaclan-ebi, Jul 9, 2021): I'm afraid I don't have time to implement that for this sprint.

logger.info(f'Created Submission: {submission_resource["_links"]["self"]["href"]}')
except SpreadsheetUploadError as error:
return response_json(error.http_code, {
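The strategy pattern suggested in the thread above is not implemented in this PR; a minimal sketch of what it might look like, reusing the existing upload, upload_updates, and storage calls on the upload service (class names and wiring below are hypothetical):

import threading

class NewSubmissionStrategy:
    # Hypothetical strategy for spreadsheets that introduce new entities.
    def store(self, svc, submission_uuid, filename, request_file):
        return svc.storage_service.store_submission_spreadsheet(submission_uuid, filename, request_file.read())

    def start(self, svc, submission_url, path, project_uuid):
        threading.Thread(target=svc.upload, args=(submission_url, path, project_uuid)).start()

class UpdateSubmissionStrategy:
    # Hypothetical strategy for spreadsheets that carry updates to an existing submission.
    def store(self, svc, submission_uuid, filename, request_file):
        return svc._store_spreadsheet_updates(filename, request_file, submission_uuid)

    def start(self, svc, submission_url, path, project_uuid):
        threading.Thread(target=svc.upload_updates, args=(submission_url, path)).start()

# async_upload would then select a strategy instead of branching on is_update:
# strategy = UpdateSubmissionStrategy() if is_update else NewSubmissionStrategy()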
@@ -287,6 +300,5 @@ def response_json(status_code, data):


if __name__ == '__main__':
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

app.run(host='0.0.0.0', port=5000)
setup()
app.run(host='0.0.0.0', port=5000)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -8,7 +8,7 @@ itsdangerous==0.24
jdcal==1.4
MarkupSafe==1.1
werkzeug>=0.15.3
-e git+https://github.com/ebi-ait/ingest-client.git@7f04ab9a#egg=hca_ingest
-e git+https://github.com/ebi-ait/ingest-client.git@7038dcb#egg=hca_ingest
Review comment (Contributor): Why are we not using a normal version number? (x.y.z)

Reply (Contributor Author): We could.

Reply (Contributor Author): This was done this way in the early development of the ingest-client Python lib; it hasn't been changed yet. We should also set up a release build for the ingest-client repo so that it releases the module to PyPI on every merge to master.

Reply (Contributor Author): I filed a ticket for the ingest-client CI/CD setup: ebi-ait/dcp-ingest-central#385

jsonpickle~=1.2
expiringdict==1.1.4
jsonpath-rw
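As the thread above notes, the ingest-client dependency is pinned to a commit hash; once the package is published to PyPI, the requirement could be pinned to a normal release instead (package name and version below are placeholders, not an actual release):

hca_ingest==x.y.z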
File renamed without changes.
@@ -1,5 +1,6 @@
import os
from unittest import TestCase, skip
from broker.service.spreadsheet_generation.spreadsheet_generator import SpreadsheetGenerator, SpreadsheetSpec, TypeSpec,\
from broker.service.spreadsheet_generation.spreadsheet_generator import SpreadsheetGenerator, SpreadsheetSpec, TypeSpec, \
LinkSpec, IncludeSomeModules, IncludeAllModules, TemplateTab, TemplateYaml
from ingest.api.ingestapi import IngestApi
from ingest.template.schema_template import SchemaTemplate
@@ -12,7 +13,11 @@
import tempfile
import yaml


class TestSpreadsheetGenerator(TestCase):
def setUp(self) -> None:
self.temp_dir = tempfile.TemporaryDirectory()
self.test_file = f"{self.temp_dir.name}/ss2.xlsx"

def test_link_column_generation(self):
cell_suspension_spec = TypeSpec("cell_suspension", IncludeAllModules(), False, LinkSpec(["donor_organism"], []))
@@ -93,7 +98,8 @@ def test_template_tabs_from_parsed_tabs(self):
self.assertTrue("project" in [tab.schema_name for tab in template_tabs])
self.assertTrue("contributors" in [tab.schema_name for tab in template_tabs])
self.assertTrue("Project" in [tab.display_name for tab in template_tabs])
self.assertTrue(any("specimen_from_organism.biomaterial_core.biomaterial_id" in cols for cols in [tab.columns for tab in template_tabs]))
self.assertTrue(any("specimen_from_organism.biomaterial_core.biomaterial_id" in cols for cols in
[tab.columns for tab in template_tabs]))

def test_user_friendly_names(self):
ingest_url = "https://api.ingest.dev.archive.data.humancellatlas.org"
@@ -195,39 +201,49 @@ def test_generate(self):
"sequencing_protocol"]))])

# check the spreadsheet exists
output_filename = spreadsheet_generator.generate(test_spreadsheet_spec, "ss2.xlsx")
self.assertTrue("ss2.xlsx" in output_filename)
output_filename = spreadsheet_generator.generate(test_spreadsheet_spec, self.test_file)
self.assertTrue(self.test_file in output_filename)

# check the actual tab names equal the expected tab names
xls = pd.ExcelFile("ss2.xlsx")
actual_tab_names = xls.sheet_names
df = pd.read_excel(self.test_file, engine='openpyxl', sheet_name=None)
actual_tab_names = list(df.keys())

# Note: "Project - Funding source(s)" is technically wrong, because the 'user friendly' name needs to be updated in the schema
# from "Funding source(s)" to "Funders" here: https://schema.dev.archive.data.humancellatlas.org/type/project/14.1.0/project

expected_tab_names1 = ["Project", "Project - Contributors", "Project - Publications", "Project - Funding source(s)",
"Donor organism", "Collection protocol", "Specimen from organism", "Organoid", "Cell line",
expected_tab_names1 = ["Project", "Project - Contributors", "Project - Publications",
"Project - Funding source(s)",
"Donor organism", "Collection protocol", "Specimen from organism", "Organoid",
"Cell line",
"Imaged specimen", "Dissociation protocol", "Aggregate generation protocol",
"Differentiation protocol", "Ipsc induction protocol", "Cell suspension",
"Imaging protocol", "Imaging protocol - Channel","Imaging protocol - Probe",
"Imaging protocol", "Imaging protocol - Channel", "Imaging protocol - Probe",
"Imaging preparation protocol", "Image file", "Library preparation protocol",
"Sequencing protocol", "Supplementary file", "Sequence file", "Schemas"]

self.assertEqual(actual_tab_names, expected_tab_names1)

# check the row headers are present and correct.
# note: it is difficult to extend this test to multiple tabs (too many expected row headers to list here). Any ideas/thoughts about how to extend it welcome.
xls = pd.ExcelFile("ss2.xlsx")
df = pd.read_excel(xls, "Project")

df = pd.read_excel(self.test_file, sheet_name='Project', engine='openpyxl')
self.assertEqual(df.columns[0], "PROJECT LABEL (Required)")
self.assertEqual(df.iloc[0,0], "A short name for the project.")
self.assertEqual(df.iloc[2,0], "project.project_core.project_short_name")
self.assertEqual(df.iloc[0, 0], "A short name for the project.")
self.assertEqual(df.iloc[2, 0], "project.project_core.project_short_name")

# note: it is difficult to extend this test to multiple tabs (too many expected column names to list here). Any ideas/thoughts about how to extend it welcome.
expected_col_names = ["project.project_core.project_short_name", "project.project_core.project_title",
"project.project_core.project_description",
"project.estimated_cell_count",
"project.data_use_restrictions.text",
"project.data_use_restrictions.ontology",
"project.data_use_restrictions.ontology_label",
"project.supplementary_links", "project.insdc_project_accessions",
"project.ega_accessions", "project.dbgap_accessions",
"project.geo_series_accessions", "project.array_express_accessions",
"project.insdc_study_accessions", "project.biostudies_accessions"]
actual_col_names = list(df.iloc[2])
self.assertEqual(expected_col_names, actual_col_names)
self.assertEqual(expected_col_names, actual_col_names)

def tearDown(self) -> None:
self.temp_dir.cleanup()