ebi-ait · ke4 · Jul 21, 2021 · Jul 8, 2021 · Jul 9, 2021 · Jul 9, 2021
diff --git a/ingest/api/ingestapi.py b/ingest/api/ingestapi.py
@@ -121,13 +121,20 @@ def getSubmissions(self):
         if r.status_code == requests.codes.ok:
             return json.loads(r.text)["_embedded"]["submissionEnvelopes"]
 
-    def getProjects(self, id):
-        submissionUrl = self.url + '/submissionEnvelopes/' + id + '/projects'
-        r = self.get(submissionUrl, headers=self.get_headers())
-        projects = []
+    def get_projects(self, submission_id):
+        """Return the projects found under the submission envelope's ``projects`` link.
+
+        Returns an empty list when the request does not come back OK or the
+        response has no ``_embedded.projects`` entry.
+        """
+        return self.__get_projects_by_submission_id_and_type(submission_id, 'projects')
+
+    def get_related_project(self, submission_id):
+        """Return the first project under the submission envelope's ``relatedProjects`` link.
+
+        Returns ``None`` when no project is found there.
+        """
+        projects = self.__get_projects_by_submission_id_and_type(submission_id, 'relatedProjects')
+        return projects[0] if projects else None
+
+    def __get_projects_by_submission_id_and_type(self, submission_id, project_type):
+        """Fetch ``/submissionEnvelopes/<submission_id>/<project_type>`` and return its projects.
+
+        The result is the list stored under ``_embedded.projects`` in the
+        response body. A non-OK status or a body without that entry yields
+        an empty list rather than an error.
+        """
+        submission_url = f'{self.url}/submissionEnvelopes/{submission_id}/{project_type}'
+        r = self.get(submission_url, headers=self.get_headers())
+        # Start from an empty dict so the chained .get() calls below fall
+        # through to [] when the request did not succeed.
+        projects = {}
         if r.status_code == requests.codes.ok:
             projects = json.loads(r.text)
-        return projects
+        return projects.get('_embedded', {}).get('projects', [])
 
     def get_project_by_id(self, id):
         submission_url = self.url + '/projects/' + id

diff --git a/ingest/downloader/data_collector.py b/ingest/downloader/data_collector.py
@@ -0,0 +1,32 @@
+from ingest.api.ingestapi import IngestApi
+
+
+class DataCollector:
+    """Collects the metadata documents that belong to one submission.
+
+    The collected list starts with the submission's related project and is
+    followed by whatever the API returns for biomaterials, processes,
+    protocols and files, in that order.
+    """
+
+    def __init__(self, ingest_api: IngestApi):
+        self.api = ingest_api
+
+    def collect_data_by_submission_uuid(self, submission_uuid):
+        """Return the project and related entities of the given submission.
+
+        :param submission_uuid: uuid used to look the submission up via the API.
+        :return: list whose first item is the related project, followed by
+            the submission's related entities.
+        :raises Exception: when the submission has no related project.
+        """
+        submission = self.api.get_submission_by_uuid(submission_uuid)
+
+        # The submission id is the last path segment of its own link.
+        self_href = submission['_links']['self']['href']
+        submission_id = self_href.split('/')[-1]
+
+        project_json = self.api.get_related_project(submission_id)
+        if not project_json:
+            raise Exception('There should be a project')
+
+        data_by_submission = [project_json]
+        for entity_type in ('biomaterials', 'processes', 'protocols', 'files'):
+            self.__get_entities_by_submission_and_type(data_by_submission, submission, entity_type)
+
+        return data_by_submission
+
+    def __get_entities_by_submission_and_type(self, data_by_submission, submission, entity_type):
+        """Append the submission's entities of ``entity_type`` to ``data_by_submission``.
+
+        ``entity_type`` is passed to the API as both the first and the third
+        argument of ``get_related_entities``. Nothing is appended when the
+        API result is falsy.
+        """
+        entities = self.api.get_related_entities(entity_type, submission, entity_type)
+        if not entities:
+            return
+        data_by_submission.extend(list(entities))
diff --git a/ingest/downloader/downloader.py b/ingest/downloader/downloader.py
@@ -0,0 +1,50 @@
+from typing import List
+
+from openpyxl import Workbook
+from openpyxl.worksheet.worksheet import Worksheet
+
+from ingest.downloader.flattener import Flattener
+from ingest.importer.spreadsheet.ingest_worksheet import START_DATA_ROW
+
+# Worksheet row (1-based, as passed to worksheet.cell) that holds the column headers.
+HEADER_ROW_NO = 4
+
+
+class XlsDownloader:
+    """Turns metadata documents into an openpyxl workbook.
+
+    ``convert_json`` flattens the documents; ``create_workbook`` writes the
+    flattened structure out, one worksheet per top-level key.
+    """
+
+    def __init__(self):
+        self.flattener = Flattener()
+
+    def convert_json(self, metadata_list: List[dict]):
+        """Flatten ``metadata_list`` with this instance's flattener and return the result."""
+        flattened = self.flattener.flatten(metadata_list)
+        return flattened
+
+    def create_workbook(self, input_json: dict) -> Workbook:
+        """Build a workbook with one worksheet per entry of ``input_json``.
+
+        :param input_json: maps a worksheet title to a dict holding a
+            ``headers`` list and a ``values`` list of row dicts.
+        :return: the populated workbook; a sheet titled ``Project`` is
+            inserted at index 0, every other sheet is appended.
+        """
+        workbook = Workbook()
+        # Remove the sheet that is active on a fresh workbook before adding our own.
+        workbook.remove(workbook.active)
+
+        for ws_title, ws_elements in input_json.items():
+            sheet_options = {'title': ws_title}
+            if ws_title == 'Project':
+                sheet_options['index'] = 0
+            worksheet: Worksheet = workbook.create_sheet(**sheet_options)
+
+            self.add_worksheet_content(worksheet, ws_elements)
+
+        return workbook
+
+    def add_worksheet_content(self, worksheet, ws_elements: dict):
+        """Write the header row and then every data row of ``ws_elements``.
+
+        Headers go to row ``HEADER_ROW_NO``; data rows are numbered from
+        ``START_DATA_ROW`` onwards.
+        """
+        headers = ws_elements.get('headers')
+        self.__add_header_row(worksheet, headers)
+
+        row_number = START_DATA_ROW
+        for row_values in ws_elements.get('values'):
+            self.__add_row_content(worksheet, headers, row_number, row_values)
+            row_number += 1
+
+    @staticmethod
+    def __add_header_row(worksheet, headers: list):
+        """Write each header into consecutive columns of the header row, starting at column 1."""
+        for offset, header in enumerate(headers):
+            worksheet.cell(row=HEADER_ROW_NO, column=offset + 1, value=header)
+
+    @staticmethod
+    def __add_row_content(worksheet, headers: list, row_number: int, values: dict):
+        """Write each value of one row into the column of its matching header."""
+        for header, value in values.items():
+            # Columns are 1-based, list positions 0-based.
+            column = headers.index(header) + 1
+            worksheet.cell(row=row_number, column=column, value=value)
diff --git a/ingest/downloader/flattener.py b/ingest/downloader/flattener.py
@@ -0,0 +1,112 @@
+from typing import List
+
+# Joins the dot-separated parts of a key into a worksheet name, e.g. 'Project - Contributors'.
+MODULE_WORKSHEET_NAME_CONNECTOR = ' - '
+# Joins the elements of a list into a single cell value.
+SCALAR_LIST_DELIMETER = '||'
+
+# A list of objects is treated as an ontology list when its first element has all of these keys.
+ONTOLOGY_REQUIRED_PROPS = ['ontology', 'ontology_label']
+# Keys that are never copied into a flattened row.
+EXCLUDE_KEYS = ['describedBy', 'schema_type']
+
+
+class Flattener:
+    """Flattens nested metadata entities into per-worksheet headers and rows.
+
+    The result is a dict keyed by worksheet name. Each value holds a
+    ``headers`` list (column keys in first-seen order) and a ``values`` list
+    with one flat ``{dotted.key: str}`` dict per row.
+
+    Results accumulate in ``self.workbook`` for the lifetime of the
+    instance: a second ``flatten`` call on the same instance adds its rows
+    to those of the first. Use a fresh instance for an independent result.
+    """
+
+    def __init__(self):
+        self.workbook = {}
+
+    def flatten(self, entity_list: List[dict], object_key: str = ''):
+        """Flatten every entity of ``entity_list`` and return the accumulated workbook.
+
+        :param entity_list: entities to flatten. With an empty ``object_key``
+            each one must provide ``content`` and ``uuid.uuid``.
+        :param object_key: dotted key of the parent property when flattening
+            a nested list of objects; empty for top-level entities.
+        :return: ``self.workbook``.
+        """
+        for entity in entity_list:
+            self._flatten_entity(entity, object_key)
+        return self.workbook
+
+    def _flatten_entity(self, entity, object_key):
+        """Flatten one entity into one row of the worksheet it belongs to.
+
+        :raises Exception: when no worksheet name can be determined.
+        """
+        worksheet_name = object_key
+        row = {}
+        content = entity
+
+        if not object_key:
+            # Top-level entity: the worksheet is named after the schema in
+            # 'describedBy' and the row starts with the entity uuid.
+            content = entity['content']
+            worksheet_name = self._get_concrete_entity(content)
+            row = {f'{worksheet_name}.uuid': entity['uuid']['uuid']}
+
+        if not worksheet_name:
+            raise Exception('There should be a worksheet name')
+
+        self._flatten_object(content, row, parent_key=worksheet_name)
+
+        user_friendly_worksheet_name = self._format_worksheet_name(worksheet_name)
+        worksheet = self.workbook.get(user_friendly_worksheet_name, {'headers': [], 'values': []})
+
+        rows = self._append_row_to_worksheet(row, worksheet)
+        headers = self._update_headers(row, worksheet)
+
+        self.workbook[user_friendly_worksheet_name] = {
+            'headers': headers,
+            'values': rows
+        }
+
+    def _append_row_to_worksheet(self, row, worksheet):
+        """Append ``row`` to the worksheet's rows and return that list."""
+        rows = worksheet.get('values')
+        rows.append(row)
+        return rows
+
+    def _update_headers(self, row, worksheet):
+        """Add the keys of ``row`` not yet present to the worksheet's headers and return them."""
+        headers = worksheet.get('headers')
+        for key in row:
+            if key not in headers:
+                headers.append(key)
+        return headers
+
+    def _flatten_object(self, object: dict, flattened_object: dict, parent_key: str = ''):
+        """Copy the leaves of ``object`` into ``flattened_object`` under dotted keys.
+
+        Dicts are walked recursively, skipping ``EXCLUDE_KEYS``. Lists are
+        handed to ``_flatten_list``. Scalar values are stored as ``str``.
+        """
+        if isinstance(object, dict):
+            for key, value in object.items():
+                if key in EXCLUDE_KEYS:
+                    continue
+
+                full_key = f'{parent_key}.{key}' if parent_key else key
+                if isinstance(value, (dict, list)):
+                    self._flatten_object(value, flattened_object, parent_key=full_key)
+                else:
+                    flattened_object[full_key] = str(value)
+        elif isinstance(object, list):
+            self._flatten_list(flattened_object, object, parent_key)
+
+    def _flatten_list(self, flattened_object, object, parent_key):
+        """Flatten a list either as a list of objects or as a list of scalars."""
+        if self._is_list_of_objects(object):
+            self._flatten_object_list(flattened_object, object, parent_key)
+        else:
+            self._flatten_scalar_list(flattened_object, object, parent_key)
+
+    def _flatten_scalar_list(self, flattened_object, object, parent_key):
+        """Store the stringified elements joined by ``SCALAR_LIST_DELIMETER`` under ``parent_key``."""
+        stringified = [str(e) for e in object]
+        flattened_object[parent_key] = SCALAR_LIST_DELIMETER.join(stringified)
+
+    def _flatten_object_list(self, flattened_object: dict, object: List[dict], parent_key: str):
+        """Flatten a list of objects.
+
+        Ontology lists are folded into the current row. Any other list of
+        objects becomes its own worksheet, keyed by ``parent_key``, with one
+        row per element.
+        """
+        if self._is_list_of_ontology_objects(object):
+            self._flatten_ontology_list(object, flattened_object, parent_key)
+        else:
+            self.flatten(object, parent_key)
+
+    def _flatten_ontology_list(self, object: List[dict], flattened_object: dict, parent_key: str):
+        """Store one delimiter-joined column per ontology key in the current row.
+
+        Values are stringified, matching ``_flatten_scalar_list``. An element
+        that lacks a key contributes an empty string, so the positions in
+        every column still line up with the elements of the list.
+        """
+        keys = self._get_keys_of_a_list_of_object(object)
+        for key in keys:
+            flattened_object[f'{parent_key}.{key}'] = SCALAR_LIST_DELIMETER.join(
+                str(elem.get(key, '')) for elem in object
+            )
+
+    def _format_worksheet_name(self, worksheet_name):
+        """Turn a dotted key such as ``project.contributors`` into ``Project - Contributors``."""
+        names = worksheet_name.split('.')
+        names = [n.replace('_', ' ') for n in names]
+        new_worksheet_name = MODULE_WORKSHEET_NAME_CONNECTOR.join([n.capitalize() for n in names])
+        return new_worksheet_name
+
+    def _is_list_of_objects(self, content):
+        """Return True when ``content`` is non-empty and its first element is a dict."""
+        return bool(content) and isinstance(content[0], dict)
+
+    def _is_list_of_ontology_objects(self, object: List[dict]):
+        """Return True when the first element has every key in ``ONTOLOGY_REQUIRED_PROPS``."""
+        first_elem = object[0] if object else {}
+        return all(prop in first_elem for prop in ONTOLOGY_REQUIRED_PROPS)
+
+    def _get_keys_of_a_list_of_object(self, object: List[dict]):
+        """Return the keys used by any element of the list, in first-seen order."""
+        # dict.fromkeys de-duplicates while keeping insertion order.
+        return list(dict.fromkeys(key for elem in object for key in elem))
+
+    def _get_concrete_entity(self, content: dict):
+        """Return the last path segment of ``content['describedBy']``.
+
+        Returns an empty string when ``describedBy`` is missing or empty, so
+        the caller's explicit worksheet-name check reports the problem.
+        """
+        described_by = content.get('describedBy') or ''
+        return described_by.rsplit('/', 1)[-1]
diff --git a/ingest/importer/importer.py b/ingest/importer/importer.py
@@ -187,7 +187,6 @@ def __init__(self, template_mgr):
     def do_import(self, workbook: IngestWorkbook, is_update, project_uuid=None):
         registry = _ImportRegistry(self.template_mgr)
         importable_worksheets = workbook.importable_worksheets()
-
         workbook_errors = self.validate_worksheets(is_update, importable_worksheets)
 
         if project_uuid:

diff --git a/tests/unit/downloader/project-list-flattened.json b/tests/unit/downloader/project-list-flattened.json
@@ -0,0 +1,126 @@
+{
+  "Project": {
+    "headers": [
+      "project.uuid",
+      "project.project_core.project_short_name",
+      "project.project_core.project_title",
+      "project.project_core.project_description",
+      "project.insdc_project_accessions",
+      "project.geo_series_accessions",
+      "project.insdc_study_accessions"
+    ],
+    "values": [
+      {
+        "project.uuid": "3e329187-a9c4-48ec-90e3-cc45f7c2311c",
+        "project.project_core.project_short_name": "kriegsteinBrainOrganoids",
+        "project.project_core.project_title": "Establishing Cerebral Organoids as Models of Human-Specific Brain Evolution",
+        "project.project_core.project_description": "Direct comparisons of human and non-human primate brain tissue have the potential to reveal molecular pathways underlying remarkable specializations of the human brain. However, chimpanzee tissue is largely inaccessible during neocortical neurogenesis when differences in brain size first appear. To identify human-specific features of cortical development, we leveraged recent innovations that permit generating pluripotent stem cell-derived cerebral organoids from chimpanzee. First, we systematically evaluated the fidelity of organoid models to primary human and macaque cortex, finding organoid models preserve gene regulatory networks related to cell types and developmental processes but exhibit increased metabolic stress. Second, we identified 261 genes differentially expressed in human compared to chimpanzee organoids and macaque cortex. Many of these genes overlap with human-specific segmental duplications and a subset suggest increased PI3K/AKT/mTOR activation in human outer radial glia. Together, our findings establish a platform for systematic analysis of molecular changes contributing to human brain development and evolution. Overall design: Single cell mRNA sequencing of iPS-derived neural and glial progenitor cells using the Fluidigm C1 system This series includes re-analysis of publicly available data in accessions: phs000989.v3, GSE99951, GSE86207, GSE75140. Sample metadata and accession IDs for the re-analyzed samples are included in the file \"GSE124299_metadata_on_processed_samples.xlsx\" available on the foot of this record. The following samples have no raw data due to data loss: GSM3569728, GSM3569738, GSM3571601, GSM3571606, GSM3571615, GSM3571621, GSM3571625, and GSM3571631",
+        "project.insdc_project_accessions": "SRP180337",
+        "project.geo_series_accessions": "GSE124299",
+        "project.insdc_study_accessions": "PRJNA515930"
+      }
+    ]
+  },
+  "Project - Contributors": {
+    "headers": [
+      "project.contributors.name",
+      "project.contributors.email",
+      "project.contributors.institution",
+      "project.contributors.laboratory",
+      "project.contributors.country",
+      "project.contributors.corresponding_contributor",
+      "project.contributors.project_role.text",
+      "project.contributors.project_role.ontology",
+      "project.contributors.project_role.ontology_label"
+    ],
+    "values": [
+      {
+        "project.contributors.name": "Alex A,,Pollen",
+        "project.contributors.email": "[email protected]",
+        "project.contributors.institution": "University of California, San Francisco (UCSF)",
+        "project.contributors.laboratory": "Department of Neurology",
+        "project.contributors.country": "USA",
+        "project.contributors.corresponding_contributor": "True",
+        "project.contributors.project_role.text": "experimental scientist",
+        "project.contributors.project_role.ontology": "EFO:0009741",
+        "project.contributors.project_role.ontology_label": "experimental scientist"
+      },
+      {
+        "project.contributors.name": "Parisa,,Nejad",
+        "project.contributors.email": "[email protected]",
+        "project.contributors.institution": "University of California, Santa Cruz",
+        "project.contributors.laboratory": "Human Cell Atlas Data Coordination Platform",
+        "project.contributors.country": "USA",
+        "project.contributors.corresponding_contributor": "False",
+        "project.contributors.project_role.text": "data wrangler",
+        "project.contributors.project_role.ontology": "EFO:0009737",
+        "project.contributors.project_role.ontology_label": "data curator"
+      },
+      {
+        "project.contributors.name": "Schwartz,,Rachel",
+        "project.contributors.email": "[email protected]",
+        "project.contributors.institution": "University of California, Santa Cruz",
+        "project.contributors.laboratory": "Human Cell Atlas Data Coordination Platform",
+        "project.contributors.country": "USA",
+        "project.contributors.corresponding_contributor": "False",
+        "project.contributors.project_role.text": "data wrangler",
+        "project.contributors.project_role.ontology": "EFO:0009737",
+        "project.contributors.project_role.ontology_label": "data curator"
+      }
+    ]
+  },
+  "Project - Publications": {
+    "headers": [
+      "project.publications.authors",
+      "project.publications.title",
+      "project.publications.doi",
+      "project.publications.pmid",
+      "project.publications.url"
+    ],
+    "values": [
+      {
+        "project.publications.authors": "Pollen AA||Bhaduri A||Andrews MG||Nowakowski TJ||Meyerson OS||Mostajo-Radji MA||Di Lullo E||Alvarado B||Bedolli M||Dougherty ML||Fiddes IT||Kronenberg ZN||Shuga J||Leyrat AA||West JA||Bershteyn M||Lowe CB||Pavlovic BJ||Salama SR||Haussler D||Eichler EE||Kriegstein AR",
+        "project.publications.title": "Establishing Cerebral Organoids as Models of Human-Specific Brain Evolution.",
+        "project.publications.doi": "10.1016/j.cell.2019.01.017",
+        "project.publications.pmid": "30735633",
+        "project.publications.url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6544371/"
+      }
+    ]
+  },
+  "Project - Funders": {
+    "headers": [
+      "project.funders.grant_id",
+      "project.funders.organization"
+    ],
+    "values": [
+      {
+        "project.funders.grant_id": "U01 MH105989",
+        "project.funders.organization": "NIMH NIH HHS"
+      },
+      {
+        "project.funders.grant_id": "R35 NS097305",
+        "project.funders.organization": "NINDS NIH HHS"
+      },
+      {
+        "project.funders.grant_id": "T32 HD007470",
+        "project.funders.organization": "NICHD NIH HHS"
+      },
+      {
+        "project.funders.grant_id": "T32 GM007266",
+        "project.funders.organization": "NIGMS NIH HHS"
+      },
+      {
+        "project.funders.grant_id": "F32 NS103266",
+        "project.funders.organization": "NINDS NIH HHS"
+      },
+      {
+        "project.funders.grant_id": "NA",
+        "project.funders.organization": "Howard Hughes Medical Institute"
+      },
+      {
+        "project.funders.grant_id": "P51 OD011132",
+        "project.funders.organization": "NIH HHS"
+      }
+    ]
+  }
+}