#13 Spreadsheet download #22
@@ -0,0 +1,32 @@ (new file: the DataCollector class)
from ingest.api.ingestapi import IngestApi


class DataCollector:

    def __init__(self, ingest_api: IngestApi):
        self.api = ingest_api

    def collect_data_by_submission_uuid(self, submission_uuid):
        submission = self.api.get_submission_by_uuid(submission_uuid)
        submission_id = submission['_links']['self']['href'].split('/')[-1]
        project_json = self.api.get_related_project(submission_id)

        if project_json:
            data_by_submission = [
                project_json
            ]
        else:
            raise Exception('There should be a project')

        self.__get_entities_by_submission_and_type(data_by_submission, submission, 'biomaterials')
        self.__get_entities_by_submission_and_type(data_by_submission, submission, 'processes')
        self.__get_entities_by_submission_and_type(data_by_submission, submission, 'protocols')
        self.__get_entities_by_submission_and_type(data_by_submission, submission, 'files')

        return data_by_submission

    def __get_entities_by_submission_and_type(self, data_by_submission, submission, entity_type):
        entity_json = \
            self.api.get_related_entities(entity_type, submission, entity_type)
        if entity_json:
            data_by_submission.extend(list(entity_json))
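For review purposes, a minimal usage sketch of the collector. The module path, the IngestApi constructor argument, the ingest URL, and the submission UUID below are all assumptions for illustration, not values from this PR:

    # Hypothetical wiring of the new DataCollector; module path, ingest URL and
    # submission UUID are illustrative placeholders only.
    from ingest.api.ingestapi import IngestApi
    from ingest.downloader.data_collector import DataCollector  # path assumed

    api = IngestApi(url='https://api.ingest.example.org')  # constructor argument assumed
    collector = DataCollector(api)

    # Returns the project plus all related biomaterials, processes, protocols and files.
    entities = collector.collect_data_by_submission_uuid('1234abcd-0000-0000-0000-000000000000')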
@@ -0,0 +1,50 @@ (new file: the XlsDownloader class)
from typing import List

from openpyxl import Workbook
from openpyxl.worksheet.worksheet import Worksheet

from ingest.downloader.flattener import Flattener
from ingest.importer.spreadsheet.ingest_worksheet import START_DATA_ROW

HEADER_ROW_NO = 4


class XlsDownloader:
    def __init__(self):
        self.flattener = Flattener()

    def convert_json(self, metadata_list: List[dict]):
        return self.flattener.flatten(metadata_list)

    def create_workbook(self, input_json: dict) -> Workbook:
        workbook = Workbook()
        workbook.remove(workbook.active)

        for ws_title, ws_elements in input_json.items():
            if ws_title == 'Project':
                worksheet: Worksheet = workbook.create_sheet(title=ws_title, index=0)
            else:
                worksheet: Worksheet = workbook.create_sheet(title=ws_title)

            self.add_worksheet_content(worksheet, ws_elements)

        return workbook

    def add_worksheet_content(self, worksheet, ws_elements: dict):
        headers = ws_elements.get('headers')
        self.__add_header_row(worksheet, headers)
        all_values = ws_elements.get('values')

        for row_number, row_values in enumerate(all_values, start=START_DATA_ROW):
            self.__add_row_content(worksheet, headers, row_number, row_values)

    @staticmethod
    def __add_header_row(worksheet, headers: list):
        for col, header in enumerate(headers, start=1):
            worksheet.cell(row=HEADER_ROW_NO, column=col, value=header)

    @staticmethod
    def __add_row_content(worksheet, headers: list, row_number: int, values: dict):
        for header, value in values.items():
            index = headers.index(header)
            worksheet.cell(row=row_number, column=index + 1, value=value)
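A short end-to-end sketch of how the two new classes might be combined, assuming the collector sketch above and openpyxl's standard Workbook.save; the variable names and output filename are illustrative:

    # Hypothetical usage of XlsDownloader with the entities collected above.
    downloader = XlsDownloader()

    flattened = downloader.convert_json(entities)   # one 'headers'/'values' dict per worksheet
    workbook = downloader.create_workbook(flattened)
    workbook.save('submission.xlsx')                # openpyxl writes the spreadsheet to disk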
@@ -0,0 +1,112 @@ (new file: the Flattener class)
from typing import List

MODULE_WORKSHEET_NAME_CONNECTOR = ' - '
SCALAR_LIST_DELIMETER = '||'

ONTOLOGY_REQUIRED_PROPS = ['ontology', 'ontology_label']
EXCLUDE_KEYS = ['describedBy', 'schema_type']


class Flattener:
    def __init__(self):
        self.workbook = {}

    def flatten(self, entity_list: List[dict], object_key: str = ''):
        for entity in entity_list:
            self._flatten_entity(entity, object_key)
        return self.workbook

    def _flatten_entity(self, entity, object_key):
        worksheet_name = object_key
        row = {}
        content = entity

        if not object_key:
            content = entity['content']
            worksheet_name = self._get_concrete_entity(content)
            row = {f'{worksheet_name}.uuid': entity['uuid']['uuid']}

        if not worksheet_name:
            raise Exception('There should be a worksheet name')

        self._flatten_object(content, row, parent_key=worksheet_name)

        user_friendly_worksheet_name = self._format_worksheet_name(worksheet_name)
        worksheet = self.workbook.get(user_friendly_worksheet_name, {'headers': [], 'values': []})

        rows = self._append_row_to_worksheet(row, worksheet)
        headers = self._update_headers(row, worksheet)

        self.workbook[user_friendly_worksheet_name] = {
            'headers': headers,
            'values': rows
        }

    def _append_row_to_worksheet(self, row, worksheet):
        rows = worksheet.get('values')
        rows.append(row)
        return rows

    def _update_headers(self, row, worksheet):
        headers = worksheet.get('headers')
        for key in row.keys():
            if key not in headers:
                headers.append(key)
        return headers

    def _flatten_object(self, object: dict, flattened_object: dict, parent_key: str = ''):
        if isinstance(object, dict):
            for key in object:
                if key in EXCLUDE_KEYS:
                    continue

                value = object[key]
                full_key = f'{parent_key}.{key}' if parent_key else key
                if isinstance(value, dict) or isinstance(value, list):
                    self._flatten_object(value, flattened_object, parent_key=full_key)
                else:
                    flattened_object[full_key] = str(value)
        elif isinstance(object, list):
            self._flatten_list(flattened_object, object, parent_key)

    def _flatten_list(self, flattened_object, object, parent_key):
        if self._is_list_of_objects(object):
            self._flatten_object_list(flattened_object, object, parent_key)
        else:
            self._flatten_scalar_list(flattened_object, object, parent_key)

    def _flatten_scalar_list(self, flattened_object, object, parent_key):
        stringified = [str(e) for e in object]
        flattened_object[parent_key] = SCALAR_LIST_DELIMETER.join(stringified)

    def _flatten_object_list(self, flattened_object: dict, object: dict, parent_key: str):
        if self._is_list_of_ontology_objects(object):
            self._flatten_ontology_list(object, flattened_object, parent_key)
        else:
            self.flatten(object, parent_key)

    def _flatten_ontology_list(self, object: dict, flattened_object: dict, parent_key: str):
        keys = self._get_keys_of_a_list_of_object(object)
        for key in keys:
            flattened_object[f'{parent_key}.{key}'] = SCALAR_LIST_DELIMETER.join([elem[key] for elem in object])

    def _format_worksheet_name(self, worksheet_name):
        names = worksheet_name.split('.')
        names = [n.replace('_', ' ') for n in names]
        new_worksheet_name = MODULE_WORKSHEET_NAME_CONNECTOR.join([n.capitalize() for n in names])
        return new_worksheet_name

    def _is_list_of_objects(self, content):
        return content and isinstance(content[0], dict)

    def _is_list_of_ontology_objects(self, object: dict):
        first_elem = object[0] if object else {}
        result = [prop in first_elem for prop in ONTOLOGY_REQUIRED_PROPS]
        return all(result)

    def _get_keys_of_a_list_of_object(self, object: dict):
        first_elem = object[0] if object else {}
        return list(first_elem.keys())

    def _get_concrete_entity(self, content: dict):
        return content.get('describedBy').rsplit('/', 1)[-1]

Review thread on _flatten_list (mutating arguments vs. returning values):

Reviewer: Why are some of these methods returning values while others modify properties on the object without returning it? In general, I think it's bad practice for a function to modify an object it was given, since that can cause unexpected side effects. It would be better to copy the object, change it, and return it.

Author: These are private functions, so it shouldn't be a big concern. The objects need to be modified because that's part of the recursive logic. I'm not yet sure how to achieve your suggestion to copy the object, change it, and return it; I'll think about it. Could we leave that for later?

Reviewer: Yeah, no problem. I was just curious more than anything. Being private makes sense.

Second reviewer: I agree with @jacobwindsor that pure functions are an ideal to aspire to, since side effects can be a source of surprise and therefore bugs. Sometimes, when we make an explicit decision to have side effects, the function's name should make that clear.
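To illustrate the purer style raised in the thread above, a minimal sketch (not part of this PR; the function name is hypothetical) of a helper that returns a new mapping instead of mutating its argument:

    # Hypothetical side-effect-free variant of _flatten_scalar_list: it builds and
    # returns a new dict rather than writing into flattened_object in place.
    def flatten_scalar_list(scalar_list: list, parent_key: str) -> dict:
        stringified = [str(e) for e in scalar_list]
        return {parent_key: '||'.join(stringified)}

    # The caller then merges the result explicitly, making the data flow visible:
    # flattened_object = {**flattened_object, **flatten_scalar_list(values, full_key)}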
@@ -0,0 +1,126 @@ (new file: flattened project metadata in the headers/values structure, apparently an expected-output fixture)
{
  "Project": {
    "headers": [
      "project.uuid",
      "project.project_core.project_short_name",
      "project.project_core.project_title",
      "project.project_core.project_description",
      "project.insdc_project_accessions",
      "project.geo_series_accessions",
      "project.insdc_study_accessions"
    ],
    "values": [
      {
        "project.uuid": "3e329187-a9c4-48ec-90e3-cc45f7c2311c",
        "project.project_core.project_short_name": "kriegsteinBrainOrganoids",
        "project.project_core.project_title": "Establishing Cerebral Organoids as Models of Human-Specific Brain Evolution",
        "project.project_core.project_description": "Direct comparisons of human and non-human primate brain tissue have the potential to reveal molecular pathways underlying remarkable specializations of the human brain. However, chimpanzee tissue is largely inaccessible during neocortical neurogenesis when differences in brain size first appear. To identify human-specific features of cortical development, we leveraged recent innovations that permit generating pluripotent stem cell-derived cerebral organoids from chimpanzee. First, we systematically evaluated the fidelity of organoid models to primary human and macaque cortex, finding organoid models preserve gene regulatory networks related to cell types and developmental processes but exhibit increased metabolic stress. Second, we identified 261 genes differentially expressed in human compared to chimpanzee organoids and macaque cortex. Many of these genes overlap with human-specific segmental duplications and a subset suggest increased PI3K/AKT/mTOR activation in human outer radial glia. Together, our findings establish a platform for systematic analysis of molecular changes contributing to human brain development and evolution. Overall design: Single cell mRNA sequencing of iPS-derived neural and glial progenitor cells using the Fluidigm C1 system This series includes re-analysis of publicly available data in accessions: phs000989.v3, GSE99951, GSE86207, GSE75140. Sample metadata and accession IDs for the re-analyzed samples are included in the file \"GSE124299_metadata_on_processed_samples.xlsx\" available on the foot of this record. The following samples have no raw data due to data loss: GSM3569728, GSM3569738, GSM3571601, GSM3571606, GSM3571615, GSM3571621, GSM3571625, and GSM3571631",
        "project.insdc_project_accessions": "SRP180337",
        "project.geo_series_accessions": "GSE124299",
        "project.insdc_study_accessions": "PRJNA515930"
      }
    ]
  },
  "Project - Contributors": {
    "headers": [
      "project.contributors.name",
      "project.contributors.email",
      "project.contributors.institution",
      "project.contributors.laboratory",
      "project.contributors.country",
      "project.contributors.corresponding_contributor",
      "project.contributors.project_role.text",
      "project.contributors.project_role.ontology",
      "project.contributors.project_role.ontology_label"
    ],
    "values": [
      {
        "project.contributors.name": "Alex A,,Pollen",
        "project.contributors.email": "[email protected]",
        "project.contributors.institution": "University of California, San Francisco (UCSF)",
        "project.contributors.laboratory": "Department of Neurology",
        "project.contributors.country": "USA",
        "project.contributors.corresponding_contributor": "True",
        "project.contributors.project_role.text": "experimental scientist",
        "project.contributors.project_role.ontology": "EFO:0009741",
        "project.contributors.project_role.ontology_label": "experimental scientist"
      },
      {
        "project.contributors.name": "Parisa,,Nejad",
        "project.contributors.email": "[email protected]",
        "project.contributors.institution": "University of California, Santa Cruz",
        "project.contributors.laboratory": "Human Cell Atlas Data Coordination Platform",
        "project.contributors.country": "USA",
        "project.contributors.corresponding_contributor": "False",
        "project.contributors.project_role.text": "data wrangler",
        "project.contributors.project_role.ontology": "EFO:0009737",
        "project.contributors.project_role.ontology_label": "data curator"
      },
      {
        "project.contributors.name": "Schwartz,,Rachel",
        "project.contributors.email": "[email protected]",
        "project.contributors.institution": "University of California, Santa Cruz",
        "project.contributors.laboratory": "Human Cell Atlas Data Coordination Platform",
        "project.contributors.country": "USA",
        "project.contributors.corresponding_contributor": "False",
        "project.contributors.project_role.text": "data wrangler",
        "project.contributors.project_role.ontology": "EFO:0009737",
        "project.contributors.project_role.ontology_label": "data curator"
      }
    ]
  },
  "Project - Publications": {
    "headers": [
      "project.publications.authors",
      "project.publications.title",
      "project.publications.doi",
      "project.publications.pmid",
      "project.publications.url"
    ],
    "values": [
      {
        "project.publications.authors": "Pollen AA||Bhaduri A||Andrews MG||Nowakowski TJ||Meyerson OS||Mostajo-Radji MA||Di Lullo E||Alvarado B||Bedolli M||Dougherty ML||Fiddes IT||Kronenberg ZN||Shuga J||Leyrat AA||West JA||Bershteyn M||Lowe CB||Pavlovic BJ||Salama SR||Haussler D||Eichler EE||Kriegstein AR",
        "project.publications.title": "Establishing Cerebral Organoids as Models of Human-Specific Brain Evolution.",
        "project.publications.doi": "10.1016/j.cell.2019.01.017",
        "project.publications.pmid": "30735633",
        "project.publications.url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6544371/"
      }
    ]
  },
  "Project - Funders": {
    "headers": [
      "project.funders.grant_id",
      "project.funders.organization"
    ],
    "values": [
      {
        "project.funders.grant_id": "U01 MH105989",
        "project.funders.organization": "NIMH NIH HHS"
      },
      {
        "project.funders.grant_id": "R35 NS097305",
        "project.funders.organization": "NINDS NIH HHS"
      },
      {
        "project.funders.grant_id": "T32 HD007470",
        "project.funders.organization": "NICHD NIH HHS"
      },
      {
        "project.funders.grant_id": "T32 GM007266",
        "project.funders.organization": "NIGMS NIH HHS"
      },
      {
        "project.funders.grant_id": "F32 NS103266",
        "project.funders.organization": "NINDS NIH HHS"
      },
      {
        "project.funders.grant_id": "NA",
        "project.funders.organization": "Howard Hughes Medical Institute"
      },
      {
        "project.funders.grant_id": "P51 OD011132",
        "project.funders.organization": "NIH HHS"
      }
    ]
  }
}
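To make the mapping from nested metadata to this headers/values structure concrete, a small worked example of what Flattener.flatten produces; the input values are invented for illustration:

    # Illustrative input: a single ingest entity with nested content (values invented).
    entity = {
        'content': {
            'describedBy': 'https://schema.humancellatlas.org/type/project/17.0.0/project',
            'schema_type': 'project',
            'project_core': {
                'project_short_name': 'demoProject',
                'project_title': 'A demo project'
            }
        },
        'uuid': {'uuid': '0000aaaa-0000-0000-0000-000000000000'}
    }

    flattened = Flattener().flatten([entity])
    # flattened == {
    #     'Project': {
    #         'headers': ['project.uuid',
    #                     'project.project_core.project_short_name',
    #                     'project.project_core.project_title'],
    #         'values': [{'project.uuid': '0000aaaa-0000-0000-0000-000000000000',
    #                     'project.project_core.project_short_name': 'demoProject',
    #                     'project.project_core.project_title': 'A demo project'}]
    #     }
    # }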
Review thread on the dict/list type check:

Reviewer: Should there be an else path as well that raises an exception, to protect the client from unsupported use?

Author: There's no need for that; if you check where this function is being called, it can only be called with a dict or a list.

Reviewer: Could it be called by someone else, using different arguments?
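For reference, a minimal sketch of what the suggested guard could look like, assuming the discussion refers to the dict/list branches of _flatten_object; the else clause and the exception type are illustrative only, not part of this PR:

    def _flatten_object(self, object: dict, flattened_object: dict, parent_key: str = ''):
        if isinstance(object, dict):
            ...  # existing dict handling from this PR, unchanged
        elif isinstance(object, list):
            self._flatten_list(flattened_object, object, parent_key)
        else:
            # Illustrative guard: fail fast on scalars and other unsupported inputs
            # instead of silently doing nothing.
            raise ValueError(f'Cannot flatten value of type {type(object).__name__}')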