Skip to content

Commit

Permalink
Generate Avro schema from AnVIL schema (#6109)
Browse files: browse the repository at this point in the history
  • Loading branch information
nadove-ucsc authored and achave11-ucsc committed Jun 10, 2024
1 parent 1fd4713 commit 437574f
Show file tree
Hide file tree
Showing 3 changed files with 760 additions and 112 deletions.
86 changes: 86 additions & 0 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from collections import (
defaultdict,
)
from operator import (
itemgetter,
)
from typing import (
Iterable,
Optional,
Expand All @@ -9,6 +12,7 @@
)

from azul import (
JSON,
config,
iif,
)
Expand Down Expand Up @@ -38,6 +42,9 @@
DonorTransformer,
FileTransformer,
)
from azul.plugins.metadata.anvil.schema import (
anvil_schema,
)
from azul.plugins.metadata.anvil.service.aggregation import (
AnvilAggregationStage,
AnvilSummaryAggregationStage,
Expand All @@ -49,10 +56,14 @@
AnvilSearchResponseStage,
AnvilSummaryResponseStage,
)
from azul.service.avro_pfb import (
avro_pfb_schema,
)
from azul.service.manifest_service import (
ManifestFormat,
)
from azul.types import (
AnyMutableJSON,
MutableJSON,
)

Expand Down Expand Up @@ -290,6 +301,81 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath):
recurse(self._field_mapping, ())
return result

def verbatim_pfb_schema(self,
                        replicas: Iterable[JSON]
                        ) -> tuple[Iterable[JSON], Sequence[str], JSON]:
    """
    Build the PFB schema for verbatim replicas from the AnVIL schema.

    One record schema is produced for every table in
    ``anvil_schema['tables']``, visiting the tables in order of their name.
    The fields of each record are sorted by field name.

    :param replicas: returned to the caller as is; this method does not
                     iterate or inspect it

    :return: a tuple of the given replicas, the table names in sorted
             order, and the result of passing the record schemas to
             ``avro_pfb_schema``
    """
    by_name = itemgetter('name')
    make_field = self._pfb_schema_from_anvil_column
    table_names = []
    records = []
    for table in sorted(anvil_schema['tables'], key=by_name):
        name = table['name']
        table_names.append(name)
        # Every field of `anvil_dataset` is declared polymorphic.
        # NOTE(review): the original local was named `is_duos_type`,
        # presumably because DUOS replicas share this entity type — confirm.
        polymorphic = name == 'anvil_dataset'
        # These fields are added explicitly, in addition to the columns
        # declared by the table.
        fields = [
            make_field(table_name=name,
                       column_name='datarepo_row_id',
                       anvil_datatype='string',
                       is_optional=False,
                       is_polymorphic=polymorphic)
        ]
        if polymorphic:
            fields.append(make_field(table_name=name,
                                     column_name='description',
                                     anvil_datatype='string',
                                     is_polymorphic=True))
        elif name == 'anvil_file':
            fields.append(make_field(table_name=name,
                                     column_name='drs_uri',
                                     anvil_datatype='string'))
        fields.extend(
            make_field(table_name=name,
                       column_name=column['name'],
                       anvil_datatype=column['datatype'],
                       is_array=column['array_of'],
                       is_optional=not column['required'],
                       is_polymorphic=polymorphic)
            for column in table['columns']
        )
        records.append({
            'name': name,
            'type': 'record',
            'fields': sorted(fields, key=by_name)
        })
    return replicas, table_names, avro_pfb_schema(records)

def _pfb_schema_from_anvil_column(self,
                                  *,
                                  table_name: str,
                                  column_name: str,
                                  anvil_datatype: str,
                                  is_array: bool = False,
                                  is_optional: bool = True,
                                  is_polymorphic: bool = False
                                  ) -> AnyMutableJSON:
    """
    Translate the description of one AnVIL column into a field schema.

    :param table_name: used as the ``namespace`` of the field

    :param column_name: used as the ``name`` of the field

    :param anvil_datatype: one of ``boolean``, ``float``, ``integer``,
                           ``string`` or ``fileref``; any other value
                           raises a ``KeyError``

    :param is_array: if true, the field type is an array whose items have
                     the (possibly nullable) primitive type

    :param is_optional: if true, the primitive type is wrapped in a union
                        with ``null``. For arrays this makes the *items*
                        nullable, not the array itself.

    :param is_polymorphic: if true, the field type as a whole is made
                           nullable, unless it already is a union with
                           ``null`` at the top level

    :return: a dictionary with the keys ``name``, ``namespace`` and
             ``type``
    """
    pfb_primitives = {
        'boolean': 'boolean',
        'float': 'double',
        'integer': 'long',
        'string': 'string',
        'fileref': 'string'
    }
    primitive = pfb_primitives[anvil_datatype]
    element = ['null', primitive] if is_optional else primitive
    field_type = {'type': 'array', 'items': element} if is_array else element
    # An optional scalar is already a top-level union with null; in every
    # other case a polymorphic field needs the outer union added.
    already_nullable = is_optional and not is_array
    if is_polymorphic and not already_nullable:
        field_type = ['null', field_type]
    return {
        'name': column_name,
        'namespace': table_name,
        'type': field_type,
    }

def document_slice(self, entity_type: str) -> Optional[DocumentSlice]:
    """
    Return the document slice for the given entity type.

    This implementation returns ``None`` for every entity type; the
    argument is not inspected.
    """
    return None

Expand Down
81 changes: 53 additions & 28 deletions test/service/data/verbatim/pfb_entities.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,34 @@
"object": {
"misc": {},
"nodes": [
{
"links": [],
"name": "anvil_activity",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_alignmentactivity",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_antibody",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_assayactivity",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_biosample",
Expand Down Expand Up @@ -40,12 +68,26 @@
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_project",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_sequencingactivity",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "anvil_variantcallingactivity",
"ontology_reference": "",
"properties": [],
"values": {}
}
]
},
Expand All @@ -65,8 +107,7 @@
"principal_investigator": null,
"registered_identifier": null,
"source_datarepo_row_ids": null,
"title": null,
"version": "2022-06-01T00:00:00.000000Z"
"title": null
},
"relations": []
},
Expand All @@ -87,8 +128,7 @@
],
"used_biosample_id": [
"f9d40cf6-37b8-22f3-ce35-0dc614d2452b"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand All @@ -113,16 +153,14 @@
],
"source_datarepo_row_ids": [
"sample:98048c3b-2525-4090-94fd-477de31f2608"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
{
"id": "6b0f6c0f-5d80-4242-accb-840921351cd5",
"name": "anvil_file",
"object": {
"crc32": "",
"data_modality": [],
"datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5",
"drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1fab11f5-7eab-4318-9a58-68d8d06e0715",
Expand All @@ -134,11 +172,9 @@
"file_size": 15079345,
"is_supplementary": true,
"reference_assembly": [],
"sha256": "",
"source_datarepo_row_ids": [
"file_inventory:04ff3af2-0543-4ea6-830a-d31b957fa2ee"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand All @@ -164,8 +200,7 @@
],
"source_datarepo_row_ids": [
"subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand All @@ -186,8 +221,7 @@
],
"used_biosample_id": [
"f9d40cf6-37b8-22f3-ce35-0dc614d2452b"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand All @@ -213,8 +247,7 @@
],
"source_datarepo_row_ids": [
"subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand Down Expand Up @@ -242,16 +275,14 @@
"source_datarepo_row_ids": [
"workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc"
],
"title": "ANVIL_CMG_UWASH_DS_BDIS",
"version": "2022-06-01T00:00:00.000000Z"
"title": "ANVIL_CMG_UWASH_DS_BDIS"
},
"relations": []
},
{
"id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
"name": "anvil_file",
"object": {
"crc32": "",
"data_modality": [],
"datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
"drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67",
Expand All @@ -263,11 +294,9 @@
"file_size": 213021639,
"is_supplementary": false,
"reference_assembly": [],
"sha256": "",
"source_datarepo_row_ids": [
"file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
Expand All @@ -286,16 +315,14 @@
],
"source_datarepo_row_ids": [
"subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
},
{
"id": "3b17377b-16b1-431c-9967-e5d01fc5923f",
"name": "anvil_file",
"object": {
"crc32": "",
"data_modality": [],
"datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f",
"drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37",
Expand All @@ -307,11 +334,9 @@
"file_size": 3306845592,
"is_supplementary": false,
"reference_assembly": [],
"sha256": "",
"source_datarepo_row_ids": [
"file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2"
],
"version": "2022-06-01T00:00:00.000000Z"
]
},
"relations": []
}
Expand Down
Loading

0 comments on commit 437574f

Please sign in to comment.