-
Notifications
You must be signed in to change notification settings - Fork 4
/
ClinTrials_parser_v01.py
104 lines (86 loc) · 3.86 KB
/
ClinTrials_parser_v01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env conda run -n ct_extract_env python
import pandas as pd
import os
import json
def parse_edges(data_folder):
filename = "ClinTrials_KG_edges_v01_3.csv"
filepath = os.path.join(data_folder, filename)
edges = pd.read_csv(filepath, sep='\t')
edges.rename(columns={'subject': 'disease', 'object': 'intervention', 'subject_name': 'disease_name', 'object_name': 'intervention_name'}, inplace=True)
edges.rename(columns={'disease': 'object', 'intervention': 'subject', 'disease_name': 'object_name', 'intervention_name': 'subject_name'}, inplace=True)
for index, row in edges.iterrows():
id_dict = {}
subject_dict = {}
association_dict = {}
object_dict = {}
source_dict = {}
id_dict["_id"] = "{}_{}_{}".format(row["nctid"].split("NCT")[1], row["subject"].split(':')[1], row["object"].split(':')[1])
subject_dict["{}".format(row["subject"].split(':')[0])] = "{}".format(row["subject"].split(':')[1])
subject_dict["name"] = row["subject_name"]
# subject_dict["{}_semantic_types".format(row["subject"].split(':')[0])] = "TBD" # fix in next version
subject_dict["type"] = "biolink:Treatment"
association_dict["predicate"] = "{}".format(row["predicate"].split(':')[1])
association_dict["edge_attributes"] = []
association_dict["edge_attributes"].append(
{"attribute_type_id":"clinicaltrials_id",
"value":row["nctid"]
}
)
association_dict["edge_attributes"].append(
{"attribute_type_id":"biolink:aggregator_knowledge_source",
"value":"infores:aact"}
)
association_dict["edge_attributes"].append(
{"attribute_type_id": "biolink:primary_knowledge_source",
"value": "infores:clinicaltrials"}
)
association_dict["edge_attributes"].append(
{"attribute_type_id": "biolink:aggregator_knowledge_source",
"value": "infores:biothings-multiomics-clinicaltrials"})
association_dict["edge_attributes"].append(
{
"attribute_type_id": "biolink:knowledge_level",
"value": "biolink:knowledge_assertion"
}
)
association_dict["edge_attributes"].append(
{
"attribute_type_id": "biolink:agent_type",
"value": "biolink:text_mining_agent"
}
)
object_dict["{}".format(row["object"].split(':')[0])] = "{}".format(row["object"].split(':')[1])
object_dict["name"] = row["object_name"]
object_dict["type"] = "biolink:DiseaseorPhenotypicFeature"
# object_dict["{}_semantic_types".format(row["object"].split(':')[0])] = "TBD" # fix in next version
source_dict["edge_sources"] = []
source_dict["edge_sources"].append(
{
"resource_id": "infores:biothings-multiomics-clinicaltrials",
"resource_role": "aggregator_knowledge_source"
}
)
source_dict["edge_sources"].append(
{
"resource_id": "infores:aact",
"resource_role": "aggregator_knowledge_source"
}
)
source_dict["edge_sources"].append(
{
"resource_id": "infores:clinicaltrials",
"resource_role": "primary_knowledge_source"
}
)
id_dict["subject"] = subject_dict
id_dict["association"] = association_dict
id_dict["object"] = object_dict
id_dict["source"] = source_dict
# print(json.dumps(id_dict,sort_keys=True, indent=2))
# yield the JSON one by one
yield id_dict # comment for testing
def main():
# data_folder = "../outputs/version_1" # uncomment for testing
parse_edges(data_folder) # uncomment for testing
if __name__ == "__main__":
main()