Skip to content

Commit

Permalink
Updated gene_info transform with many columns renamed as specified in AG-1161
Browse files Browse the repository at this point in the history
  • Loading branch information
jaclynbeck-sage committed Aug 23, 2023
1 parent fef64d8 commit 40d75ef
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 29 deletions.
14 changes: 8 additions & 6 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@
column_rename:
ensg: ensembl_gene_id
ensembl_id: ensembl_gene_id
geneid: ensembl_gene_id
has_eqtl: is_eqtl
minimumlogcpm: min
quartile1logcpm: first_quartile
medianlogcpm: median
meanlogcpm: mean
quartile3logcpm: third_quartile
maximumlogcpm: max
provenance:
- syn25953363.6
- syn12514826.4
Expand All @@ -145,13 +153,7 @@
- syn44151254.1
- syn51942280.2
agora_rename:
has_eqtl: haseqtl
is_igap: isIGAP
symbol: hgnc_symbol
protein_in_ad_brain_change: isAnyProteinChangedInADBrain
rna_in_ad_brain_change: isAnyRNAChangedInADBrain
median_expression: medianexpression
nominated_target: nominatedtarget
destination: *dest

- team_info:
Expand Down
33 changes: 16 additions & 17 deletions src/agoradatatools/etl/transform/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def transform_gene_info(

# these are the interesting columns of the druggability dataset
useful_columns = [
"geneid",
"ensembl_gene_id",
"sm_druggability_bucket",
"safety_bucket",
"abability_bucket",
Expand All @@ -60,17 +60,16 @@ def transform_gene_info(
druggability = druggability[useful_columns]

target_list = nest_fields(
df=target_list, grouping="ensembl_gene_id", new_column="nominated_target"
df=target_list, grouping="ensembl_gene_id", new_column="target_nominations", drop_columns=["ensembl_gene_id"]
)

median_expression = nest_fields(
df=median_expression, grouping="ensembl_gene_id", new_column="median_expression"
df=median_expression, grouping="ensembl_gene_id", new_column="median_expression", drop_columns=["ensembl_gene_id"]
)

druggability = nest_fields(
df=druggability, grouping="geneid", new_column="druggability"
df=druggability, grouping="ensembl_gene_id", new_column="druggability", drop_columns=["ensembl_gene_id"]
)
druggability.rename(columns={"geneid": "ensembl_gene_id"}, inplace=True)

biodomains = (
biodomains.groupby("ensembl_gene_id")["biodomain"]
Expand Down Expand Up @@ -124,7 +123,7 @@ def transform_gene_info(
gene_info.fillna(
{
"is_igap": False,
"has_eqtl": False,
"is_eqtl": False,
"adj_p_val": -1,
"cor_pval": -1,
"is_adi": False,
Expand All @@ -142,19 +141,19 @@ def transform_gene_info(
)

gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
gene_info["rna_in_ad_brain_change"] = (
gene_info["is_any_rna_changed_in_ad_brain"] = (
gene_info["adj_p_val"] <= adjusted_p_value_threshold
) & gene_info["rna_brain_change_studied"]

gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
gene_info["protein_in_ad_brain_change"] = (
gene_info["is_any_protein_changed_in_ad_brain"] = (
gene_info["cor_pval"] <= protein_level_threshold
) & gene_info["protein_brain_change_studied"]

# create 'nominations' field
gene_info["nominations"] = gene_info.apply(
lambda row: len(row["nominated_target"])
if isinstance(row["nominated_target"], list)
# create 'total_nominations' field
gene_info["total_nominations"] = gene_info.apply(
lambda row: len(row["target_nominations"])
if isinstance(row["target_nominations"], list)
else np.NaN,
axis=1,
)
Expand All @@ -168,15 +167,15 @@ def transform_gene_info(
"symbol",
"alias",
"is_igap",
"has_eqtl",
"rna_in_ad_brain_change",
"is_eqtl",
"is_any_rna_changed_in_ad_brain",
"rna_brain_change_studied",
"protein_in_ad_brain_change",
"is_any_protein_changed_in_ad_brain",
"protein_brain_change_studied",
"nominated_target",
"target_nominations",
"median_expression",
"druggability",
"nominations",
"total_nominations",
"biodomains",
"is_adi",
"is_tep",
Expand Down
14 changes: 8 additions & 6 deletions test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@
column_rename:
ensg: ensembl_gene_id
ensembl_id: ensembl_gene_id
geneid: ensembl_gene_id
has_eqtl: is_eqtl
minimumlogcpm: min
quartile1logcpm: first_quartile
medianlogcpm: median
meanlogcpm: mean
quartile3logcpm: third_quartile
maximumlogcpm: max
provenance:
- syn25953363.6
- syn12514826.4
Expand All @@ -145,13 +153,7 @@
- syn44151254.1
- syn51942280.2
agora_rename:
has_eqtl: haseqtl
is_igap: isIGAP
symbol: hgnc_symbol
protein_in_ad_brain_change: isAnyProteinChangedInADBrain
rna_in_ad_brain_change: isAnyRNAChangedInADBrain
median_expression: medianexpression
nominated_target: nominatedtarget
destination: *dest

- team_info:
Expand Down

0 comments on commit 40d75ef

Please sign in to comment.