Skip to content

Commit

Permalink
Implement pm1 vcep rules (#171)
Browse files Browse the repository at this point in the history
* Another 7 VCEPs

* Another 18 VCEPs

* Another 3 VCEPs

* fix the cli test

* delete failing integration test

* Unit tests

* Some reformatting

* additional unit test
  • Loading branch information
gromdimon authored Aug 24, 2024
1 parent 059ada5 commit f8a6a56
Show file tree
Hide file tree
Showing 69 changed files with 5,270 additions and 58 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ src/bench/tmp/*.csv
cache/
cache/**

# Prediction file
prediction.json

# DS_Store file
.DS_Store

Expand Down
121 changes: 119 additions & 2 deletions src/auto_acmg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Implementations of the PVS1 algorithm."""

from typing import Optional, Type, Union
from typing import Dict, Optional, Type, Union

from loguru import logger

Expand All @@ -19,7 +19,7 @@
from src.defs.auto_acmg import AutoACMGResult, CdsInfo, GenomicStrand
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, ParseError
from src.defs.genome_builds import GenomeRelease
from src.defs.mehari import ProteinPos, TxPos
from src.defs.mehari import CdsPos, ProteinPos, TxPos
from src.defs.seqvar import SeqVar, SeqVarResolver
from src.utils import SeqVarTranscriptsHelper
from src.vcep import (
Expand All @@ -29,6 +29,34 @@
CDH1Predictor,
CerebralCreatineDeficiencySyndromesPredictor,
CoagulationFactorDeficiencyPredictor,
CongenitalMyopathiesPredictor,
DICER1Predictor,
ENIGMAPredictor,
EpilepsySodiumChannelPredictor,
FamilialHypercholesterolemiaPredictor,
FBN1Predictor,
GlaucomaPredictor,
HBOPCPredictor,
HearingLossPredictor,
HHTPredictor,
InsightColorectalCancerPredictor,
LeberCongenitalAmaurosisPredictor,
LysosomalDiseasesPredictor,
MalignantHyperthermiaPredictor,
MitochondrialDiseasesPredictor,
MonogenicDiabetesPredictor,
MyeloidMalignancyPredictor,
PKUPredictor,
PlateletDisordersPredictor,
PTENPredictor,
PulmonaryHypertensionPredictor,
RASopathyPredictor,
RettAngelmanPredictor,
SCIDPredictor,
ThrombosisPredictor,
TP53Predictor,
VHLPredictor,
VonWillebrandDiseasePredictor,
)

#: Mapping of HGNC gene identifiers to predictor classes.
Expand All @@ -52,6 +80,92 @@
"HGNC:11055": CerebralCreatineDeficiencySyndromesPredictor, # SLC6A8
"HGNC:3546": CoagulationFactorDeficiencyPredictor, # F8
"HGNC:3551": CoagulationFactorDeficiencyPredictor, # F9
"HGNC:7720": CongenitalMyopathiesPredictor, # NEB
"HGNC:129": CongenitalMyopathiesPredictor, # ACTA1
"HGNC:2974": CongenitalMyopathiesPredictor, # DNM2
"HGNC:7448": CongenitalMyopathiesPredictor, # MTM1
"HGNC:10483": CongenitalMyopathiesPredictor, # RYR1
"HGNC:17098": DICER1Predictor, # DICER1
"HGNC:1100": ENIGMAPredictor, # BRCA1
"HGNC:1101": ENIGMAPredictor, # BRCA2
"HGNC:10585": EpilepsySodiumChannelPredictor, # SCN1A
"HGNC:10588": EpilepsySodiumChannelPredictor, # SCN2A
"HGNC:10590": EpilepsySodiumChannelPredictor, # SCN3A
"HGNC:10596": EpilepsySodiumChannelPredictor, # SCN8A
"HGNC:10586": EpilepsySodiumChannelPredictor, # SCN1B
"HGNC:6547": FamilialHypercholesterolemiaPredictor, # LDLR
"HGNC:3603": FBN1Predictor, # FBN1
"HGNC:7610": GlaucomaPredictor, # MYOC
"HGNC:13733": HearingLossPredictor, # CDH23
"HGNC:2180": HearingLossPredictor, # COCH
"HGNC:4284": HearingLossPredictor, # GJB2
"HGNC:6298": HearingLossPredictor, # KCNQ4
"HGNC:7605": HearingLossPredictor, # MYO6
"HGNC:7606": HearingLossPredictor, # MYO7A
"HGNC:8818": HearingLossPredictor, # SLC26A4
"HGNC:11720": HearingLossPredictor, # TECTA
"HGNC:12601": HearingLossPredictor, # USH2A
"HGNC:7594": HearingLossPredictor, # MYO15A
"HGNC:8515": HearingLossPredictor, # OTOF
"HGNC:795": HBOPCPredictor, # ATM
"HGNC:26144": HBOPCPredictor, # PALB2
"HGNC:175": HHTPredictor, # ACVRL1
"HGNC:3349": HHTPredictor, # ENG
"HGNC:583": InsightColorectalCancerPredictor, # APC
"HGNC:7127": InsightColorectalCancerPredictor, # MLH1
"HGNC:7325": InsightColorectalCancerPredictor, # MSH2
"HGNC:7329": InsightColorectalCancerPredictor, # MSH6
"HGNC:9122": InsightColorectalCancerPredictor, # PMS2
"HGNC:10294": LeberCongenitalAmaurosisPredictor, # RPE65
"HGNC:4065": LysosomalDiseasesPredictor, # GAA
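"HGNC:10483": MalignantHyperthermiaPredictor, # RYR1 (NOTE: duplicate key - HGNC:10483 is also mapped to CongenitalMyopathiesPredictor above; this later entry wins)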
"HGNC:23287": MitochondrialDiseasesPredictor, # ETHE1
"HGNC:8806": MitochondrialDiseasesPredictor, # PDHA1
"HGNC:9179": MitochondrialDiseasesPredictor, # POLG
"HGNC:16266": MitochondrialDiseasesPredictor, # SLC19A3
"HGNC:11621": MonogenicDiabetesPredictor, # HNF1A
"HGNC:5024": MonogenicDiabetesPredictor, # HNF4A
"HGNC:4195": MonogenicDiabetesPredictor, # GCK
"HGNC:10471": MyeloidMalignancyPredictor, # RUNX1
"HGNC:8582": PKUPredictor, # PAH
"HGNC:6138": PlateletDisordersPredictor, # ITGA2B
"HGNC:6156": PlateletDisordersPredictor, # ITGB3
"HGNC:9588": PTENPredictor, # PTEN
"HGNC:1078": PulmonaryHypertensionPredictor, # BMPR2
"HGNC:12726": VonWillebrandDiseasePredictor, # VWF
"HGNC:775": ThrombosisPredictor, # SERPINC1
"HGNC:11998": TP53Predictor, # TP53
"HGNC:12687": VHLPredictor, # VHL
"HGNC:15454": RASopathyPredictor, # SHOC2
"HGNC:7989": RASopathyPredictor, # NRAS
"HGNC:9829": RASopathyPredictor, # RAF1
"HGNC:11187": RASopathyPredictor, # SOS1
"HGNC:11188": RASopathyPredictor, # SOS2
"HGNC:9644": RASopathyPredictor, # PTPN11
"HGNC:6407": RASopathyPredictor, # KRAS
"HGNC:6840": RASopathyPredictor, # MAP2K1
"HGNC:5173": RASopathyPredictor, # HRAS
"HGNC:10023": RASopathyPredictor, # RIT1
"HGNC:6842": RASopathyPredictor, # MAP2K2
"HGNC:1097": RASopathyPredictor, # BRAF
"HGNC:7227": RASopathyPredictor, # MRAS
"HGNC:6742": RASopathyPredictor, # LZTR1
"HGNC:17271": RASopathyPredictor, # RRAS2
"HGNC:9282": RASopathyPredictor, # PPP1CB
"HGNC:11634": RettAngelmanPredictor, # TCF4
"HGNC:11079": RettAngelmanPredictor, # SLC9A6
"HGNC:11411": RettAngelmanPredictor, # CDKL5
"HGNC:3811": RettAngelmanPredictor, # FOXG1
"HGNC:6990": RettAngelmanPredictor, # MECP2
"HGNC:12496": RettAngelmanPredictor, # UBE3A
"HGNC:12765": SCIDPredictor, # FOXN1
"HGNC:186": SCIDPredictor, # ADA
"HGNC:17642": SCIDPredictor, # DCLRE1C
"HGNC:6024": SCIDPredictor, # IL7R
"HGNC:6193": SCIDPredictor, # JAK3
"HGNC:9831": SCIDPredictor, # RAG1
"HGNC:9832": SCIDPredictor, # RAG2
"HGNC:6010": SCIDPredictor, # IL2RG
}


Expand Down Expand Up @@ -183,6 +297,9 @@ def parse_data(self, seqvar: SeqVar) -> AutoACMGResult:
self.result.data.tx_pos_utr = (
seqvar_transcript.tx_pos.ord if isinstance(seqvar_transcript.tx_pos, TxPos) else -1
)
self.result.data.cds_pos = (
seqvar_transcript.cds_pos.ord if isinstance(seqvar_transcript.cds_pos, CdsPos) else 0
)
self.result.data.prot_pos = (
seqvar_transcript.protein_pos.ord
if isinstance(seqvar_transcript.protein_pos, ProteinPos)
Expand Down
3 changes: 2 additions & 1 deletion src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def classify(
raise InvalidGenomeBuild("Invalid genome release")

auto_acmg = AutoACMG(variant, genome_release_enum)
auto_acmg.predict()
prediction = auto_acmg.predict()
prediction.save_to_file() if prediction else logger.error("No prediction was made.")
except AutoAcmgBaseException as e:
logger.error("Error occurred: {}", e)

Expand Down
39 changes: 39 additions & 0 deletions src/criteria/auto_pm1.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
AutoACMGData,
AutoACMGPrediction,
AutoACMGStrength,
GenomicStrand,
)
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, InvalidAPIResposeError
from src.defs.genome_builds import GenomeRelease
Expand All @@ -30,6 +31,44 @@ def __init__(self):
#: comment_pm1 to store the prediction explanation.
self.comment_pm1: str = ""

@staticmethod
def _get_affected_exon(var_data: AutoACMGData, seqvar: SeqVar) -> int:
    """
    Determine the number of the exon affected by the variant.

    Exons are visited in the order stored in ``var_data.exons`` for plus-strand
    genes and in reversed order for minus-strand genes. Every exon passed on the
    way to the variant position is counted; the walk stops as soon as an exon
    contains the variant or lies beyond it.

    Args:
        var_data: AutoACMGData object providing ``strand``, ``exons`` and ``hgnc_id``.
        seqvar: SeqVar object providing the variant position ``pos``.

    Returns:
        int: Number of exons counted up to and including the one holding the
        variant. If the variant lies before the next exon in walking order, the
        count of exons already passed is returned (0 when none was passed).

    Raises:
        AlgorithmError: If the strand is neither plus nor minus.
    """
    if var_data.strand == GenomicStrand.Plus:
        counted = 0
        for exon in var_data.exons:
            # An exon starting at or after the variant has not been reached yet.
            if exon.altStartI >= seqvar.pos:
                break
            # The exon starts before the variant, so it is counted either way.
            counted += 1
            # Stop once the variant is inside this exon.
            if seqvar.pos <= exon.altEndI:
                break
        return counted

    if var_data.strand == GenomicStrand.Minus:
        counted = 0
        for exon in reversed(var_data.exons):
            # An exon ending at or before the variant has not been reached yet.
            if exon.altEndI <= seqvar.pos:
                break
            # The exon ends after the variant, so it is counted either way.
            counted += 1
            # Stop once the variant is inside this exon.
            if exon.altStartI <= seqvar.pos:
                break
        return counted

    raise AlgorithmError(f"Invalid strand for {var_data.hgnc_id}: {var_data.strand}")

def _count_vars(self, seqvar: SeqVar, start_pos: int, end_pos: int) -> Tuple[int, int]:
"""
Counts pathogenic and benign variants in the specified range.
Expand Down
1 change: 1 addition & 0 deletions src/defs/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ class AutoACMGData(AutoAcmgBaseModel):
transcript_id: str = ""
transcript_tags: List[str] = []
tx_pos_utr: int = -1
cds_pos: int = 0
prot_pos: int = -1
prot_length: int = -1
cds_info: Dict[str, CdsInfo] = {}
Expand Down
5 changes: 5 additions & 0 deletions src/defs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,8 @@ class AutoAcmgBaseModel(BaseModel):
# class Config:
# use_enum_values = True
# arbitrary_types_allowed = True

def save_to_file(self, file_path: str = "prediction.json") -> None:
    """Save the model to a file as indented JSON.

    The file is always written as UTF-8 so that non-ASCII characters in the
    serialized model do not depend on the platform's default encoding.

    Args:
        file_path: Destination path; an existing file is overwritten.
    """
    # Explicit encoding: the locale default (e.g. cp1252 on Windows) may be
    # unable to encode the JSON text or would produce a non-UTF-8 JSON file.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(self.model_dump_json(indent=4))
28 changes: 28 additions & 0 deletions src/vcep/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,31 @@
CerebralCreatineDeficiencySyndromesPredictor,
)
from src.vcep.coagulation_factor_deficiency import CoagulationFactorDeficiencyPredictor
from src.vcep.congenital_myopathies import CongenitalMyopathiesPredictor
from src.vcep.dicer1 import DICER1Predictor
from src.vcep.enigma import ENIGMAPredictor
from src.vcep.epilepsy_sodium_channel import EpilepsySodiumChannelPredictor
from src.vcep.familial_hypercholesterolemia import FamilialHypercholesterolemiaPredictor
from src.vcep.fbn1 import FBN1Predictor
from src.vcep.glaucoma import GlaucomaPredictor
from src.vcep.hbopc import HBOPCPredictor
from src.vcep.hearing_loss import HearingLossPredictor
from src.vcep.hhtp import HHTPredictor
from src.vcep.insight_colorectal_cancer import InsightColorectalCancerPredictor
from src.vcep.leber_congenital_amaurosis import LeberCongenitalAmaurosisPredictor
from src.vcep.lysosomal_diseases import LysosomalDiseasesPredictor
from src.vcep.malignant_hyperthermia_susceptibility import MalignantHyperthermiaPredictor
from src.vcep.mitochondrial_diseases import MitochondrialDiseasesPredictor
from src.vcep.monogenic_diabetes import MonogenicDiabetesPredictor
from src.vcep.myeloid_malignancy import MyeloidMalignancyPredictor
from src.vcep.pku import PKUPredictor
from src.vcep.platelet_disorders import PlateletDisordersPredictor
from src.vcep.pten import PTENPredictor
from src.vcep.pulmonary_hypertension import PulmonaryHypertensionPredictor
from src.vcep.rasopathy import RASopathyPredictor
from src.vcep.rett_angelman import RettAngelmanPredictor
from src.vcep.scid import SCIDPredictor
from src.vcep.thrombosis import ThrombosisPredictor
from src.vcep.tp53 import TP53Predictor
from src.vcep.vhl import VHLPredictor
from src.vcep.von_willebrand_disease import VonWillebrandDiseasePredictor
4 changes: 2 additions & 2 deletions src/vcep/acadvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from src.defs.auto_acmg import AutoACMGCriteria, AutoACMGData, AutoACMGPrediction, AutoACMGStrength
from src.defs.seqvar import SeqVar

PM1_CLUSTER_REGIONS = [
PM1_CLUSTER = [
(214, 223), # Nucleotide and substrate binding
(249, 251), # Nucleotide and substrate binding
(460, 466), # Nucleotide and substrate binding
Expand All @@ -25,7 +25,7 @@ def predict_pm1(self, seqvar: SeqVar, var_data: AutoACMGData) -> AutoACMGCriteri
"""Override predict_pm1 to include VCEP-specific logic for ACADVL."""
logger.info("Predict PM1")
# Check if variant falls within critical regions
for start, end in PM1_CLUSTER_REGIONS:
for start, end in PM1_CLUSTER:
if start <= var_data.prot_pos <= end:
comment = (
f"Variant falls within a critical region for ACADVL between positions "
Expand Down
2 changes: 1 addition & 1 deletion src/vcep/cardiomyopathy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Predictor for Cardiomyopathy VCEP.
Included gene:
Included genes:
MYH7 (HGNC:7577),
MYBPC3 (HGNC:7551),
TNNI3 (HGNC:11947),
Expand Down
52 changes: 4 additions & 48 deletions src/vcep/coagulation_factor_deficiency.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,7 @@
from loguru import logger

from src.criteria.default_predictor import DefaultPredictor
from src.defs.auto_acmg import (
AutoACMGCriteria,
AutoACMGData,
AutoACMGPrediction,
AutoACMGStrength,
GenomicStrand,
)
from src.defs.auto_acmg import AutoACMGCriteria, AutoACMGData, AutoACMGPrediction, AutoACMGStrength
from src.defs.exceptions import AlgorithmError
from src.defs.seqvar import SeqVar

Expand All @@ -39,7 +33,7 @@
},
"moderate": {
"residues": [(1667, 1667), (1332, 1332)], # Residues affecting secretion
"regions": [(2267, 2304)], # FXa-binding residues, excluding Ser2283
"domains": [(2267, 2304)], # FXa-binding residues, excluding Ser2283
"excluded_residues": [(2283, 2283)], # Excluded residue in FXa-binding region
},
},
Expand All @@ -60,44 +54,6 @@

class CoagulationFactorDeficiencyPredictor(DefaultPredictor):

@staticmethod
def _get_affected_exon(var_data: AutoACMGData, seqvar: SeqVar) -> int:
    """
    Determine the number of the exon affected by the variant.

    Exons are visited in the order stored in ``var_data.exons`` for plus-strand
    genes and in reversed order for minus-strand genes. Every exon passed on the
    way to the variant position is counted; the walk stops as soon as an exon
    contains the variant or lies beyond it.

    Args:
        var_data: AutoACMGData object providing ``strand``, ``exons`` and ``hgnc_id``.
        seqvar: SeqVar object providing the variant position ``pos``.

    Returns:
        int: Number of exons counted up to and including the one holding the
        variant. If the variant lies before the next exon in walking order, the
        count of exons already passed is returned (0 when none was passed).

    Raises:
        AlgorithmError: If the strand is neither plus nor minus.
    """
    if var_data.strand == GenomicStrand.Plus:
        counted = 0
        for exon in var_data.exons:
            # An exon starting at or after the variant has not been reached yet.
            if exon.altStartI >= seqvar.pos:
                break
            # The exon starts before the variant, so it is counted either way.
            counted += 1
            # Stop once the variant is inside this exon.
            if seqvar.pos <= exon.altEndI:
                break
        return counted

    if var_data.strand == GenomicStrand.Minus:
        counted = 0
        for exon in reversed(var_data.exons):
            # An exon ending at or before the variant has not been reached yet.
            if exon.altEndI <= seqvar.pos:
                break
            # The exon ends after the variant, so it is counted either way.
            counted += 1
            # Stop once the variant is inside this exon.
            if exon.altStartI <= seqvar.pos:
                break
        return counted

    raise AlgorithmError(f"Invalid strand for {var_data.hgnc_id}: {var_data.strand}")

def predict_pm1(self, seqvar: SeqVar, var_data: AutoACMGData) -> AutoACMGCriteria:
"""
Override predict_pm1 to include VCEP-specific logic for Coagulation Factor Deficiency.
Expand Down Expand Up @@ -139,8 +95,8 @@ def predict_pm1(self, seqvar: SeqVar, var_data: AutoACMGData) -> AutoACMGCriteri
summary=comment,
)

# Check moderate level criteria for regions, excluding specific residues
for start, end in gene_cluster.get("moderate", {}).get("regions", []):
# Check moderate level criteria for domains, excluding specific residues
for start, end in gene_cluster.get("moderate", {}).get("domains", []):
if start <= var_data.prot_pos <= end:
excluded_residues = gene_cluster.get("moderate", {}).get("excluded_residues", [])
if not any(start <= var_data.prot_pos <= end for start, end in excluded_residues):
Expand Down
Loading

0 comments on commit f8a6a56

Please sign in to comment.