diff --git a/.vscode/launch.json b/.vscode/launch.json index a993065..cc012f7 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,7 +6,7 @@ "type": "debugpy", "request": "launch", "module": "src.cli", - "args": ["NM_000152.4:c.1A>G", "--genome-release", "grch38"], + "args": ["NM_000277.2(PAH):c.1278T>C", "--genome-release", "grch38"], "console": "integratedTerminal" } ] diff --git a/example_variants.txt b/example_variants.txt index 9eeddaf..7371c23 100644 --- a/example_variants.txt +++ b/example_variants.txt @@ -135,7 +135,13 @@ NM_000546.5:c.-28-5118_672+241dup (Exon 2-6 duplication) (Gene TP53) (From R - provem NOT in tandem: +PS1: +NM_004333.4(BRAF):c.739T>C (p.Phe247Leu) (Gene: BRAF) (Note: PS1) (From ClinGen) + PM4: NC_000011.10:2847957:C:G === NM_000218.2:c.1986C>G (p.Tyr662Ter) (Gene: KCNQ1) (Note: PM4) (From Recommendation 07.09.2018) NM_000314.6:c.1134_1135insCTGG (p.Tyr379LeufsTer4) (Gene: PTEN) (Note: PM4) (From Recommendation 07.09.2018) ### Debug NM_000152.4:c.2842_2483insT (p.Leu948SerfsTer70) (Gene: GAA) (Note: PM4) (From Recommendation 07.09.2018) ### Debug + +BA1: +NM_000277.2(PAH):c.1278T>C diff --git a/src/api/annonars.py b/src/api/annonars.py index b9786df..71be792 100644 --- a/src/api/annonars.py +++ b/src/api/annonars.py @@ -52,7 +52,7 @@ def get_variant_from_range( logger.exception("Validation failed: {}", e) raise AnnonarsException("Annonars returned non-validating data.") from e - def get_variant_info(self, seqvar: SeqVar) -> Optional[AnnonarsVariantResponse]: + def get_variant_info(self, seqvar: SeqVar) -> AnnonarsVariantResponse: """Get variant information from Annonars. Args: diff --git a/src/atuo_ba1_bs1_bs_2_pm2.py b/src/atuo_ba1_bs1_bs_2_pm2.py new file mode 100644 index 0000000..101fa55 --- /dev/null +++ b/src/atuo_ba1_bs1_bs_2_pm2.py @@ -0,0 +1,172 @@ +"""Implementation of BA1, BS1, BS2, PM2 prediction for sequence variants.""" + +import re +from typing import Optional + +from loguru import logger + +from src.api.annonars import AnnonarsClient +from src.core.config import Config +from src.defs.annonars_variant import AnnonarsVariantResponse, VariantResult +from src.defs.auto_acmg import BA1BS1BS2PM2 +from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, MissingDataError +from src.defs.genome_builds import GenomeRelease +from src.defs.seqvar import SeqVar + + +class AutoBA1BS1BS2PM2: + """Predicts BA1, BS1, BS2, PM2 criteria for sequence variants.""" + + def __init__( + self, seqvar: SeqVar, genome_release: GenomeRelease, *, config: Optional[Config] = None + ): + #: Configuration to use. + self.config = config or Config() + # Attributes to be set + self.seqvar = seqvar + self.genome_release = genome_release + self.annonars_client = AnnonarsClient(api_base_url=self.config.api_base_url_annonars) + self.prediction: BA1BS1BS2PM2 | None = None + + def _get_variant_info(self, seqvar: SeqVar) -> Optional[AnnonarsVariantResponse]: + """Get variant information from Annonars. + + Returns: + dict: Annonars response. + """ + try: + logger.debug("Getting variant information for {}.", seqvar) + return self.annonars_client.get_variant_info(seqvar) + except AutoAcmgBaseException as e: + logger.error("Failed to get variant information. Error: {}", e) + return None + + def evaluate_ba1(self, seqvar: SeqVar, variant_data: VariantResult) -> bool: + if ( + not variant_data.gnomad_exomes + or not variant_data.gnomad_exomes.alleleCounts + or not variant_data.gnomad_exomes.alleleCounts[0].raw + or not variant_data.gnomad_exomes.alleleCounts[0].raw.af + ): + raise MissingDataError("No allele counts found in variant data") + + chrom = seqvar.chrom + allele_freq = variant_data.gnomad_exomes.alleleCounts[0].raw.af + + if chrom.startswith("chrM"): + return allele_freq > 0.01 + else: + return allele_freq > 0.05 + + def evaluate_bs1(self, seqvar: SeqVar, variant_data: VariantResult, max_credible_freq) -> bool: + if ( + not variant_data.gnomad_exomes + or not variant_data.gnomad_exomes.alleleCounts + or not variant_data.gnomad_exomes.alleleCounts[0].byAncestryGroup + or not variant_data.gnomad_exomes.alleleCounts[0].raw + or not variant_data.gnomad_exomes.alleleCounts[0].raw.af + ): + raise MissingDataError("No allele counts found in variant data") + chrom = seqvar.chrom + faf = 0.0 + for group in variant_data.gnomad_exomes.alleleCounts[0].byAncestryGroup: + if group.faf95 and group.faf95 > faf: + faf = group.faf95 + + if chrom.startswith("chrM"): + return variant_data.gnomad_exomes.alleleCounts[0].raw.af > 0.005 + else: + return faf > max_credible_freq + + def evaluate_bs2(self, variant_data: VariantResult) -> bool: + if ( + not variant_data.gnomad_exomes + or not variant_data.gnomad_exomes.alleleCounts + or not variant_data.gnomad_exomes.alleleCounts[0].raw + or not variant_data.gnomad_exomes.alleleCounts[0].raw.ac + or not variant_data.gnomad_exomes.alleleCounts[0].raw.nhomalt + ): + raise MissingDataError("No allele counts found in variant data") + + gene_mode = "inplace" # Field to identify recessive/dominant/X-linked + allele_count = variant_data.gnomad_exomes.alleleCounts[0].raw.ac + + if gene_mode in ["recessive", "X-linked"]: + return allele_count > 2 + elif gene_mode == "dominant": + return allele_count > 5 + return False + + def evaluate_pm2(self, seqvar: SeqVar, variant_data: VariantResult) -> bool: + if ( + not variant_data.gnomad_exomes + or not variant_data.gnomad_exomes.alleleCounts + or not variant_data.gnomad_exomes.alleleCounts[0].raw + or not variant_data.gnomad_exomes.alleleCounts[0].raw.af + or not variant_data.gnomad_exomes.alleleCounts[0].raw.ac + or not variant_data.gnomad_exomes.alleleCounts[0].raw.nhomalt + ): + raise MissingDataError("No allele counts found in variant data") + + chrom = seqvar.chrom + allele_freq = variant_data.gnomad_exomes.alleleCounts[0].raw.af + gene_mode = "inplace" # Field to identify recessive/dominant/X-linked + + if chrom.startswith("chrM"): + return allele_freq < 0.00002 + else: + if gene_mode in ["recessive", "X-linked"]: + allele_count = variant_data.gnomad_exomes.alleleCounts[0].raw.ac + if allele_count <= 4: + return True + elif gene_mode == "dominant": + homozygous_count = variant_data.gnomad_exomes.alleleCounts[0].raw.nhomalt + if homozygous_count <= 1: + return True + return allele_freq < 0.0001 + + def predict(self) -> Optional[BA1BS1BS2PM2]: + """ + Predicts the BA1, BS1, BS2, PM5 criteria for the sequence variant. + + + Note: + Rules: + BA1: Allele frequency is >5% in Exome Sequencing Project, 1000 Genomes Project, or Exome + Aggregation Consortium. + + BS1: Allele frequency greater than expected for disorder. + + BS2: Observed in a healthy adult individual for a recessive (homozygous), dominant + (heterozygous), or X-linked (hemizygous) disorder, with full penetrance expected at an + early age. + + + PM2: Absent from controls (or at extremely low frequency if recessive) in Exome + Sequencing Project, 1000 Genomes or ExAC. + + Returns: + BA1BS1BS2PM2: The prediction result. + """ + try: + # Get variant information + variant_info = self._get_variant_info(self.seqvar) + if not variant_info: + raise MissingDataError("No variant information retrieved") + + # Instantiate the prediction result + self.prediction = BA1BS1BS2PM2() + + # Evaluate each criterion + self.prediction.BA1 = self.evaluate_ba1(self.seqvar, variant_info.result) + self.prediction.BS1 = self.evaluate_bs1( + self.seqvar, variant_info.result, max_credible_freq=0.01 + ) # Adjust max_credible_freq as needed + self.prediction.BS2 = self.evaluate_bs2(variant_info.result) + self.prediction.PM2 = self.evaluate_pm2(self.seqvar, variant_info.result) + except AutoAcmgBaseException as e: + logger.error("Error occurred during BA1, BS1, BS2, PM5 prediction. Error: {}", e) + self.prediction = None + + # Return the prediction result + return self.prediction diff --git a/src/atuo_ba1_bs1_bs_2_ps4_pm2.py b/src/atuo_ba1_bs1_bs_2_ps4_pm2.py deleted file mode 100644 index a286100..0000000 --- a/src/atuo_ba1_bs1_bs_2_ps4_pm2.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Implementation of BA1, BS1, BS2, PS4, PM2 prediction for sequence variants.""" - -import re -from typing import Optional - -from loguru import logger - -from src.api.annonars import AnnonarsClient -from src.core.config import Config -from src.defs.annonars_variant import AnnonarsVariantResponse, VariantResult -from src.defs.auto_acmg import BA1BS1BS2PS4PM2 -from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, MissingDataError -from src.defs.genome_builds import GenomeRelease -from src.defs.seqvar import SeqVar - - -class AutoBA1BS1BS2PS4PM2: - """Predicts BA1, BS1, BS2, PS4, PM2 criteria for sequence variants.""" - - def __init__( - self, seqvar: SeqVar, genome_release: GenomeRelease, *, config: Optional[Config] = None - ): - #: Configuration to use. - self.config = config or Config() - # Attributes to be set - self.seqvar = seqvar - self.genome_release = genome_release - self.annonars_client = AnnonarsClient(api_base_url=self.config.api_base_url_annonars) - self.prediction: BA1BS1BS2PS4PM2 | None = None - - def _get_variant_info(self, seqvar: SeqVar) -> Optional[AnnonarsVariantResponse]: - """Get variant information from Annonars. - - Returns: - dict: Annonars response. - """ - try: - logger.debug("Getting variant information for {}.", seqvar) - return self.annonars_client.get_variant_info(seqvar) - except AutoAcmgBaseException as e: - logger.error("Failed to get variant information. Error: {}", e) - return None - - def predict(self) -> Optional[BA1BS1BS2PS4PM2]: - """ - Predicts the BA1, BS1, BS2, PS4, PM5 criteria for the sequence variant. - - Returns: - BA1BS1BS2PS4PM2: The prediction result. - """ - - # Return the prediction result - return self.prediction diff --git a/src/auto_acmg.py b/src/auto_acmg.py index 6a1b6ee..808ea0c 100644 --- a/src/auto_acmg.py +++ b/src/auto_acmg.py @@ -4,6 +4,7 @@ from loguru import logger +from src.atuo_ba1_bs1_bs_2_pm2 import AutoBA1BS1BS2PM2 from src.auto_ps1_pm5 import AutoPS1PM5 from src.core.config import Config from src.defs.auto_pvs1 import ( @@ -160,6 +161,36 @@ def predict(self): except AutoAcmgBaseException as e: logger.error("Failed to predict PS1 and PM5 criteria. Error: {}", e) + # BA1, BS1, BS2, PM2 + try: + logger.info("Predicting BA1, BS1, BS2, and PM2.") + ba1bs1bs2pm2 = AutoBA1BS1BS2PM2( + self.seqvar, self.genome_release, config=self.config + ) + ba1bs1bs2pm2_prediction = ba1bs1bs2pm2.predict() + if not ba1bs1bs2pm2_prediction: + logger.error("Failed to predict BA1, BS1, BS2, and PM2 criteria.") + else: + self.seqvar_ba1, self.seqvar_bs1, self.seqvar_bs2, self.seqvar_pm2 = ( + ba1bs1bs2pm2_prediction.BA1, + ba1bs1bs2pm2_prediction.BS1, + ba1bs1bs2pm2_prediction.BS2, + ba1bs1bs2pm2_prediction.PM2, + ) + logger.info( + "BA1 prediction for {}: {}.\n" + "BS1 prediction: {}.\n" + "BS2 prediction: {}.\n" + "PM2 prediction: {}.", + self.seqvar.user_repr, + self.seqvar_ba1, + self.seqvar_bs1, + self.seqvar_bs2, + self.seqvar_pm2, + ) + except AutoAcmgBaseException as e: + logger.error("Failed to predict BA1, BS1, BS2, and PM2 criteria. Error: {}", e) + elif isinstance(variant, StrucVar): logger.info("Currently only PVS1 prediction is implemented for structural variants!") logger.info( diff --git a/src/auto_ps1_pm5.py b/src/auto_ps1_pm5.py index 71e7771..e678d56 100644 --- a/src/auto_ps1_pm5.py +++ b/src/auto_ps1_pm5.py @@ -58,6 +58,9 @@ def _parse_HGVSp(pHGVSp: str) -> Optional[AminoAcid]: AminoAcid: The new amino acid if the pHGVSp is valid, None otherwise. """ try: + # If multiple pHGVSp values are present, take the first one + if ";" in pHGVSp: + pHGVSp = pHGVSp.split(";")[0] match = re.match(REGEX_HGVSP, pHGVSp) if match: return AminoAcid[match.group(3)] @@ -77,28 +80,20 @@ def _is_pathogenic(variant_info: VariantResult) -> bool: Returns: bool: True if the variant is pathogenic """ - if ( - variant_info.clinvar - and variant_info.clinvar.referenceAssertions - and variant_info.clinvar.referenceAssertions[0].clinicalSignificance - ): - return ( - variant_info.clinvar.referenceAssertions[0].clinicalSignificance - == "CLINICAL_SIGNIFICANCE_PATHOGENIC" - ) + if variant_info.clinvar and variant_info.clinvar.referenceAssertions: + for refAssertion in variant_info.clinvar.referenceAssertions: + if refAssertion.clinicalSignificance and refAssertion.clinicalSignificance in [ + "CLINICAL_SIGNIFICANCE_PATHOGENIC", + "CLINICAL_SIGNIFICANCE_LIKELY_PATHOGENIC", + ]: + return True return False def predict(self) -> Optional[PS1PM5]: """ Predicts the criteria PS1 and PM5 for the provided sequence variant. - Note: - Rule: - PS1: Same amino acid change as a previously established pathogenic variant regardless of nucleotide change. - PM5: Novel missense change at an amino acid residue where a different missense change determined to be - pathogenic has been seen before. - - Implementation: + Implementation of the rule PS1 and PM5: The method implements the rule by: - Getting the primary variant information & parsing the primary amino acid change. - Iterating over all possible alternative bases & getting the alternative variant information. @@ -108,6 +103,12 @@ def predict(self) -> Optional[PS1PM5]: - If the alternative variant is pathogenic and the amino acid change is different from the primary variant, then PM5 is set to True. + Note: + Rules: + PS1: Same amino acid change as a previously established pathogenic variant regardless of nucleotide change. + PM5: Novel missense change at an amino acid residue where a different missense change determined to be + pathogenic has been seen before. + Returns: PS1PM5: The prediction result. """ diff --git a/src/defs/annonars_variant.py b/src/defs/annonars_variant.py index 514b3d4..4403b33 100644 --- a/src/defs/annonars_variant.py +++ b/src/defs/annonars_variant.py @@ -6,7 +6,7 @@ from typing import Any, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel class VariantQuery(BaseModel): @@ -17,6 +17,10 @@ class VariantQuery(BaseModel): alternative: str +class GnomadExomes(BaseModel): + alleleCounts: List[AlleleCount] + + class Dbnsfp(BaseModel): HGVSc_ANNOVAR: Optional[str] = None HGVSp_ANNOVAR: Optional[str] = None @@ -122,7 +126,7 @@ class VariantResult(BaseModel): dbnsfp: Optional[Dbnsfp] = None dbscsnv: Optional[Any] = None gnomad_mtdna: Optional[Any] = None - gnomad_exomes: Optional[Any] = None + gnomad_exomes: Optional[GnomadExomes] = None gnomad_genomes: Optional[GnomadGenomes] = None helixmtdb: Optional[Any] = None ucsc_conservation: Optional[Any] = None diff --git a/src/defs/auto_acmg.py b/src/defs/auto_acmg.py index 488bdc3..de5c0d3 100644 --- a/src/defs/auto_acmg.py +++ b/src/defs/auto_acmg.py @@ -57,11 +57,10 @@ class PS1PM5(BaseModel): PM5: bool = False -class BA1BS1BS2PS4PM2(BaseModel): - """BA1, BS1, BS2, PS4, and PM2 criteria prediction.""" +class BA1BS1BS2PM2(BaseModel): + """BA1, BS1, BS2, and PM2 criteria prediction.""" BA1: bool = False BS1: bool = False BS2: bool = False - PS4: bool = False PM2: bool = False