Skip to content

Commit

Permalink
Finish AutoPM1 (#133)
Browse files Browse the repository at this point in the history
* uniprot domain data

* wip

* tests

* wip

* integration tests

* docs

* fix the test

* rewind the tests
  • Loading branch information
gromdimon authored Jun 27, 2024
1 parent 45c89e3 commit c6b4933
Show file tree
Hide file tree
Showing 19 changed files with 313 additions and 30 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ tests/assets/**.json filter=lfs diff=lfs merge=lfs -text
tests/assets/**/*.json filter=lfs diff=lfs merge=lfs -text
lib/rmsk/**/*.gz filter=lfs diff=lfs merge=lfs -text
lib/rmsk/**/*.tbi filter=lfs diff=lfs merge=lfs -text
lib/uniprot/**/*.gz filter=lfs diff=lfs merge=lfs -text
lib/uniprot/**/*.tbi filter=lfs diff=lfs merge=lfs -text
tests/e2e/cassettes/**/* filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"type": "debugpy",
"request": "launch",
"module": "src.cli",
"args": ["NM_000277.2(PAH):c.707-7A>T", "--genome-release", "grch37"],
"args": ["chr1:1234:A:G", "--genome-release", "grch37"],
"console": "integratedTerminal"
}
]
Expand Down
Empty file added lib/uniprot/__init__.py
Empty file.
Empty file added lib/uniprot/grch37/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions lib/uniprot/grch37/uniprot.bed.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions lib/uniprot/grch37/uniprot.bed.gz.tbi
Git LFS file not shown
Empty file added lib/uniprot/grch38/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions lib/uniprot/grch38/uniprot.bed.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions lib/uniprot/grch38/uniprot.bed.gz.tbi
Git LFS file not shown
27 changes: 18 additions & 9 deletions src/criteria/auto_ba1_bs1_bs2_pm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,24 @@ def __init__(
#: Comment to store the prediction explanation.
self.comment: str = ""

def _get_control_af(self, variant_data: VariantResult) -> AlleleCount:
def _get_control_af(self, variant_data: VariantResult) -> Optional[AlleleCount]:
"""
Get the allele frequency for the control population.
Get the allele frequency information for the control population.
Args:
variant_data: The variant data.
Returns:
The allele frequency for the control population.
The allele frequency for the control population. None if no data found.
"""
if not variant_data.gnomad_exomes or not variant_data.gnomad_exomes.alleleCounts:
raise MissingDataError("No allele counts found in variant data")
for af in variant_data.gnomad_exomes.alleleCounts:
if af.cohort == "controls":
return af
raise MissingDataError("No controls allele counts found in variant data.")
return None

def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> Optional[float]:
"""
Get the allele frequency for the sequence variant.
Expand All @@ -66,7 +66,7 @@ def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
variant_data: The variant data.
Returns:
The allele frequency.
The allele frequency. None if no controls data
"""
if seqvar.chrom.startswith("M"):
if not variant_data.gnomad_mtdna or not variant_data.gnomad_mtdna.afHet:
Expand All @@ -80,6 +80,9 @@ def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
return variant_data.gnomad_mtdna.afHet
else:
controls_af = self._get_control_af(variant_data)
if not controls_af:
self.comment += "No controls allele frequency data found."
return None
if (
not controls_af.bySex
or not controls_af.bySex.overall
Expand Down Expand Up @@ -189,7 +192,7 @@ def _check_zygosity(self, seqvar: SeqVar, variant_data: VariantResult) -> bool:
allele_condition = self._get_allele_condition(seqvar)
self.comment += f"Allele condition: {allele_condition.name}.\n"
controls_af = self._get_control_af(variant_data)
if not controls_af.bySex:
if not controls_af or not controls_af.bySex:
self.comment += "No controls allele data found in control data.\n"
raise MissingDataError("No raw data found in control data.")

Expand Down Expand Up @@ -290,15 +293,21 @@ def predict(self) -> Tuple[Optional[BA1BS1BS2PM2], str]:

self.comment += "Check allele frequency for the control population.\n"
af = self._get_af(self.seqvar, self.variant_info)
if af > 0.05:
if not af:
self.comment += "No controls: PM2 is met."
self.prediction.PM2 = True
elif af > 0.05:
self.comment += "Allele frequency > 5%: BA1 is met."
self.prediction.BA1 = True
elif af >= 0.01:
self.comment += "Allele frequency > 1%: BS1 is met."
self.prediction.BS1 = True
else:
self.comment += "Allele frequency <= 1%: PM2 is met."
self.prediction.PM2 = True

self.comment += "Check zygosity.\n"
if af >= 0.01 and self._check_zygosity(self.seqvar, self.variant_info):
if af and af >= 0.01 and self._check_zygosity(self.seqvar, self.variant_info):
self.prediction.BS2 = True

except AutoAcmgBaseException as e:
Expand Down
33 changes: 27 additions & 6 deletions src/criteria/auto_pm1.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Implementation of PM1 criteria."""

import os
from typing import Optional, Tuple

import tabix # type: ignore
from loguru import logger

from src.api.annonars import AnnonarsClient
from src.core.config import Config
from src.core.config import Config, settings
from src.defs.annonars_variant import VariantResult
from src.defs.auto_acmg import PM1
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, InvalidAPIResposeError
Expand Down Expand Up @@ -82,10 +84,29 @@ def _count_pathogenic_variants(
logger.error("Failed to get variant from range. No ClinVar data.")
raise InvalidAPIResposeError("Failed to get variant from range. No ClinVar data.")

def _get_uniprot_domain(self, variant_info: VariantResult) -> Optional[Tuple[int, int]]:
def _get_uniprot_domain(self, seqvar: SeqVar) -> Optional[Tuple[int, int]]:
"""Check if the variant is in a UniProt domain."""
# TODO: Implement this method
return None
self.comment += "Check if the variant is in a UniProt domain.\n"
try:
# Find path to the lib file
if seqvar.genome_release == GenomeRelease.GRCh37:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz"
)
else:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz"
)
tb = tabix.open(path)
records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos)
# Return the first record
for record in records:
return int(record[1]), int(record[2])
return None

except tabix.TabixError as e:
logger.error("Failed to check if the variant is in a UniProt domain. Error: {}", e)
raise AlgorithmError("Failed to check if the variant is in a UniProt domain.") from e

def predict(self) -> Tuple[Optional[PM1], str]:
"""Predict PM1 criteria."""
Expand Down Expand Up @@ -122,7 +143,7 @@ def predict(self) -> Tuple[Optional[PM1], str]:

self.comment += "Checking if the variant is in a UniProt domain. => \n"
logger.debug("Checking if the variant is in a UniProt domain.")
uniprot_domain = self._get_uniprot_domain(self.variant_info)
uniprot_domain = self._get_uniprot_domain(self.seqvar)
if not uniprot_domain:
self.comment += "The variant is not in a UniProt domain."
logger.debug("The variant is not in a UniProt domain.")
Expand All @@ -141,7 +162,7 @@ def predict(self) -> Tuple[Optional[PM1], str]:
pathogenic_count, _ = self._count_pathogenic_variants(self.seqvar, start_pos, end_pos)
if pathogenic_count >= 2:
self.comment += (
"Found 2 or more pathogenic variants in the UniProt domain. " "PM1 is met."
"Found 2 or more pathogenic variants in the UniProt domain. PM1 is met."
)
logger.debug(
"Found 2 or more pathogenic variants in the UniProt domain. PM1 is met."
Expand Down
2 changes: 1 addition & 1 deletion src/criteria/auto_pm4_bp3.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _in_repeat_region(self, seqvar: SeqVar) -> bool:
"""
self.comment += "Check if the variant is in a repeat region.\n"
try:
# Find path to the lib/hg19_repeatmasker.bed.gz file
# Find path to the lib file
if seqvar.genome_release == GenomeRelease.GRCh37:
path = os.path.join(settings.PATH_TO_ROOT, "lib", "rmsk", "grch37", "rmsk.bed.gz")
else:
Expand Down
14 changes: 7 additions & 7 deletions src/defs/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,17 @@ class MissenseScore(BaseModel):


MissenseScores: List[MissenseScore] = [
MissenseScore(name="BayesDel_noAF_score", benign_threshold=-0.36, pathogenic_threshold=0.27),
MissenseScore(name="REVEL_score", benign_threshold=0.183, pathogenic_threshold=0.773),
MissenseScore(name="CADD_raw", benign_threshold=0.15, pathogenic_threshold=28.1),
MissenseScore(name="PrimateAI_score", benign_threshold=0.362, pathogenic_threshold=0.867),
MissenseScore(name="BayesDel_noAF_score", benign_threshold=-0.476, pathogenic_threshold=0.521),
MissenseScore(name="REVEL_score", benign_threshold=0.133, pathogenic_threshold=0.946),
MissenseScore(name="CADD_raw", benign_threshold=16.1, pathogenic_threshold=33),
MissenseScore(name="PrimateAI_score", benign_threshold=0.362, pathogenic_threshold=0.895),
# MissenseScore(name="FATHMM", benign_threshold=4.69, pathogenic_threshold=-5.04), # is <=
# MissenseScore(name="MutPred2", benign_threshold=0.197, pathogenic_threshold=0.829), # We don't have the right score for this
MissenseScore(name="Polyphen2_HVAR_score", benign_threshold=0.009, pathogenic_threshold=0.999),
MissenseScore(name="Polyphen2_HVAR_score", benign_threshold=0.001, pathogenic_threshold=1),
# MissenseScore(name="SIFT_score", benign_threshold=0.327, pathogenic_threshold=0.001), # is <=
MissenseScore(name="VEST4_score", benign_threshold=0.302, pathogenic_threshold=0.861),
# MissenseScore(name="VEST4_score", benign_threshold=0.302, pathogenic_threshold=0.861),
MissenseScore(
name="phyloP100way_vertebrate", benign_threshold=0.021, pathogenic_threshold=9.741
name="phyloP100way_vertebrate", benign_threshold=-1.04, pathogenic_threshold=9.88
), # Not sure about 100/470/17 way
]

Expand Down
2 changes: 1 addition & 1 deletion tests/criteria/test_auto_pp3_bp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_is_pathogenic_score_missing_data(file_path, seqvar, variant_info):
@pytest.mark.parametrize(
"file_path, expected",
[
("annonars/PAH_variant.json", False),
("annonars/PAH_variant.json", True),
],
)
def test_is_benign_score(file_path, expected, seqvar, variant_info):
Expand Down
45 changes: 43 additions & 2 deletions tests/integ/test_integ_ba1_bs1_bs2_pm2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Integration tests for `BP7` criteria using upstream server."""
"""Integration tests for `BA1`, `BS1`, `BS2`, `PM2` criteria using upstream server."""

from typing import Tuple

Expand Down Expand Up @@ -26,7 +26,48 @@
("NM_000277.2(PAH):c.503delA", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.814G>T", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.1162G>A", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.722G>A", GenomeRelease.GRCh37, [False, False, False, True]),
# ("NM_000277.1(PAH):c.722G>A", GenomeRelease.GRCh37, [False, False, False, True]),
# ("NM_000277.2(PAH):c.735G>A", GenomeRelease.GRCh37, (True, False, True, False)),
# ("NM_000314.6(PTEN):c.-1311T>C", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000314.6(PTEN):c.-903G>A", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000314.6(PTEN):c.-9C>G", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_206933.2(USH2A):c.15562A>G", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_004999.3(MYO6):c.475G>A", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000260.3(MYO7A):c.324C>T", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000257.3(MYH7):c.3036C>T", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_005633.3(SOS1):c.1018C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_002880.3(RAF1):c.94A>G", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001754.4(RUNX1):c.423G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.704A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.935C>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.869G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_001754.4(RUNX1):c.179C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_001754.4(RUNX1):c.205G>C", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000540.2(RYR1):c.12884C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000540.3(RYR1):c.6961A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000284.4(PDHA1):c.999A>C", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_004992.3(MECP2):c.1140G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_004992.3(MECP2):c.1030C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000527.4(LDLR):c.1323C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000277.3(PAH):c.399T>C", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1375G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1199C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1196C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.915C>G", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000051.3(ATM):c.1073A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000051.4(ATM):c.1066-6T>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000051.4(ATM):c.3925G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NC_012920.1:m.8557G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000545.8(HNF1A):c.1849G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_004086.2(COCH):c.151C>T", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_004700.3(KCNQ4):c.853G>A", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_206933.2(USH2A):c.4510dupA", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000441.1(SLC26A4):c.365dupT", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.350delC", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.58C>T", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.498C>A", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.521T>C", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.504C>A", GenomeRelease.GRCh37, (False, False, False, True)),
],
)
def test_ba1_bs1_bs2_pm2(
Expand Down
57 changes: 57 additions & 0 deletions tests/integ/test_integ_pm1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Integration tests for `PM1` criteria using upstream server."""

import pytest

from src.auto_acmg import AutoACMG
from src.core.config import Config
from src.criteria.auto_criteria import AutoACMGCriteria
from src.criteria.auto_pm1 import AutoPM1
from src.defs.annonars_variant import AnnonarsVariantResponse
from src.defs.genome_builds import GenomeRelease
from src.defs.seqvar import SeqVar


@pytest.mark.remote
@pytest.mark.parametrize(
"variant_name, genome_release, expected_prediction",
[
# ("NM_000540.3(RYR1):c.1589G>A", GenomeRelease.GRCh37, True),
("NM_004700.3(KCNQ4):c.825G>C", GenomeRelease.GRCh37, True),
("NM_000257.3(MYH7):c.1157A>G", GenomeRelease.GRCh37, True),
("NM_005343.3(HRAS):c.175G>A", GenomeRelease.GRCh37, True),
("NM_030662.3(MAP2K2):c.400T>C", GenomeRelease.GRCh37, True),
("NM_004333.4(BRAF):c.739T>G", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.316T>A", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.314A>C", GenomeRelease.GRCh37, True),
("NM_001754.4:c.315C>A", GenomeRelease.GRCh37, True),
("NM_002834.4(PTPN11):c.782T>A", GenomeRelease.GRCh37, True),
("NM_004333.6(BRAF):c.793G>C", GenomeRelease.GRCh37, True),
("NM_005633.3(SOS1):c.1276C>A", GenomeRelease.GRCh37, True),
("NM_000546.5(TP53):c.396G>C", GenomeRelease.GRCh37, True),
("NM_005343.4(HRAS):c.175_176delinsCT", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.485G>A", GenomeRelease.GRCh37, True),
],
)
def test_pm1(
variant_name: str,
genome_release: GenomeRelease,
expected_prediction: bool,
config: Config,
):
# First, resolve variant
auto_acmg = AutoACMG(variant_name, genome_release, config=config)
seqvar = auto_acmg.resolve_variant()
assert isinstance(seqvar, SeqVar)
# Then, fetch the variant_info from Annonars
auto_acmg_criteria = AutoACMGCriteria(seqvar, config=config)
variant_info = auto_acmg_criteria._get_variant_info(seqvar)
assert isinstance(variant_info, AnnonarsVariantResponse)
# Then, predict PM1
auto_pm1 = AutoPM1(seqvar, variant_info.result, config=config)
prediction, details = auto_pm1.predict()
print(details)
if expected_prediction is None:
assert prediction is None, f"Failed for {variant_name}"
else:
assert prediction
assert prediction.PM1 == expected_prediction, f"Failed for {variant_name}"
Loading

0 comments on commit c6b4933

Please sign in to comment.