Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finish AutoPM1 #133

Merged
merged 8 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ tests/assets/**.json filter=lfs diff=lfs merge=lfs -text
tests/assets/**/*.json filter=lfs diff=lfs merge=lfs -text
lib/rmsk/**/*.gz filter=lfs diff=lfs merge=lfs -text
lib/rmsk/**/*.tbi filter=lfs diff=lfs merge=lfs -text
lib/uniprot/**/*.gz filter=lfs diff=lfs merge=lfs -text
lib/uniprot/**/*.tbi filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"type": "debugpy",
"request": "launch",
"module": "src.cli",
"args": ["NM_000277.2(PAH):c.707-7A>T", "--genome-release", "grch37"],
"args": ["chr1:1234:A:G", "--genome-release", "grch37"],
"console": "integratedTerminal"
}
]
Expand Down
Empty file added lib/uniprot/__init__.py
Empty file.
Empty file added lib/uniprot/grch37/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions lib/uniprot/grch37/uniprot.bed.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions lib/uniprot/grch37/uniprot.bed.gz.tbi
Git LFS file not shown
Empty file added lib/uniprot/grch38/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions lib/uniprot/grch38/uniprot.bed.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions lib/uniprot/grch38/uniprot.bed.gz.tbi
Git LFS file not shown
27 changes: 18 additions & 9 deletions src/criteria/auto_ba1_bs1_bs2_pm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,24 @@ def __init__(
#: Comment to store the prediction explanation.
self.comment: str = ""

def _get_control_af(self, variant_data: VariantResult) -> AlleleCount:
def _get_control_af(self, variant_data: VariantResult) -> Optional[AlleleCount]:
"""
Get the allele frequency for the control population.
Get the allele frequency information for the control population.

Args:
variant_data: The variant data.

Returns:
The allele frequency for the control population.
The allele frequency for the control population. None if no data found.
"""
if not variant_data.gnomad_exomes or not variant_data.gnomad_exomes.alleleCounts:
raise MissingDataError("No allele counts found in variant data")
for af in variant_data.gnomad_exomes.alleleCounts:
if af.cohort == "controls":
return af
raise MissingDataError("No controls allele counts found in variant data.")
return None

def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> Optional[float]:
"""
Get the allele frequency for the sequence variant.

Expand All @@ -66,7 +66,7 @@ def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
variant_data: The variant data.

Returns:
The allele frequency.
The allele frequency. None if no controls data
"""
if seqvar.chrom.startswith("M"):
if not variant_data.gnomad_mtdna or not variant_data.gnomad_mtdna.afHet:
Expand All @@ -80,6 +80,9 @@ def _get_af(self, seqvar: SeqVar, variant_data: VariantResult) -> float:
return variant_data.gnomad_mtdna.afHet
else:
controls_af = self._get_control_af(variant_data)
if not controls_af:
self.comment += "No controls allele frequency data found."
return None
if (
not controls_af.bySex
or not controls_af.bySex.overall
Expand Down Expand Up @@ -189,7 +192,7 @@ def _check_zygosity(self, seqvar: SeqVar, variant_data: VariantResult) -> bool:
allele_condition = self._get_allele_condition(seqvar)
self.comment += f"Allele condition: {allele_condition.name}.\n"
controls_af = self._get_control_af(variant_data)
if not controls_af.bySex:
if not controls_af or not controls_af.bySex:
self.comment += "No controls allele data found in control data.\n"
raise MissingDataError("No raw data found in control data.")

Expand Down Expand Up @@ -290,15 +293,21 @@ def predict(self) -> Tuple[Optional[BA1BS1BS2PM2], str]:

self.comment += "Check allele frequency for the control population.\n"
af = self._get_af(self.seqvar, self.variant_info)
if af > 0.05:
if not af:
self.comment += "No controls: PM2 is met."
self.prediction.PM2 = True
elif af > 0.05:
self.comment += "Allele frequency > 5%: BA1 is met."
self.prediction.BA1 = True
elif af >= 0.01:
self.comment += "Allele frequency > 1%: BS1 is met."
self.prediction.BS1 = True
else:
self.comment += "Allele frequency <= 1%: PM2 is met."
self.prediction.PM2 = True

self.comment += "Check zygosity.\n"
if af >= 0.01 and self._check_zygosity(self.seqvar, self.variant_info):
if af and af >= 0.01 and self._check_zygosity(self.seqvar, self.variant_info):
self.prediction.BS2 = True

except AutoAcmgBaseException as e:
Expand Down
33 changes: 27 additions & 6 deletions src/criteria/auto_pm1.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Implementation of PM1 criteria."""

import os
from typing import Optional, Tuple

import tabix # type: ignore
from loguru import logger

from src.api.annonars import AnnonarsClient
from src.core.config import Config
from src.core.config import Config, settings
from src.defs.annonars_variant import VariantResult
from src.defs.auto_acmg import PM1
from src.defs.exceptions import AlgorithmError, AutoAcmgBaseException, InvalidAPIResposeError
Expand Down Expand Up @@ -82,10 +84,29 @@ def _count_pathogenic_variants(
logger.error("Failed to get variant from range. No ClinVar data.")
raise InvalidAPIResposeError("Failed to get variant from range. No ClinVar data.")

def _get_uniprot_domain(self, variant_info: VariantResult) -> Optional[Tuple[int, int]]:
def _get_uniprot_domain(self, seqvar: SeqVar) -> Optional[Tuple[int, int]]:
"""Check if the variant is in a UniProt domain."""
# TODO: Implement this method
return None
self.comment += "Check if the variant is in a UniProt domain.\n"
try:
# Find path to the lib file
if seqvar.genome_release == GenomeRelease.GRCh37:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz"
)
else:
path = os.path.join(
settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz"
)
tb = tabix.open(path)
records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos)
# Return the first record
for record in records:
return int(record[1]), int(record[2])
return None

except tabix.TabixError as e:
logger.error("Failed to check if the variant is in a UniProt domain. Error: {}", e)
raise AlgorithmError("Failed to check if the variant is in a UniProt domain.") from e

def predict(self) -> Tuple[Optional[PM1], str]:
"""Predict PM1 criteria."""
Expand Down Expand Up @@ -122,7 +143,7 @@ def predict(self) -> Tuple[Optional[PM1], str]:

self.comment += "Checking if the variant is in a UniProt domain. => \n"
logger.debug("Checking if the variant is in a UniProt domain.")
uniprot_domain = self._get_uniprot_domain(self.variant_info)
uniprot_domain = self._get_uniprot_domain(self.seqvar)
if not uniprot_domain:
self.comment += "The variant is not in a UniProt domain."
logger.debug("The variant is not in a UniProt domain.")
Expand All @@ -141,7 +162,7 @@ def predict(self) -> Tuple[Optional[PM1], str]:
pathogenic_count, _ = self._count_pathogenic_variants(self.seqvar, start_pos, end_pos)
if pathogenic_count >= 2:
self.comment += (
"Found 2 or more pathogenic variants in the UniProt domain. " "PM1 is met."
"Found 2 or more pathogenic variants in the UniProt domain. PM1 is met."
)
logger.debug(
"Found 2 or more pathogenic variants in the UniProt domain. PM1 is met."
Expand Down
2 changes: 1 addition & 1 deletion src/criteria/auto_pm4_bp3.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _in_repeat_region(self, seqvar: SeqVar) -> bool:
"""
self.comment += "Check if the variant is in a repeat region.\n"
try:
# Find path to the lib/hg19_repeatmasker.bed.gz file
# Find path to the lib file
if seqvar.genome_release == GenomeRelease.GRCh37:
path = os.path.join(settings.PATH_TO_ROOT, "lib", "rmsk", "grch37", "rmsk.bed.gz")
else:
Expand Down
14 changes: 7 additions & 7 deletions src/defs/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,17 @@ class MissenseScore(BaseModel):


MissenseScores: List[MissenseScore] = [
MissenseScore(name="BayesDel_noAF_score", benign_threshold=-0.36, pathogenic_threshold=0.27),
MissenseScore(name="REVEL_score", benign_threshold=0.183, pathogenic_threshold=0.773),
MissenseScore(name="CADD_raw", benign_threshold=0.15, pathogenic_threshold=28.1),
MissenseScore(name="PrimateAI_score", benign_threshold=0.362, pathogenic_threshold=0.867),
MissenseScore(name="BayesDel_noAF_score", benign_threshold=-0.476, pathogenic_threshold=0.521),
MissenseScore(name="REVEL_score", benign_threshold=0.133, pathogenic_threshold=0.946),
MissenseScore(name="CADD_raw", benign_threshold=16.1, pathogenic_threshold=33),
MissenseScore(name="PrimateAI_score", benign_threshold=0.362, pathogenic_threshold=0.895),
# MissenseScore(name="FATHMM", benign_threshold=4.69, pathogenic_threshold=-5.04), # is <=
# MissenseScore(name="MutPred2", benign_threshold=0.197, pathogenic_threshold=0.829), # We don't have the right score for this
MissenseScore(name="Polyphen2_HVAR_score", benign_threshold=0.009, pathogenic_threshold=0.999),
MissenseScore(name="Polyphen2_HVAR_score", benign_threshold=0.001, pathogenic_threshold=1),
# MissenseScore(name="SIFT_score", benign_threshold=0.327, pathogenic_threshold=0.001), # is <=
MissenseScore(name="VEST4_score", benign_threshold=0.302, pathogenic_threshold=0.861),
# MissenseScore(name="VEST4_score", benign_threshold=0.302, pathogenic_threshold=0.861),
MissenseScore(
name="phyloP100way_vertebrate", benign_threshold=0.021, pathogenic_threshold=9.741
name="phyloP100way_vertebrate", benign_threshold=-1.04, pathogenic_threshold=9.88
), # Not sure about 100/470/17 way
]

Expand Down
2 changes: 1 addition & 1 deletion tests/criteria/test_auto_pp3_bp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_is_pathogenic_score_missing_data(file_path, seqvar, variant_info):
@pytest.mark.parametrize(
"file_path, expected",
[
("annonars/PAH_variant.json", False),
("annonars/PAH_variant.json", True),
],
)
def test_is_benign_score(file_path, expected, seqvar, variant_info):
Expand Down
45 changes: 43 additions & 2 deletions tests/integ/test_integ_ba1_bs1_bs2_pm2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Integration tests for `BP7` criteria using upstream server."""
"""Integration tests for `BA1`, `BS1`, `BS2`, `PM2` criteria using upstream server."""

from typing import Tuple

Expand Down Expand Up @@ -26,7 +26,48 @@
("NM_000277.2(PAH):c.503delA", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.814G>T", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.1162G>A", GenomeRelease.GRCh37, [False, False, False, True]),
("NM_000277.1(PAH):c.722G>A", GenomeRelease.GRCh37, [False, False, False, True]),
# ("NM_000277.1(PAH):c.722G>A", GenomeRelease.GRCh37, [False, False, False, True]),
# ("NM_000277.2(PAH):c.735G>A", GenomeRelease.GRCh37, (True, False, True, False)),
# ("NM_000314.6(PTEN):c.-1311T>C", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000314.6(PTEN):c.-903G>A", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000314.6(PTEN):c.-9C>G", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_206933.2(USH2A):c.15562A>G", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_004999.3(MYO6):c.475G>A", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000260.3(MYO7A):c.324C>T", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_000257.3(MYH7):c.3036C>T", GenomeRelease.GRCh37, (True, False, False, False)),
# ("NM_005633.3(SOS1):c.1018C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_002880.3(RAF1):c.94A>G", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001754.4(RUNX1):c.423G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.704A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.935C>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000546.5(TP53):c.869G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_001754.4(RUNX1):c.179C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_001754.4(RUNX1):c.205G>C", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000540.2(RYR1):c.12884C>T", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000540.3(RYR1):c.6961A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000284.4(PDHA1):c.999A>C", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_004992.3(MECP2):c.1140G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_004992.3(MECP2):c.1030C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000527.4(LDLR):c.1323C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000277.3(PAH):c.399T>C", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1375G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1199C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.1196C>T", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_001110792.2(MECP2):c.915C>G", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_000051.3(ATM):c.1073A>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000051.4(ATM):c.1066-6T>G", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000051.4(ATM):c.3925G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NC_012920.1:m.8557G>A", GenomeRelease.GRCh37, (False, True, False, False)),
# ("NM_000545.8(HNF1A):c.1849G>A", GenomeRelease.GRCh37, (False, True, True, False)),
# ("NM_004086.2(COCH):c.151C>T", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_004700.3(KCNQ4):c.853G>A", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_206933.2(USH2A):c.4510dupA", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000441.1(SLC26A4):c.365dupT", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.350delC", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.58C>T", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.498C>A", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.521T>C", GenomeRelease.GRCh37, (False, False, False, True)),
# ("NM_000277.2(PAH):c.504C>A", GenomeRelease.GRCh37, (False, False, False, True)),
],
)
def test_ba1_bs1_bs2_pm2(
Expand Down
57 changes: 57 additions & 0 deletions tests/integ/test_integ_pm1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Integration tests for `PM1` criteria using upstream server."""

import pytest

from src.auto_acmg import AutoACMG
from src.core.config import Config
from src.criteria.auto_criteria import AutoACMGCriteria
from src.criteria.auto_pm1 import AutoPM1
from src.defs.annonars_variant import AnnonarsVariantResponse
from src.defs.genome_builds import GenomeRelease
from src.defs.seqvar import SeqVar


@pytest.mark.remote
@pytest.mark.parametrize(
"variant_name, genome_release, expected_prediction",
[
# ("NM_000540.3(RYR1):c.1589G>A", GenomeRelease.GRCh37, True),
("NM_004700.3(KCNQ4):c.825G>C", GenomeRelease.GRCh37, True),
("NM_000257.3(MYH7):c.1157A>G", GenomeRelease.GRCh37, True),
("NM_005343.3(HRAS):c.175G>A", GenomeRelease.GRCh37, True),
("NM_030662.3(MAP2K2):c.400T>C", GenomeRelease.GRCh37, True),
("NM_004333.4(BRAF):c.739T>G", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.316T>A", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.314A>C", GenomeRelease.GRCh37, True),
("NM_001754.4:c.315C>A", GenomeRelease.GRCh37, True),
("NM_002834.4(PTPN11):c.782T>A", GenomeRelease.GRCh37, True),
("NM_004333.6(BRAF):c.793G>C", GenomeRelease.GRCh37, True),
("NM_005633.3(SOS1):c.1276C>A", GenomeRelease.GRCh37, True),
("NM_000546.5(TP53):c.396G>C", GenomeRelease.GRCh37, True),
("NM_005343.4(HRAS):c.175_176delinsCT", GenomeRelease.GRCh37, True),
("NM_001754.4(RUNX1):c.485G>A", GenomeRelease.GRCh37, True),
],
)
def test_pm1(
variant_name: str,
genome_release: GenomeRelease,
expected_prediction: bool,
config: Config,
):
# First, resolve variant
auto_acmg = AutoACMG(variant_name, genome_release, config=config)
seqvar = auto_acmg.resolve_variant()
assert isinstance(seqvar, SeqVar)
# Then, fetch the variant_info from Annonars
auto_acmg_criteria = AutoACMGCriteria(seqvar, config=config)
variant_info = auto_acmg_criteria._get_variant_info(seqvar)
assert isinstance(variant_info, AnnonarsVariantResponse)
# Then, predict PM1
auto_pm1 = AutoPM1(seqvar, variant_info.result, config=config)
prediction, details = auto_pm1.predict()
print(details)
if expected_prediction is None:
assert prediction is None, f"Failed for {variant_name}"
else:
assert prediction
assert prediction.PM1 == expected_prediction, f"Failed for {variant_name}"
Loading
Loading