Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement "In biologically relevant transcript" method for strucvars #191

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 49 additions & 3 deletions src/api/mehari.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from src.core.config import settings
from src.defs.exceptions import MehariException
from src.defs.genome_builds import GenomeRelease
from src.defs.mehari import GeneTranscripts, TranscriptsSeqVar
from src.defs.mehari import GeneTranscripts, TranscriptsSeqVar, TranscriptsStrucVar
from src.defs.seqvar import SeqVar
from src.defs.strucvar import StrucVar

#: Mehari API base URL
MEHARI_API_BASE_URL = f"{settings.API_REEV_URL}/mehari"
Expand All @@ -33,7 +34,8 @@ def get_seqvar_transcripts(self, seqvar: SeqVar) -> TranscriptsSeqVar:
:param seqvar: Sequence variant
:type seqvar: SeqVar
:return: Transcripts
:rtype: TranscriptsSeqVar | None
:rtype: TranscriptsSeqVar
:raises MehariException: if the request failed
"""
url = (
f"{self.api_base_url}/seqvars/csq?"
Expand Down Expand Up @@ -68,6 +70,49 @@ def get_seqvar_transcripts(self, seqvar: SeqVar) -> TranscriptsSeqVar:
logger.exception("Validation failed: {}", e)
raise MehariException("Mehari API returned invalid data") from e

def get_strucvar_transcripts(self, strucvar: StrucVar) -> TranscriptsStrucVar:
"""
Get transcripts for a structural variant.

:param strucvar: Structural variant
:type strucvar: StrucVar
:return: Transcripts
:rtype: TranscriptsStrucVar
:raises MehariException: if the request failed
"""
url = (
f"{self.api_base_url}/strucvars/csq?"
f"genome_release={strucvar.genome_release.name.lower()}"
f"&chromosome={strucvar.chrom}"
f"&start={strucvar.start}"
f"&stop={strucvar.stop}"
f"&sv_type={strucvar.sv_type.name.upper()}"
)
logger.debug("GET request to: {}", url)

cached_response = self.cache.get(url)
if cached_response:
try:
return TranscriptsStrucVar.model_validate(cached_response)
except ValidationError as e:
logger.exception("Validation failed for cached data: {}", e)
raise MehariException("Cached data is invalid") from e

response = self.client.get(url)
if response.status_code != 200:
logger.error("Request failed: {}", response.text)
raise MehariException(
f"Request failed. Status code: {response.status_code}, Text: {response.text}"
)
try:
response.raise_for_status()
response_data = response.json()
self.cache.add(url, response_data)
return TranscriptsStrucVar.model_validate(response_data)
except ValidationError as e:
logger.exception("Validation failed: {}", e)
raise MehariException("Mehari API returned invalid data") from e

def get_gene_transcripts(self, hgnc_id: str, genome_build: GenomeRelease) -> GeneTranscripts:
""" "
Get transcripts for a gene.
Expand All @@ -77,7 +122,8 @@ def get_gene_transcripts(self, hgnc_id: str, genome_build: GenomeRelease) -> Gen
:param genome_build: Genome build
:type genome_build: GenomeRelease
:return: Transcripts
:rtype: GeneTranscripts | None
:rtype: GeneTranscripts
:raises MehariException: if the request failed
"""
genome_build_mapping = {
GenomeRelease.GRCh37: "GENOME_BUILD_GRCH37",
Expand Down
41 changes: 33 additions & 8 deletions src/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from src.seqvar.auto_pvs1 import AutoPVS1
from src.seqvar.default_predictor import DefaultSeqVarPredictor
from src.strucvar.default_predictor import DefaultStrucVarPredictor
from src.utils import SeqVarTranscriptsHelper
from src.utils import SeqVarTranscriptsHelper, StrucVarTranscriptsHelper
from src.vcep import (
ACADVLPredictor,
BrainMalformationsPredictor,
Expand Down Expand Up @@ -257,7 +257,6 @@ def _convert_score_val(self, score_value: Optional[Union[str, float, int]]) -> O
scores = [float(score) for score in score_value.split(";") if score != "."]
return max(scores) if scores else None
except ValueError as e:
logger.error("Failed to convert score value to float. Error: {}", e)
raise AlgorithmError("Failed to convert score value to float.") from e

def resolve_variant(self) -> Union[SeqVar, StrucVar, None]:
Expand Down Expand Up @@ -346,7 +345,6 @@ def parse_seqvar_data(self, seqvar: SeqVar) -> AutoACMGSeqVarResult:
# Annonars data
variant_info = self._get_variant_info(seqvar)
if not variant_info:
logger.error("Failed to get variant information.")
raise AutoAcmgBaseException("Failed to get variant information.")

if cadd := variant_info.cadd:
Expand Down Expand Up @@ -383,13 +381,36 @@ def parse_seqvar_data(self, seqvar: SeqVar) -> AutoACMGSeqVarResult:
self.seqvar_result.data.gnomad_exomes = variant_info.gnomad_exomes
self.seqvar_result.data.gnomad_mtdna = variant_info.gnomad_mtdna

# Thresholds
pass

# Gene info from Annonars
gene_info = self.annonars_client.get_gene_info(self.seqvar_result.data.hgnc_id)
if not gene_info:
raise AutoAcmgBaseException("Failed to get gene information.")
if gnomad_constraints := gene_info.genes.root[
self.seqvar_result.data.hgnc_id
].gnomadConstraints:
self.seqvar_result.data.scores.misZ = gnomad_constraints.misZ
return self.seqvar_result

def parse_strucvar_data(self, strucvar: StrucVar) -> AutoACMGStrucVarResult:
"""Parse the data for the prediction."""
# Mehari data
ts_helper = StrucVarTranscriptsHelper(strucvar)
ts_helper.initialize()
strucvar_transcript, gene_transcript, all_strucvar_tx, all_genes_tx = (
ts_helper.get_ts_info()
)
if not strucvar_transcript or not gene_transcript:
raise AutoAcmgBaseException("Transcript information is missing.")

self.strucvar_result.data.hgnc_id = strucvar_transcript.hgnc_id
self.strucvar_result.data.gene_symbol = gene_transcript.geneSymbol
self.strucvar_result.data.transcript_id = gene_transcript.id
self.strucvar_result.data.transcript_tags = gene_transcript.tags or []
self.strucvar_result.data.strand = GenomicStrand.from_string(
gene_transcript.genomeAlignments[0].strand
)
self.strucvar_result.data.exons = gene_transcript.genomeAlignments[0].exons

return self.strucvar_result

def select_predictor(self, hgnc_id: str) -> Type[DefaultSeqVarPredictor]:
Expand Down Expand Up @@ -442,7 +463,9 @@ def predict(self) -> Union[AutoACMGSeqVarResult, AutoACMGStrucVarResult, None]:
# ====== Predict ======
predictor_class = self.select_predictor(self.seqvar_result.data.hgnc_id)
predictor = predictor_class(self.seqvar, self.seqvar_result, self.config)
return predictor.predict()
seqvar_prediction = predictor.predict()
logger.info("Prediction: {}", seqvar_prediction)
return seqvar_prediction

elif isinstance(self.strucvar, StrucVar):
if not self.strucvar:
Expand All @@ -453,7 +476,9 @@ def predict(self) -> Union[AutoACMGSeqVarResult, AutoACMGStrucVarResult, None]:

# ====== Predict ======
sp = DefaultStrucVarPredictor(self.strucvar, self.strucvar_result, self.config)
return sp.predict()
strucvar_prediction = sp.predict()
# logger.info("Prediction: {}", strucvar_prediction)
return strucvar_prediction

else:
logger.info("Structural variants are not supported for ACMG criteria prediction yet.")
Expand Down
6 changes: 5 additions & 1 deletion src/defs/annonars_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,15 @@ class DecipherHi(BaseModel):
hiIndex: Optional[float] = None


class GnomadConstraints(BaseModel):
misZ: Optional[float] = None


class GeneInfo(BaseModel):
acmgSf: Optional[Any] = None
clingen: Optional[Clingen] = None
dbnsfp: Optional[Any] = None
gnomadConstraints: Optional[Any] = None
gnomadConstraints: Optional[GnomadConstraints] = None
hgnc: Optional[Any] = None
ncbi: Optional[Any] = None
omim: Optional[Any] = None
Expand Down
5 changes: 3 additions & 2 deletions src/defs/auto_acmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,7 @@ class AutoACMGSeqVarScores(AutoAcmgBaseModel):
cadd: AutoACMGCADD = AutoACMGCADD()
dbnsfp: AutoACMGDbnsfp = AutoACMGDbnsfp()
dbscsnv: AutoACMGDbscsnv = AutoACMGDbscsnv()
misZ: Optional[float] = None


class AutoACMGSeqVarTresholds(AutoAcmgBaseModel):
Expand Down Expand Up @@ -653,8 +654,8 @@ class AutoACMGStrucVarData(AutoAcmgBaseModel):
hgnc_id: str = ""
transcript_id: str = ""
transcript_tags: List[str] = []
prot_pos: int = -1
prot_length: int = -1
# prot_pos: int = -1
# prot_length: int = -1
strand: GenomicStrand = GenomicStrand.NotSet
exons: List[Exon] = []

Expand Down
28 changes: 26 additions & 2 deletions src/defs/mehari.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class Version(BaseModel):
mehari: str


class Query(BaseModel):
class SeqvarQuery(BaseModel):
genome_release: str
chromosome: str
position: int
Expand Down Expand Up @@ -115,5 +115,29 @@ class TranscriptSeqvar(BaseModel):

class TranscriptsSeqVar(BaseModel):
version: Version
query: Query
query: SeqvarQuery
result: List[TranscriptSeqvar]


# ====================
# Mehari Strucvar
# ====================


class StrucvarQuery(BaseModel):
genome_release: str
chromosome: str
start: int
stop: int
sv_type: str


class TranscriptStrucvar(BaseModel):
hgnc_id: str
transcript_effects: List[str]


class TranscriptsStrucVar(BaseModel):
version: Version
query: StrucvarQuery
result: List[TranscriptStrucvar]
28 changes: 1 addition & 27 deletions src/seqvar/auto_pvs1.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,6 @@ def _count_pathogenic_vars(
"""
logger.debug("Counting pathogenic variants in the range {} - {}.", start_pos, end_pos)
if end_pos < start_pos:
logger.error("End position is less than the start position.")
logger.debug("Positions given: {} - {}", start_pos, end_pos)
raise AlgorithmError("End position is less than the start position.")

response = self.annonars_client.get_variant_from_range(seqvar, start_pos, end_pos)
Expand All @@ -171,7 +169,6 @@ def _count_pathogenic_vars(
)
return len(pathogenic_variants), len(response.clinvar)
else:
logger.error("Failed to get variant from range. No ClinVar data.")
raise InvalidAPIResposeError("Failed to get variant from range. No ClinVar data.")

def _find_aff_exon_pos(self, var_pos: int, exons: List[Exon]) -> Tuple[int, int]:
Expand Down Expand Up @@ -231,8 +228,6 @@ def _count_lof_vars(self, seqvar: SeqVar, start_pos: int, end_pos: int) -> Tuple
"""
logger.debug("Counting LoF variants in the range {} - {}.", start_pos, end_pos)
if end_pos < start_pos:
logger.error("End position is less than the start position.")
logger.debug("Positions given: {} - {}", start_pos, end_pos)
raise AlgorithmError("End position is less than the start position.")

response = self.annonars_client.get_variant_from_range(seqvar, start_pos, end_pos)
Expand Down Expand Up @@ -260,7 +255,6 @@ def _count_lof_vars(self, seqvar: SeqVar, start_pos: int, end_pos: int) -> Tuple
)
return frequent_lof_variants, lof_variants
else:
logger.error("Failed to get variant from range. No gnomAD genomes data.")
raise InvalidAPIResposeError(
"Failed to get variant from range. No gnomAD genomes data."
)
Expand All @@ -281,7 +275,6 @@ def _closest_alt_start_cdn(self, cds_info: Dict[str, CdsInfo], hgvs: str) -> Opt
"""
logger.debug("Checking if the variant introduces an alternative start codon.")
if hgvs not in cds_info: # Should never happen
logger.error("Main transcript ID {} not found in the transcripts data.", hgvs)
raise MissingDataError(f"Main transcript ID {hgvs} not found in the transcripts data.")

main_cds_start = (
Expand Down Expand Up @@ -330,7 +323,6 @@ def _skipping_exon_pos(self, seqvar: SeqVar, exons: List[Exon]) -> Tuple[int, in
end_pos = exon.altEndI
break
if not start_pos or not end_pos:
logger.error("Exon not found. Variant position: {}. Exons: {}", seqvar.pos, exons)
raise AlgorithmError("Exon not found.")
return start_pos, end_pos

Expand Down Expand Up @@ -376,7 +368,6 @@ def undergo_nmd(
)
return True
if strand == GenomicStrand.NotSet or not exons:
logger.error("Strand information or exons are not available. Cannot determine NMD.")
raise MissingDataError(
"Strand information or exons are not available. Cannot determine NMD."
)
Expand All @@ -386,16 +377,10 @@ def undergo_nmd(
tx_sizes = tx_sizes[::-1] # Reverse the exons

if len(tx_sizes) == 1:
logger.debug("The only exon. Predicted to undergo NMD.")
self.comment_pvs1 += "The only exon found. Predicted to undergo NMD."
return False

nmd_cutoff = sum(tx_sizes[:-1]) - min(50, tx_sizes[-2])
logger.debug(
"New stop codon: {}, NMD cutoff: {}.",
var_pos,
nmd_cutoff,
)
self.comment_pvs1 += (
f"New stop codon: {var_pos}, NMD cutoff: {nmd_cutoff}."
f"{'Predicted to undergo NMD.' if var_pos <= nmd_cutoff else 'Predicted to escape NMD.'}"
Expand All @@ -416,7 +401,6 @@ def in_bio_relevant_tx(self, transcript_tags: List[str]) -> bool:
Returns:
bool: True if the variant is in a biologically relevant transcript, False otherwise.
"""
logger.debug("Checking if the variant is in a biologically relevant transcript.")
self.comment_pvs1 += (
"Variant is in a biologically relevant transcript. "
f"Transcript tags: {', '.join(transcript_tags)}."
Expand Down Expand Up @@ -463,9 +447,6 @@ def crit4prot_func(
"""
logger.debug("Checking if the altered region is critical for the protein function.")
if strand == GenomicStrand.NotSet or not exons:
logger.error(
"Genomic strand or exons are not available. " "Cannot determine criticality."
)
raise MissingDataError(
"Genomic strand or exons are not available. " "Cannot determine criticality."
)
Expand Down Expand Up @@ -497,7 +478,6 @@ def crit4prot_func(
)
return False
except AutoAcmgBaseException as e:
logger.error("Failed to predict criticality for variant. Error: {}", e)
raise AlgorithmError("Failed to predict criticality for variant.") from e

def lof_freq_in_pop(
Expand Down Expand Up @@ -538,7 +518,6 @@ def lof_freq_in_pop(
"""
logger.debug("Checking if LoF variants are frequent in the general population.")
if strand == GenomicStrand.NotSet:
logger.error("Genomic strand is not available. Cannot determine LoF frequency.")
raise MissingDataError(
"Genomic strand position is not available. Cannot determine LoF frequency."
)
Expand Down Expand Up @@ -568,7 +547,6 @@ def lof_freq_in_pop(
)
return False
except AutoAcmgBaseException as e:
logger.error("Failed to predict LoF frequency for variant. Error: {}", e)
raise AlgorithmError("Failed to predict LoF frequency for variant.") from e

def lof_rm_gt_10pct_of_prot(self, prot_pos: int, prot_length: int) -> bool:
Expand Down Expand Up @@ -634,7 +612,6 @@ def exon_skip_or_cryptic_ss_disrupt(
"Checking if the variant causes exon skipping or cryptic splice site disruption."
)
if strand == GenomicStrand.NotSet:
logger.error("Strand is not available. Cannot determine exon skipping.")
raise MissingDataError("Strand is not available. Cannot determine exon skipping.")
start_pos, end_pos = self._skipping_exon_pos(seqvar, exons)
self.comment_pvs1 += f"Variant's exon position: {start_pos} - {end_pos}."
Expand Down Expand Up @@ -746,7 +723,6 @@ def up_pathogenic_vars(
)

if strand == GenomicStrand.NotSet:
logger.error("Strand is not available. Cannot determine upstream pathogenic variants.")
raise MissingDataError(
"Strand is not available. Cannot determine upstream pathogenic variants."
)
Expand All @@ -767,7 +743,6 @@ def up_pathogenic_vars(
)
return pathogenic_variants > 0
except AutoAcmgBaseException as e:
logger.error("Failed to check upstream pathogenic variants. Error: {}", e)
raise AlgorithmError("Failed to check upstream pathogenic variants.") from e


Expand Down Expand Up @@ -796,8 +771,7 @@ def _convert_consequence(self, var_data: AutoACMGSeqVarData) -> SeqVarPVS1Conseq
return SeqvarConsequenceMapping[var_data.consequence.cadd]
return SeqVarPVS1Consequence.NotSet

# pragma: no cover
def verify_pvs1(
def verify_pvs1( # pragma: no cover
self, seqvar: SeqVar, var_data: AutoACMGSeqVarData
) -> Tuple[PVS1Prediction, PVS1PredictionSeqVarPath, str]:
"""Make the PVS1 prediction.
Expand Down
Loading
Loading