From 28a5245a1d815cff6719d4d1591986fdec80611b Mon Sep 17 00:00:00 2001 From: gromdimon Date: Mon, 9 Sep 2024 15:07:36 +0200 Subject: [PATCH] Implement methods --- src/auto_acmg.py | 2 + src/defs/auto_acmg.py | 4 +- src/defs/mehari.py | 2 +- src/strucvar/auto_pvs1.py | 131 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 129 insertions(+), 10 deletions(-) diff --git a/src/auto_acmg.py b/src/auto_acmg.py index 2d10881..658adf0 100644 --- a/src/auto_acmg.py +++ b/src/auto_acmg.py @@ -432,6 +432,8 @@ def parse_strucvar_data(self, strucvar: StrucVar) -> AutoACMGStrucVarResult: gene_transcript.genomeAlignments[0].strand ) self.strucvar_result.data.exons = gene_transcript.genomeAlignments[0].exons + self.strucvar_result.data.start_cdn = gene_transcript.startCodon + self.strucvar_result.data.stop_cdn = gene_transcript.stopCodon return self.strucvar_result diff --git a/src/defs/auto_acmg.py b/src/defs/auto_acmg.py index 3521ea0..a9cf1bd 100644 --- a/src/defs/auto_acmg.py +++ b/src/defs/auto_acmg.py @@ -672,10 +672,10 @@ class AutoACMGStrucVarData(AutoAcmgBaseModel): hgnc_id: str = "" transcript_id: str = "" transcript_tags: List[str] = [] - # prot_pos: int = -1 - # prot_length: int = -1 strand: GenomicStrand = GenomicStrand.NotSet exons: List[Exon] = [] + start_cdn: int = 0 + stop_cdn: int = 0 class AutoACMGStrucVarPred(AutoAcmgBaseModel): diff --git a/src/defs/mehari.py b/src/defs/mehari.py index be38f08..9517756 100644 --- a/src/defs/mehari.py +++ b/src/defs/mehari.py @@ -18,7 +18,7 @@ class Exon(BaseModel): altEndI: int altCdsStartI: int altCdsEndI: int - cigar: str + cigar: Optional[str] = None ord: Optional[int] = None diff --git a/src/strucvar/auto_pvs1.py b/src/strucvar/auto_pvs1.py index ca76bfa..87e1ca8 100644 --- a/src/strucvar/auto_pvs1.py +++ b/src/strucvar/auto_pvs1.py @@ -119,7 +119,8 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]: strucvar: The structural variant being analyzed. Returns: - Tuple[int, int]: The number of frequent LoF variants and the total number of LoF variants. + Tuple[int, int]: The number of frequent LoF variants and the total number of LoF + variants. Raises: AlgorithmError: If the end position is less than the start position. @@ -157,6 +158,67 @@ def _count_lof_vars(self, strucvar: StrucVar) -> Tuple[int, int]: "Failed to get variant from range. No gnomAD genomes data." ) + def _calc_cds( + self, exons: List[Exon], strand: GenomicStrand, start_codon: int, stop_codon: int + ) -> List[Exon]: + """ + Remove UTRs from exons. + + Args: + exons: List of exons for the gene. + strand: The genomic strand of the gene. + start_codon: Position of the start codon. + stop_codon: Position of the stop codon. + + Returns: + List[Exon]: List of exons without UTRs. + + Raises: + MissingDataError: If the genomic strand is not set. + """ + if strand == GenomicStrand.NotSet: + raise MissingDataError("Genomic strand is not set. Cannot remove UTRs.") + + if strand == GenomicStrand.Plus: + # Remove 5' UTR + for i, exon in enumerate(exons): + if exon.altEndI - exon.altStartI + 1 > start_codon: + exon.altStartI += start_codon + break + else: + del exons[i] + start_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove 3' UTR + for i, exon in enumerate(exons[::-1]): + if exon.altEndI - exon.altStartI + 1 > stop_codon: + exon.altEndI -= stop_codon + break + else: + del exons[-i] + stop_codon -= exon.altEndI - exon.altStartI + 1 + + elif strand == GenomicStrand.Minus: + # Remove 3' UTR + for i, exon in enumerate(exons): + if exon.altEndI - exon.altStartI + 1 > stop_codon: + exon.altStartI += stop_codon + break + else: + del exons[i] + stop_codon -= exon.altEndI - exon.altStartI + 1 + + # Remove 5' UTR + for i, exon in enumerate(exons[::-1]): + if exon.altEndI - exon.altStartI + 1 > start_codon: + exon.altEndI -= start_codon + break + else: + del exons[-i] + start_codon -= exon.altEndI - exon.altStartI + 1 + + return exons + def full_gene_del(self, strucvar: StrucVar, exons: List[Exon]) -> bool: """ Check if the variant is a full gene deletion. @@ -445,10 +507,53 @@ def lof_freq_in_pop(self, strucvar: StrucVar) -> bool: except AutoAcmgBaseException as e: raise AlgorithmError("Failed to predict LoF frequency for structural variant.") from e - @staticmethod - def lof_rm_gt_10pct_of_prot() -> bool: - """Check if the loss-of-function removes more than 10% of the protein.""" - return False + def lof_rm_gt_10pct_of_prot( + self, + strucvar: StrucVar, + exons: List[Exon], + strand: GenomicStrand, + start_codon: int, + stop_codon: int, + ) -> bool: + """ + Determine if the deletion removes more than 10% of the protein-coding sequence. + + First remove the UTRs from the exons. Then iterate through the CDS exons and calculate the + total CDS length and the length of the deleted region. Return True if the deletion removes + more than 10% of the protein-coding sequence, False otherwise. + + Args: + strucvar: The structural variant being analyzed. + exons: List of exons for the gene. + strand: The genomic strand of the gene. + start_codon: Position of the start codon. + stop_codon: Position of the stop codon. + + Returns: + bool: True if the deletion removes more than 10% of the protein, False otherwise. + """ + total_cds_length = 0 + deleted_length = 0 + + cds = self._calc_cds(exons, strand, start_codon, stop_codon) + + for exon in cds: + total_cds_length += exon.altEndI - exon.altStartI + 1 + + overlap_start = max(strucvar.start, exon.altStartI) + overlap_end = min(strucvar.stop, exon.altEndI) + + if overlap_start <= overlap_end: # Check if there is an overlap + overlap_length = overlap_end - overlap_start + 1 + deleted_length += overlap_length + + if total_cds_length == 0: + raise AlgorithmError( + "Total CDS length is zero. Cannot determine if deletion removes " + "more than 10% of the protein." + ) + deletion_percentage = (deleted_length // 3) / (total_cds_length // 3) + return deletion_percentage > 0.1 @staticmethod def dup_disrupt_rf() -> bool: @@ -518,7 +623,13 @@ def verify_pvs1( # pragma: no cover self.prediction_path = PVS1PredictionStrucVarPath.DEL5_1 else: self.comment_pvs1 += " =>" - if self.lof_rm_gt_10pct_of_prot(): + if self.lof_rm_gt_10pct_of_prot( + strucvar, + var_data.exons, + var_data.strand, + var_data.start_cdn, + var_data.stop_cdn, + ): self.prediction = PVS1Prediction.PVS1_Strong self.prediction_path = PVS1PredictionStrucVarPath.DEL6_1 else: @@ -538,7 +649,13 @@ def verify_pvs1( # pragma: no cover self.prediction_path = PVS1PredictionStrucVarPath.DEL5_2 else: self.comment_pvs1 += " =>" - if self.lof_rm_gt_10pct_of_prot(): + if self.lof_rm_gt_10pct_of_prot( + strucvar, + var_data.exons, + var_data.strand, + var_data.start_cdn, + var_data.stop_cdn, + ): self.prediction = PVS1Prediction.PVS1_Strong self.prediction_path = PVS1PredictionStrucVarPath.DEL6_2 else: