bihealth · gromdimon · Aug 12, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/.env.dev b/.env.dev
@@ -1,4 +1,4 @@
 DEBUG=1
 USE_CACHE=1
 API_REEV_URL=https://reev.cubi.bihealth.org/internal/proxy
-SEQREPO_DATA_DIR=/home/gromdimon/Custom/seqrepo/master
+SEQREPO_DATA_DIR=<path_to_seqrepo_data_dir>
diff --git a/.gitattributes b/.gitattributes
@@ -4,6 +4,6 @@ tests/assets/**/*.json filter=lfs diff=lfs merge=lfs -text
 lib/rmsk/**/*.gz filter=lfs diff=lfs merge=lfs -text
 lib/rmsk/**/*.tbi filter=lfs diff=lfs merge=lfs -text
 lib/uniprot/**/*.gz filter=lfs diff=lfs merge=lfs -text
-lib/uniprot/**/*.tbi filter=lfs diff=lfs merge=lfs -text\
+lib/uniprot/**/*.tbi filter=lfs diff=lfs merge=lfs -text
 tests/e2e/cassettes/**/* filter=lfs diff=lfs merge=lfs -text
 tests/integ/cassettes/**/**/* filter=lfs diff=lfs merge=lfs -text
diff --git a/Pipfile b/Pipfile
@@ -14,6 +14,7 @@ msgpack = "*"
 seqrepo = "*"
 pytabix = "*"
 httpx = "*"
+yoyo-migrations = "*"
 
 [dev-packages]
 mypy = "*"
@@ -41,6 +42,7 @@ seaborn = "*"
 pytest-env = "*"
 pytest-httpx = "*"
 pytest-recording = "*"
+pytest-asyncio = "*"
 
 [requires]
 python_version = "3.12"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/codecov.yml b/codecov.yml
@@ -18,10 +18,14 @@ coverage:
 # Ignoring Paths
 # --------------
 # which folders/files to ignore
+ignore:
+  - "tests/*"
+  - "src/bench/*"
+  - "src/core/cache.py"
 
 # Pull request comments:
 # ----------------------
 # Diff is the Coverage Diff of the pull request.
 # Files are the files impacted by the pull request
 comment:
-  layout: diff, files  # accepted in any order: reach, diff, flags, and/or files
+  layout: diff, files  # accepted in any order: reach, diff, flags, and/or files
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,9 @@ env = [
     "USE_CACHE=0",
     "API_REEV_URL=https://reev.cubi.bihealth.org/internal/proxy",
 ]
+filterwarnings = [
+    "ignore::DeprecationWarning"
+]
 
 
 [tool.mypy]

diff --git a/src/auto_acmg.py b/src/auto_acmg.py
@@ -261,6 +261,7 @@ def predict(self) -> Optional[AutoACMGResult]:
         """
         logger.info("Predicting ACMG criteria for variant: {}", self.variant_name)
         self.seqvar = self.resolve_variant()
+        self.result.seqvar = self.seqvar
         if not self.seqvar:
             logger.error("Unable to make a prediction for the variant: {}", self.variant_name)
             return None

diff --git a/src/criteria/auto_pm1.py b/src/criteria/auto_pm1.py
@@ -98,30 +98,18 @@ def _get_uniprot_domain(seqvar: SeqVar) -> Optional[Tuple[int, int]]:
         Returns:
             Optional[Tuple[int, int]]: The start and end positions of the UniProt domain if found,
             None otherwise.
-
-        Raises:
-            AlgorithmError: If an error occurs while checking if the variant is in a UniProt domain.
         """
-        try:
-            # Find path to the lib file
-            if seqvar.genome_release == GenomeRelease.GRCh37:
-                path = os.path.join(
-                    settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz"
-                )
-            else:
-                path = os.path.join(
-                    settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz"
-                )
-            tb = tabix.open(path)
-            records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos)
-            # Return the first record
-            for record in records:
-                return int(record[1]), int(record[2])
-            return None
-
-        except tabix.TabixError as e:
-            logger.error("Failed to check if the variant is in a UniProt domain. Error: {}", e)
-            raise AlgorithmError("Failed to check if the variant is in a UniProt domain.") from e
+        # Find path to the lib file
+        if seqvar.genome_release == GenomeRelease.GRCh37:
+            path = os.path.join(settings.PATH_TO_ROOT, "lib", "uniprot", "grch37", "uniprot.bed.gz")
+        else:
+            path = os.path.join(settings.PATH_TO_ROOT, "lib", "uniprot", "grch38", "uniprot.bed.gz")
+        tb = tabix.open(path)
+        records = tb.query(f"chr{seqvar.chrom}", seqvar.pos - 1, seqvar.pos)
+        # Return the first record
+        for record in records:
+            return int(record[1]), int(record[2])
+        return None
 
     def verify_pm1(self, seqvar: SeqVar, var_data: AutoACMGData) -> Tuple[Optional[PM1], str]:
         """Predict PM1 criteria."""

diff --git a/src/criteria/auto_pm4_bp3.py b/src/criteria/auto_pm4_bp3.py
@@ -3,7 +3,7 @@
 import os
 from typing import Optional, Tuple
 
-import tabix  # type: ignore
+import tabix
 from loguru import logger
 
 from src.core.config import settings

diff --git a/src/criteria/auto_pvs1.py b/src/criteria/auto_pvs1.py
@@ -280,7 +280,7 @@ def undergo_nmd(
         self,
         var_pos: int,
         hgnc_id: str,
-        strand: Optional[GenomicStrand],
+        strand: GenomicStrand,
         exons: List[Exon],
     ) -> bool:
         """
@@ -317,7 +317,7 @@ def undergo_nmd(
                 "Always predicted to undergo NMD."
             )
             return True
-        if not strand or not exons:
+        if strand == GenomicStrand.NotSet or not exons:
             logger.error("Strand information or exons are not available. Cannot determine NMD.")
             raise MissingDataError(
                 "Strand information or exons are not available. Cannot determine NMD."
@@ -344,7 +344,7 @@ def undergo_nmd(
         )
         return var_pos <= nmd_cutoff
 
-    def in_bio_relevant_tsx(self, transcript_tags: List[str]) -> bool:
+    def in_bio_relevant_tx(self, transcript_tags: List[str]) -> bool:
         """
         Checks if the exon with SeqVar is in a biologically relevant transcript.
 
@@ -369,7 +369,7 @@ def crit4prot_func(
         self,
         seqvar: SeqVar,
         exons: List[Exon],
-        strand: Optional[GenomicStrand],
+        strand: GenomicStrand,
     ) -> bool:
         """
         Checks if the truncated or altered region is critical for the protein function.
@@ -404,7 +404,7 @@ def crit4prot_func(
             InvalidAPIResponseError: If the API response is invalid or cannot be processed.
         """
         logger.debug("Checking if the altered region is critical for the protein function.")
-        if not strand or not exons:
+        if strand == GenomicStrand.NotSet or not exons:
             logger.error(
                 "Genomic strand or exons are not available. " "Cannot determine criticality."
             )
@@ -446,7 +446,7 @@ def lof_freq_in_pop(
         self,
         seqvar: SeqVar,
         exons: List[Exon],
-        strand: Optional[GenomicStrand],
+        strand: GenomicStrand,
     ) -> bool:
         """
         Checks if the Loss-of-Function (LoF) variants in the exon are frequent in the general
@@ -479,7 +479,7 @@ def lof_freq_in_pop(
             InvalidAPIResponseError: If the API response is invalid or cannot be processed.
         """
         logger.debug("Checking if LoF variants are frequent in the general population.")
-        if not strand:
+        if strand == GenomicStrand.NotSet:
             logger.error("Genomic strand is not available. Cannot determine LoF frequency.")
             raise MissingDataError(
                 "Genomic strand position is not available. Cannot determine LoF frequency."
@@ -544,7 +544,7 @@ def exon_skip_or_cryptic_ss_disrupt(
         seqvar: SeqVar,
         exons: List[Exon],
         consequences: List[str],
-        strand: Optional[GenomicStrand],
+        strand: GenomicStrand,
     ) -> bool:
         """
         Check if the variant causes exon skipping or cryptic splice site disruption.
@@ -575,12 +575,12 @@ def exon_skip_or_cryptic_ss_disrupt(
         logger.debug(
             "Checking if the variant causes exon skipping or cryptic splice site disruption."
         )
-        if not strand:
+        if strand == GenomicStrand.NotSet:
             logger.error("Strand is not available. Cannot determine exon skipping.")
             raise MissingDataError("Strand is not available. Cannot determine exon skipping.")
         start_pos, end_pos = self._skipping_exon_pos(seqvar, exons)
         self.comment_pvs1 += f"Variant's exon position: {start_pos} - {end_pos}."
-        if (end_pos - start_pos) % 3 != 0:
+        if (end_pos - start_pos + 1) % 3 != 0:
             logger.debug("Exon length is not a multiple of 3. Predicted to cause exon skipping.")
             self.comment_pvs1 += (
                 "Exon length is not a multiple of 3. Predicted to cause exon skipping."
@@ -599,14 +599,14 @@ def exon_skip_or_cryptic_ss_disrupt(
         cryptic_sites = sp.get_cryptic_ss(refseq, splice_type)
         if len(cryptic_sites) > 0:
             for site in cryptic_sites:
-                if abs(site[0] - seqvar.pos) % 3 != 0:
+                if abs(site[0] - seqvar.pos + 1) % 3 != 0:
                     logger.debug("Cryptic splice site disruption predicted.")
                     self.comment_pvs1 += (
                         "Cryptic splice site disruption predicted. "
                         f"Cryptic splice site: position {site[0]}, splice context {site[1]}, "
                         f"maximnum entropy score {site[2]}. "
                         f"Cryptic splice site - variant position ({seqvar.pos}) = "
-                        f"{abs(site[0] - seqvar.pos)} is not devisible by 3."
+                        f"{abs(site[0] - seqvar.pos + 1)} is not devisible by 3."
                     )
                     return True
             logger.debug("Cryptic splice site disruption not predicted.")
@@ -616,7 +616,7 @@ def exon_skip_or_cryptic_ss_disrupt(
                     f"Cryptic splice site {i}: position {site[0]}, splice context {site[1]}, "
                     f"maximnum entropy score {site[2]}. "
                     f"Cryptic splice site - variant position ({seqvar.pos}) = "
-                    f"{abs(site[0] - seqvar.pos)} is devisible by 3."
+                    f"{abs(site[0] - seqvar.pos + 1)} is devisible by 3."
                 )
         else:
             logger.debug("No cryptic splice site found. Predicted to preserve reading frame.")
@@ -662,9 +662,7 @@ def up_pathogenic_vars(
         self,
         seqvar: SeqVar,
         exons: List[Exon],
-        strand: Optional[GenomicStrand],
-        cds_info: Dict[str, CdsInfo],
-        hgvs: str,
+        strand: GenomicStrand,
     ) -> bool:
         """
         Look for pathogenic variants upstream of the closest potential in-frame start codon.
@@ -680,8 +678,6 @@ def up_pathogenic_vars(
             cds_pos: The position of the variant in the coding sequence.
             exons: A list of exons of the gene where the variant occurs.
             strand: The genomic strand of the gene.
-            cds_info: A dictionary containing the CDS information for all transcripts.
-            hgvs: The main transcript ID.
 
         Returns:
             bool: True if pathogenic variants are found upstream of the closest potential in-frame
@@ -691,7 +687,7 @@ def up_pathogenic_vars(
             "Checking for pathogenic variants upstream of the closest in-frame start codon."
         )
 
-        if not strand:
+        if strand == GenomicStrand.NotSet:
             logger.error("Strand is not available. Cannot determine upstream pathogenic variants.")
             raise MissingDataError(
                 "Strand is not available. Cannot determine upstream pathogenic variants."
@@ -772,7 +768,7 @@ def verify_pvs1(
                 var_data.tx_pos_utr, var_data.hgnc_id, var_data.strand, var_data.exons
             ):
                 self.comment_pvs1 += " =>\n"
-                if self.in_bio_relevant_tsx(var_data.transcript_tags):
+                if self.in_bio_relevant_tx(var_data.transcript_tags):
                     self.prediction = PVS1Prediction.PVS1
                     self.prediction_path = PVS1PredictionSeqVarPath.NF1
                 else:
@@ -787,7 +783,7 @@ def verify_pvs1(
                     self.comment_pvs1 += " =>\n"
                     if self.lof_freq_in_pop(
                         seqvar, var_data.exons, var_data.strand
-                    ) or not self.in_bio_relevant_tsx(var_data.transcript_tags):
+                    ) or not self.in_bio_relevant_tx(var_data.transcript_tags):
                         self.prediction = PVS1Prediction.NotPVS1
                         self.prediction_path = PVS1PredictionSeqVarPath.NF4
                     else:
@@ -807,7 +803,7 @@ def verify_pvs1(
                 var_data.tx_pos_utr, var_data.hgnc_id, var_data.strand, var_data.exons
             ):
                 self.comment_pvs1 += " =>\n"
-                if self.in_bio_relevant_tsx(var_data.transcript_tags):
+                if self.in_bio_relevant_tx(var_data.transcript_tags):
                     self.prediction = PVS1Prediction.PVS1
                     self.prediction_path = PVS1PredictionSeqVarPath.SS1
                 else:
@@ -826,7 +822,7 @@ def verify_pvs1(
                     self.comment_pvs1 += " =>\n"
                     if self.lof_freq_in_pop(
                         seqvar, var_data.exons, var_data.strand
-                    ) or not self.in_bio_relevant_tsx(var_data.transcript_tags):
+                    ) or not self.in_bio_relevant_tx(var_data.transcript_tags):
                         self.prediction = PVS1Prediction.NotPVS1
                         self.prediction_path = PVS1PredictionSeqVarPath.SS4
                     else:
@@ -846,7 +842,7 @@ def verify_pvs1(
                     self.comment_pvs1 += " =>\n"
                     if self.lof_freq_in_pop(
                         seqvar, var_data.exons, var_data.strand
-                    ) or not self.in_bio_relevant_tsx(var_data.transcript_tags):
+                    ) or not self.in_bio_relevant_tx(var_data.transcript_tags):
                         self.prediction = PVS1Prediction.NotPVS1
                         self.prediction_path = PVS1PredictionSeqVarPath.SS7
                     else:
@@ -869,8 +865,6 @@ def verify_pvs1(
                     seqvar,
                     var_data.exons,
                     var_data.strand,
-                    var_data.cds_info,
-                    var_data.transcript_id,
                 ):
                     self.prediction = PVS1Prediction.PVS1_Moderate
                     self.prediction_path = PVS1PredictionSeqVarPath.IC1