Adding rule to align against filtered, high-conf sqanti novel/known t…

…ranscriptome
OpenOmics · Sep 19, 2023 · 8425db1 · 8425db1
1 parent 1d0d08c
commit 8425db1
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 1 deletion.
diff --git a/config/cluster.json b/config/cluster.json
@@ -142,5 +142,14 @@
         "threads": "16",
         "mem": "64g",
         "time": "0-12:00:00"
+    },
+    "sqanti_minimap2": {
+        "threads": "4",
+        "mem": "32g",
+        "time": "0-08:00:00"
+    },
+    "sqanti_nanocount": {
+        "threads": "2",
+        "mem": "16g"
     }
 }
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -268,6 +268,12 @@ rule all:
         # Filters characterized transcripts from Sqanti QC
         # @imported from `rule sqanti_ml_filter` in rules/sqanti.smk
         join(workpath, "project", "counts", "novel", "sqanti.isoforms_MLresult_classification.txt"),
+        # Align to filtered, high-confidence SQANTI3 transcriptome
+        # @imported from `rules sqanti_minimap2` in rules/sqanti.smk
+        expand(
+            join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"),
+            name=samples
+        ),
         # Nanopolish polyA tail length estimation,
         # needs index to map basecalled reads to
         # raw signal from the ONT sequencer

diff --git a/workflow/rules/sqanti.smk b/workflow/rules/sqanti.smk
@@ -108,4 +108,58 @@ rule sqanti_ml_filter:
         --gtf {input.gtf} \\
         --isoforms {input.fa} \\
         {input.txt}
-    """
+    """
+
+
+rule sqanti_minimap2:
+    """
+    Data-processing step to align direct RNA reads against filtered, high-
+    confidence annotated transcriptome from SQANTI.
+    @Input:
+        Nanofilt quality filtered FastQ file (scatter),
+        Transcriptomic FASTA file
+    @Output:
+        Transcriptomic BAM file
+    """
+    input:
+        fq  = join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"),
+        ref  = join(workpath, "project", "counts", "novel", "sqanti.isoforms.filtered.fasta"),
+    output:
+        bam = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"),
+        bai = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam.bai"),
+    params:
+        rname = 'sqanti_minimap2',
+        tmpdir = join(workpath, "{name}", "bams", "sqanti_transcriptome_tmp"),
+    conda: depending(join(workpath, config['conda']['modr']), use_conda)
+    container: depending(config['images']['modr'], use_singularity)
+    threads: int(allocated("threads", "sqanti_minimap2", cluster)) 
+    shell: 
+        """
+        # Setups temporary directory for
+        # intermediate files with built-in 
+        # mechanism for deletion on exit
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
+
+        # Align against reference genome,
+        # minimap2 automatically handles
+        # conversion of U to T bps, See 
+        # this issue for the author of 
+        # minimap2's recomendations for 
+        # aligning direct RNA reads against 
+        # the transcriptome:
+        # https://github.com/lh3/minimap2/issues/340  
+        minimap2 \\
+            -ax map-ont \\
+            -N 10 \\
+            -k10 \\
+            {input.ref} \\
+            {input.fq} \\
+        | samtools sort -@{threads} \\
+            -T "${{tmp}}" \\
+            -O bam \\
+            --write-index \\
+            -o {output.bam}##idx##{output.bai} \\
+            -
+        """