Get error estimation and pooled inference working

h3abionet · Mar 31, 2024 · c475678 · c475678
1 parent e02d81c
commit c475678
Show file tree

Hide file tree

Showing 10 changed files with 553 additions and 288 deletions.
diff --git a/conf/modules.config b/conf/modules.config
@@ -25,12 +25,12 @@ process {
     withName: PLOTQUALITYPROFILE {
         publishDir = [
                 [
-                    path: { "${params.outdir}/Plots/" },
+                    path: { "${params.outdir}/Plots/QualityProfile" },
                     mode: params.publish_dir_mode,
                     pattern: "*.pdf"
                 ],
                 [
-                    path: { "${params.outdir}/R/Plots/" },
+                    path: { "${params.outdir}/R/QualityProfile" },
                     mode: params.publish_dir_mode,
                     pattern: "*.RDS"
                 ]
@@ -44,6 +44,28 @@ process {
         ]
     }
 
+    withName: MERGETRIMTABLES {
+        publishDir = [
+            path: { "${params.outdir}/TrimmedReads/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: LEARNERRORS {
+        publishDir = [
+            [
+                path: { "${params.outdir}/Plots/ErrorModel/" },
+                mode: params.publish_dir_mode,
+                pattern: "*.pdf"
+            ],
+            [
+                path: { "${params.outdir}/R/" },
+                mode: params.publish_dir_mode,
+                pattern: "*.RDS"
+            ]
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

diff --git a/modules/local/dadainfer.nf b/modules/local/dadainfer.nf
@@ -1,91 +1,64 @@
-// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :)
-//               https://github.com/nf-core/modules/tree/master/modules/nf-core/
-//               You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace:
-//               https://nf-co.re/join
-// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters.
-//               All other parameters MUST be provided using the "task.ext" directive, see here:
-//               https://www.nextflow.io/docs/latest/process.html#ext
-//               where "task.ext" is a string.
-//               Any parameters that need to be evaluated in the context of a particular sample
-//               e.g. single-end/paired-end data MUST also be defined and evaluated appropriately.
-// TODO nf-core: Software that can be piped together SHOULD be added to separate module files
-//               unless there is a run-time, storage advantage in implementing in this way
-//               e.g. it's ok to have a single module for bwa to output BAM instead of SAM:
-//                 bwa mem | samtools view -B -T ref.fasta
-// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty
-//               list (`[]`) instead of a file can be used to work around this issue.
-
 process DADAINFER {
-    tag "$meta.id"
+    tag "$readmode"    
     label 'process_medium'
 
-    // TODO nf-core: List required Conda package(s).
-    //               Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10").
-    //               For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems.
-    // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below.
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE':
-        'biocontainers/YOUR-TOOL-HERE' }"
+    container "ghcr.io/h3abionet/tada:dev"
 
     input:
-    // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group"
-    //               MUST be provided as an input via a Groovy Map called "meta".
-    //               This information may not be required in some instances e.g. indexing reference genome files:
-    //               https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf
-    // TODO nf-core: Where applicable please provide/convert compressed files as input/output
-    //               e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc.
-    tuple val(meta), path(bam)
+    tuple val(readmode), file(err), file(reads)
 
     output:
-    // TODO nf-core: Named file extensions MUST be emitted for ALL output channels
-    tuple val(meta), path("*.bam"), emit: bam
     // TODO nf-core: List additional required output channels/values here
-    path "versions.yml"           , emit: versions
+    // path "versions.yml"           , emit: versions
+    path("all.dd.${readmode}.RDS"), emit: inferred
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10
-    //               If the software is unable to output a version number on the command-line then it can be manually specified
-    //               e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf
-    //               Each software used MUST provide the software name and version number in the YAML version file (versions.yml)
-    // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive
-    // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter
-    //               using the Nextflow "task" variable e.g. "--threads $task.cpus"
-    // TODO nf-core: Please replace the example samtools command below with your module's command
-    // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;)
+    def dadaOpt = !params.dadaOpt.isEmpty() ? "'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA'
     """
-    samtools \\
-        sort \\
-        $args \\
-        -@ $task.cpus \\
-        -o ${prefix}.bam \\
-        -T $prefix \\
-        $bam
+    #!/usr/bin/env Rscript
+    suppressPackageStartupMessages(library(dada2))
+
+    dadaOpt <- ${dadaOpt}
+
+    if (!is.na(dadaOpt)) {
+      setDadaOpt(dadaOpt)
+      cat("dada Options:\\n",dadaOpt,"\\n")
+    }
+
+    filts <- list.files('.', pattern="${readmode}.filtered.fastq.gz", full.names = TRUE)
+
+    err <- readRDS("${err}")
+    cat("Processing all samples\\n")
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        dadainfer: \$(samtools --version |& sed '1!d ; s/samtools //')
-    END_VERSIONS
+    #Variable selection from CLI input flag --pool
+    pool <- "${params.pool}"
+
+    # 'pool' is a weird flag, either 'pseudo' (string), or T/F (bool)
+    if(pool != "pseudo"){
+      pool <- as.logical(pool)
+    }
+
+    dereps <- derepFastq(filts, n=100000, verbose=TRUE)
+
+    cat(paste0("Denoising ${readmode} reads: pool:", pool, "\\n"))
+    dds <- dada(dereps, err=err, multithread=${task.cpus}, pool=pool)
+
+    saveRDS(dds, "all.dd.${readmode}.RDS")
     """
 
     stub:
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
+
     // TODO nf-core: A stub section should mimic the execution of the original module as best as possible
     //               Have a look at the following examples:
     //               Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63
     //               Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54
     """
-    touch ${prefix}.bam
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        dadainfer: \$(samtools --version |& sed '1!d ; s/samtools //')
-    END_VERSIONS
+    # add some real stuff here
+    touch all.dd.${readmode}.RDS
     """
 }
diff --git a/modules/local/filterandtrim.nf b/modules/local/filterandtrim.nf
@@ -17,7 +17,7 @@
 
 process FILTERANDTRIM {
     tag "$meta.id"
-    label 'FILTERANDTRIM'
+    label 'process_medium'
 
     // TODO nf-core: List required Conda package(s).
     //               Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10").
@@ -37,8 +37,13 @@ process FILTERANDTRIM {
 
     output:
     // TODO nf-core: Named file extensions MUST be emitted for ALL output channels
-    tuple val(meta), path("*.fastq.gz"), emit: trimmed_reads
-    tuple val(meta), path("*.txt"), emit: trimmed_report
+    // tuple val(meta), path("*.fastq.gz"), emit: trimmed_reads
+    // tuple val(meta), path("*.txt"), emit: trimmed_report
+    tuple val(meta), path("${meta.id}.R1.filtered.fastq.gz"), optional: true, emit: trimmed_R1
+    tuple val(meta), path("${meta.id}.R2.filtered.fastq.gz"), optional: true, emit: trimmed_R2
+    tuple val(meta), path("${meta.id}.R[12].filtered.fastq.gz"), optional: true, emit: trimmed
+    path("*.trimmed.txt"), emit: trimmed_report
+
     // TODO nf-core: List additional required output channels/values here
     // path "versions.yml"           , emit: versions
 

diff --git a/modules/local/learnerrors.nf b/modules/local/learnerrors.nf
@@ -1,91 +1,75 @@
-// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :)
-//               https://github.com/nf-core/modules/tree/master/modules/nf-core/
-//               You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace:
-//               https://nf-co.re/join
-// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters.
-//               All other parameters MUST be provided using the "task.ext" directive, see here:
-//               https://www.nextflow.io/docs/latest/process.html#ext
-//               where "task.ext" is a string.
-//               Any parameters that need to be evaluated in the context of a particular sample
-//               e.g. single-end/paired-end data MUST also be defined and evaluated appropriately.
-// TODO nf-core: Software that can be piped together SHOULD be added to separate module files
-//               unless there is a run-time, storage advantage in implementing in this way
-//               e.g. it's ok to have a single module for bwa to output BAM instead of SAM:
-//                 bwa mem | samtools view -B -T ref.fasta
-// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty
-//               list (`[]`) instead of a file can be used to work around this issue.
-
 process LEARNERRORS {
-    tag "$meta.id"
-    label 'process_me'
+    tag "$readmode"
+    label 'process_medium'
 
-    // TODO nf-core: List required Conda package(s).
-    //               Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10").
-    //               For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems.
-    // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below.
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE':
-        'biocontainers/YOUR-TOOL-HERE' }"
+    container "ghcr.io/h3abionet/tada:dev"
 
     input:
-    // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group"
-    //               MUST be provided as an input via a Groovy Map called "meta".
-    //               This information may not be required in some instances e.g. indexing reference genome files:
-    //               https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf
-    // TODO nf-core: Where applicable please provide/convert compressed files as input/output
-    //               e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc.
-    tuple val(meta), path(bam)
+    tuple val(readmode), file(reads)
 
     output:
-    // TODO nf-core: Named file extensions MUST be emitted for ALL output channels
-    tuple val(meta), path("*.bam"), emit: bam
     // TODO nf-core: List additional required output channels/values here
-    path "versions.yml"           , emit: versions
+    // path "versions.yml"           , emit: versions
+    tuple val(readmode), file("errors.${readmode}.RDS"), emit: error_models
+    // path("errors.R[12].RDS"), emit: errorModelsPerSample
+    path("${readmode}.err.pdf"), emit: pdf
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10
-    //               If the software is unable to output a version number on the command-line then it can be manually specified
-    //               e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf
-    //               Each software used MUST provide the software name and version number in the YAML version file (versions.yml)
-    // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive
-    // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter
-    //               using the Nextflow "task" variable e.g. "--threads $task.cpus"
-    // TODO nf-core: Please replace the example samtools command below with your module's command
-    // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;)
+    def prefix = task.ext.prefix
+    def derepreads = 100000
+    // TODO: not even sure the below will work w/ nf-core style parameters, 
+    // may need to go to a string or args-type modifications
+    def dadaOpt = !params.dadaOpt.isEmpty() ? "'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA'
+    // TODO: will need to if-else this when implementing alternative error models
     """
-    samtools \\
-        sort \\
-        $args \\
-        -@ $task.cpus \\
-        -o ${prefix}.bam \\
-        -T $prefix \\
-        $bam
+    #!/usr/bin/env Rscript
+    suppressPackageStartupMessages(library(dada2))
+
+    dadaOpt <- ${dadaOpt}
+
+    if (!is.na(dadaOpt)) {
+        setDadaOpt(dadaOpt)
+        cat("dada Options:\n",dadaOpt,"\n")
+    }
+
+    # File parsing (these come from the process input channel)
+    filts <- list.files('.', pattern=paste0("${readmode}",".filtered.fastq.gz"), full.names = TRUE)
+    sample.names <- sapply(strsplit(basename(filts), "_"), `[`, 1) # Assumes filename = samplename_XXX.fastq.gz
+    set.seed(100)
+
+    # Learn forward error rates
+    err <- learnErrors(filts, multithread=${task.cpus}, verbose=1)
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        learnerrors: \$(samtools --version |& sed '1!d ; s/samtools //')
-    END_VERSIONS
+    # This is a rough correction for NovaSeq binning issues
+    # See https://github.com/h3abionet/TADA/issues/31, we'll likely
+    # add alternatives here soon
+
+    if (as.logical("${params.quality_binning}") == TRUE ) {
+        # TODO: add alternatives for binning
+        print("Running binning correction")
+        errs <- t(apply(getErrors(err), 1, function(x) { x[x < x[40]] = x[40]; return(x)} ))
+        err\$err_out <- errs
+    }
+
+    pdf(paste0("${readmode}",".err.pdf"))
+    plotErrors(err, nominalQ=TRUE)
+    dev.off()
+
+    saveRDS(err, paste0("errors.","${readmode}",".RDS"))
     """
 
     stub:
     def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
+    def prefix = task.ext.prefix
     // TODO nf-core: A stub section should mimic the execution of the original module as best as possible
     //               Have a look at the following examples:
     //               Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63
     //               Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54
     """
-    touch ${prefix}.bam
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        learnerrors: \$(samtools --version |& sed '1!d ; s/samtools //')
-    END_VERSIONS
+    # TODO: make a proper stub
     """
 }