diff --git a/conf/modules.config b/conf/modules.config index 2ae6a0b..ffb609b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -25,12 +25,12 @@ process { withName: PLOTQUALITYPROFILE { publishDir = [ [ - path: { "${params.outdir}/Plots/" }, + path: { "${params.outdir}/Plots/QualityProfile" }, mode: params.publish_dir_mode, pattern: "*.pdf" ], [ - path: { "${params.outdir}/R/Plots/" }, + path: { "${params.outdir}/R/QualityProfile" }, mode: params.publish_dir_mode, pattern: "*.RDS" ] @@ -44,6 +44,28 @@ process { ] } + withName: MERGETRIMTABLES { + publishDir = [ + path: { "${params.outdir}/TrimmedReads/" }, + mode: params.publish_dir_mode + ] + } + + withName: LEARNERRORS { + publishDir = [ + [ + path: { "${params.outdir}/Plots/ErrorModel/" }, + mode: params.publish_dir_mode, + pattern: "*.pdf" + ], + [ + path: { "${params.outdir}/R/" }, + mode: params.publish_dir_mode, + pattern: "*.RDS" + ] + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/modules/local/dadainfer.nf b/modules/local/dadainfer.nf index 52c04cd..40f2f51 100644 --- a/modules/local/dadainfer.nf +++ b/modules/local/dadainfer.nf @@ -1,91 +1,64 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process DADAINFER { - tag "$meta.id" + tag "$readmode" label 'process_medium' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + container "ghcr.io/h3abionet/tada:dev" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. 
indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + tuple val(readmode), file(err), file(reads) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here - path "versions.yml" , emit: versions + // path "versions.yml" , emit: versions + path("all.dd.${readmode}.RDS"), emit: inferred when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + def dadaOpt = !params.dadaOpt.isEmpty() ? 
"'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA' """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + dadaOpt <- ${dadaOpt} + + if (!is.na(dadaOpt)) { + setDadaOpt(dadaOpt) + cat("dada Options:\\n",dadaOpt,"\\n") + } + + filts <- list.files('.', pattern="${readmode}.filtered.fastq.gz", full.names = TRUE) + + err <- readRDS("${err}") + cat("Processing all samples\\n") - cat <<-END_VERSIONS > versions.yml - "${task.process}": - dadainfer: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + #Variable selection from CLI input flag --pool + pool <- "${params.pool}" + + # 'pool' is a weird flag, either 'pseudo' (string), or T/F (bool) + if(pool != "pseudo"){ + pool <- as.logical(pool) + } + + dereps <- derepFastq(filts, n=100000, verbose=TRUE) + + cat(paste0("Denoising ${readmode} reads: pool:", pool, "\\n")) + dds <- dada(dereps, err=err, multithread=${task.cpus}, pool=pool) + + saveRDS(dds, "all.dd.${readmode}.RDS") """ stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + // TODO nf-core: A stub section should mimic the execution of the original module as best as possible // Have a look at the following examples: // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - dadainfer: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + # add some real stuff here + touch all.dd.${readmode}.RDS """ } diff --git a/modules/local/filterandtrim.nf b/modules/local/filterandtrim.nf index 180c2c4..d5ce8e6 100644 --- a/modules/local/filterandtrim.nf +++ b/modules/local/filterandtrim.nf @@ -17,7 +17,7 @@ process FILTERANDTRIM { tag "$meta.id" - label 'FILTERANDTRIM' + label 'process_medium' // TODO nf-core: List required Conda package(s). // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). @@ -37,8 +37,13 @@ process FILTERANDTRIM { output: // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.fastq.gz"), emit: trimmed_reads - tuple val(meta), path("*.txt"), emit: trimmed_report + // tuple val(meta), path("*.fastq.gz"), emit: trimmed_reads + // tuple val(meta), path("*.txt"), emit: trimmed_report + tuple val(meta), path("${meta.id}.R1.filtered.fastq.gz"), optional: true, emit: trimmed_R1 + tuple val(meta), path("${meta.id}.R2.filtered.fastq.gz"), optional: true, emit: trimmed_R2 + tuple val(meta), path("${meta.id}.R[12].filtered.fastq.gz"), optional: true, emit: trimmed + path("*.trimmed.txt"), emit: trimmed_report + // TODO nf-core: List additional required output channels/values here // path "versions.yml" , emit: versions diff --git a/modules/local/learnerrors.nf b/modules/local/learnerrors.nf index 4be6b96..caaf4fe 100644 --- a/modules/local/learnerrors.nf +++ b/modules/local/learnerrors.nf @@ -1,91 +1,75 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! 
:) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process LEARNERRORS { - tag "$meta.id" - label 'process_me' + tag "$readmode" + label 'process_medium' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + container "ghcr.io/h3abionet/tada:dev" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + tuple val(readmode), file(reads) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here - path "versions.yml" , emit: versions + // path "versions.yml" , emit: versions + tuple val(readmode), file("errors.${readmode}.RDS"), emit: error_models + // path("errors.R[12].RDS"), emit: errorModelsPerSample + path("${readmode}.err.pdf"), emit: pdf when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. 
https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + def prefix = task.ext.prefix + def derepreads = 100000 + // TODO: not even sure the below will work w/ nf-core style parameters, + // may need to go to a string or args-type modifications + def dadaOpt = !params.dadaOpt.isEmpty() ? "'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA' + // TODO: will need to if-else this when implementing alternative error models """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + dadaOpt <- ${dadaOpt} + + if (!is.na(dadaOpt)) { + setDadaOpt(dadaOpt) + cat("dada Options:\n",dadaOpt,"\n") + } + + # File parsing (these come from the process input channel) + filts <- list.files('.', pattern=paste0("${readmode}",".filtered.fastq.gz"), full.names = TRUE) + sample.names <- sapply(strsplit(basename(filts), "_"), `[`, 1) # Assumes filename = samplename_XXX.fastq.gz + set.seed(100) + + # Learn forward error rates + err <- learnErrors(filts, multithread=${task.cpus}, verbose=1) - cat <<-END_VERSIONS > versions.yml - "${task.process}": - learnerrors: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + # This is a rough correction for NovaSeq binning issues + # See https://github.com/h3abionet/TADA/issues/31, we'll likely + # add alternatives here soon + + if (as.logical("${params.quality_binning}") == TRUE ) { + # TODO: add alternatives for binning + print("Running binning correction") + errs <- t(apply(getErrors(err), 1, function(x) { x[x < x[40]] = x[40]; return(x)} )) + err\$err_out <- errs + } + + pdf(paste0("${readmode}",".err.pdf")) + plotErrors(err, nominalQ=TRUE) + dev.off() + + saveRDS(err, paste0("errors.","${readmode}",".RDS")) """ stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix // TODO nf-core: A stub section should mimic the execution of the original module as best as possible // Have a look at the following examples: // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - learnerrors: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + # TODO: make a proper stub """ } diff --git a/modules/local/mergetrimtables.nf b/modules/local/mergetrimtables.nf index d7037d8..4d255dd 100644 --- a/modules/local/mergetrimtables.nf +++ b/modules/local/mergetrimtables.nf @@ -1,91 +1,45 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! 
:) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. -// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process MERGETRIMTABLES { - tag "$meta.id" label 'process_low' - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + container "ghcr.io/h3abionet/tada:dev" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(bam) + path(trimData) output: + path("all.trimmed.csv"), emit: trimmed_report // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - tuple val(meta), path("*.bam"), emit: bam + // tuple val(meta), path("*.bam"), emit: bam // TODO nf-core: List additional required output channels/values here - path "versions.yml" , emit: versions + // path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. 
https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + def prefix = task.ext.prefix """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam + #!/usr/bin/env Rscript + + # Main purpose of this script is to merge all trimming data into one table - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mergetrimtables: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + trimmedFiles <- list.files(path = '.', pattern = '*.trimmed.txt') + sample.names <- sub('.trimmed.txt', '', trimmedFiles) + trimmed <- do.call("rbind", lapply(trimmedFiles, function (x) as.data.frame(read.csv(x)))) + colnames(trimmed)[1] <- "Sequence" + trimmed\$SampleID <- sample.names + write.csv(trimmed, "all.trimmed.csv", row.names = FALSE) """ stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix // TODO nf-core: A stub section should mimic the execution of the original module as best as possible // Have a look at the following examples: // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mergetrimtables: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + touch "all.trimmed.csv" """ } diff --git a/modules/local/plotqualityprofile.nf b/modules/local/plotqualityprofile.nf index 8d05d2e..c0e8f10 100644 --- a/modules/local/plotqualityprofile.nf +++ b/modules/local/plotqualityprofile.nf @@ -17,7 +17,7 @@ process PLOTQUALITYPROFILE { tag "$meta.id" - label 'PLOTQUALITYPROFILE' + label 'process_low' // TODO: pin to a versioned docker instance container "ghcr.io/h3abionet/tada:dev" diff --git a/modules/local/pooledseqtable.nf b/modules/local/pooledseqtable.nf index e6820b2..6185e22 100644 --- a/modules/local/pooledseqtable.nf +++ b/modules/local/pooledseqtable.nf @@ -1,47 +1,18 @@ -// TODO nf-core: If in doubt look at other nf-core/modules to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/modules/nf-core/ -// You can also ask for help via your pull request or on the #modules channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A module file SHOULD only define input and output files as command-line parameters. -// All other parameters MUST be provided using the "task.ext" directive, see here: -// https://www.nextflow.io/docs/latest/process.html#ext -// where "task.ext" is a string. -// Any parameters that need to be evaluated in the context of a particular sample -// e.g. single-end/paired-end data MUST also be defined and evaluated appropriately. 
-// TODO nf-core: Software that can be piped together SHOULD be added to separate module files -// unless there is a run-time, storage advantage in implementing in this way -// e.g. it's ok to have a single module for bwa to output BAM instead of SAM: -// bwa mem | samtools view -B -T ref.fasta -// TODO nf-core: Optional inputs are not currently supported by Nextflow. However, using an empty -// list (`[]`) instead of a file can be used to work around this issue. - process POOLEDSEQTABLE { - tag '$bam' - label 'process_m' - - // TODO nf-core: List required Conda package(s). - // Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - // For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - // TODO nf-core: See section in main README for further information regarding finding and adding container addresses to the section below. - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - 'biocontainers/YOUR-TOOL-HERE' }" + label 'process_medium' + + container "ghcr.io/h3abionet/tada:dev" input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/modules/nf-core/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - path bam + path(dds) + path(filts) output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - path "*.bam", emit: bam + path("seqtab.filtered.RDS"), emit: seqtable + path("all.merged.RDS"), optional: true, emit: merged + path("seqtab.full.RDS"), emit: seqtabQC// we keep this for comparison and possible QC // TODO nf-core: List additional required output channels/values here - path "versions.yml" , emit: versions + // path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -49,41 +20,195 @@ process POOLEDSEQTABLE { script: def args = task.ext.args ?: '' - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/modules/nf-core/homer/annotatepeaks/main.nf - // Each software used MUST provide the software name and version number in the YAML version file (versions.yml) - // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "task.ext.args" directive - // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter - // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please replace the example samtools command below with your module's command - // TODO nf-core: Please indent the command appropriately (4 spaces!!) 
to help with readability ;) """ - samtools \\ - sort \\ - $args \\ - -@ $task.cpus \\ - $bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pooledseqtable: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + #!/usr/bin/env Rscript + suppressPackageStartupMessages(library(dada2)) + + rescuePairs <- as.logical("${params.rescue_unmerged}") + + mergePairsRescue <- function(dadaF, derepF, dadaR, derepR, + minOverlap = 12, maxMismatch=0, returnRejects=FALSE, + propagateCol=character(0), justConcatenate=FALSE, + trimOverhang=FALSE, verbose=FALSE, rescueUnmerged=FALSE, ...) { + if(is(dadaF, "dada")) dadaF <- list(dadaF) + if(is(dadaR, "dada")) dadaR <- list(dadaR) + if(is(derepF, "derep")) derepF <- list(derepF) + else if(is(derepF, "character") && length(derepF)==1 && dir.exists(derepF)) derepF <- parseFastqDirectory(derepF) + if(is(derepR, "derep")) derepR <- list(derepR) + else if(is(derepR, "character") && length(derepR)==1 && dir.exists(derepR)) derepR <- parseFastqDirectory(derepR) + + if( !(is.list.of(dadaF, "dada") && is.list.of(dadaR, "dada")) ) { + stop("dadaF and dadaR must be provided as dada-class objects or lists of dada-class objects.") + } + if( !( (is.list.of(derepF, "derep") || is(derepF, "character")) && + (is.list.of(derepR, "derep") || is(derepR, "character")) )) { + stop("derepF and derepR must be provided as derep-class objects or as character vectors of filenames.") + } + nrecs <- c(length(dadaF), length(derepF), length(dadaR), length(derepR)) + if(length(unique(nrecs))>1) stop("The dadaF/derepF/dadaR/derepR arguments must be the same length.") + + rval <- lapply(seq_along(dadaF), function (i) { + mapF <- getDerep(derepF[[i]])\$map + mapR <- getDerep(derepR[[i]])\$map + if(!(is.integer(mapF) && is.integer(mapR))) stop("Incorrect format of \$map in derep-class arguments.") + # if(any(is.na(rF)) || any(is.na(rR))) stop("Non-corresponding maps and dada-outputs.") + if(!(length(mapF) == length(mapR) && + max(mapF, na.rm=TRUE) == length(dadaF[[i]]\$map) && + max(mapR, na.rm=TRUE) == length(dadaR[[i]]\$map))) { + stop("Non-corresponding derep-class and dada-class objects.") + } + rF <- dadaF[[i]]\$map[mapF] + rR <- dadaR[[i]]\$map[mapR] + + pairdf <- data.frame(sequence = "", abundance=0, forward=rF, reverse=rR) + ups <- unique(pairdf) # The unique forward/reverse pairs of denoised sequences + keep <- !is.na(ups\$forward) & !is.na(ups\$reverse) + ups <- ups[keep, ] + if (nrow(ups)==0) { + outnames <- c("sequence", "abundance", "forward", "reverse", + "nmatch", "nmismatch", "nindel", "prefer", "accept") + ups <- data.frame(matrix(ncol = length(outnames), nrow = 0)) + names(ups) <- outnames + if(verbose) { + message("No paired-reads (in ZERO unique pairings) successfully merged out of ", nrow(pairdf), " pairings) input.") + } + return(ups) + } else { + Funqseq <- unname(as.character(dadaF[[i]]\$clustering\$sequence[ups\$forward])) + Runqseq <- rc(unname(as.character(dadaR[[i]]\$clustering\$sequence[ups\$reverse]))) + if (justConcatenate == TRUE) { + # Simply concatenate the sequences together + ups\$sequence <- mapply(function(x,y) paste0(x,"NNNNNNNNNN", y), + Funqseq, Runqseq, SIMPLIFY=FALSE); + ups\$nmatch <- 0 + ups\$nmismatch <- 0 + ups\$nindel <- 0 + ups\$prefer <- NA + ups\$accept <- TRUE + } else { + # Align forward and reverse reads. 
+ # Use unbanded N-W align to compare forward/reverse + # Adjusting align params to prioritize zero-mismatch merges + tmp <- getDadaOpt(c("MATCH", "MISMATCH", "GAP_PENALTY")) + if(maxMismatch==0) { + setDadaOpt(MATCH=1L, MISMATCH=-64L, GAP_PENALTY=-64L) + } else { + setDadaOpt(MATCH=1L, MISMATCH=-8L, GAP_PENALTY=-8L) + } + alvecs <- mapply(function(x,y) nwalign(x,y,band=-1,...), Funqseq, Runqseq, SIMPLIFY=FALSE) + setDadaOpt(tmp) + outs <- t(sapply(alvecs, function(x) C_eval_pair(x[1], x[2]))) + ups\$nmatch <- outs[,1] + ups\$nmismatch <- outs[,2] + ups\$nindel <- outs[,3] + ups\$prefer <- 1 + (dadaR[[i]]\$clustering\$n0[ups\$reverse] > dadaF[[i]]\$clustering\$n0[ups\$forward]) + ups\$accept <- (ups\$nmatch >= minOverlap) & ((ups\$nmismatch + ups\$nindel) <= maxMismatch) + # Make the sequence + ups\$sequence <- mapply(C_pair_consensus, sapply(alvecs,`[`,1), sapply(alvecs,`[`,2), ups\$prefer, trimOverhang); + # Additional param to indicate whether 1:forward or 2:reverse takes precedence + # Must also strip out any indels in the return + # This function is only used here. + } + + # Add abundance and sequence to the output data.frame + tab <- table(pairdf\$forward, pairdf\$reverse) + ups\$abundance <- tab[cbind(ups\$forward, ups\$reverse)] + if (rescueUnmerged == TRUE) { + rescue <- which(!ups\$accept) + ups\$sequence[rescue] <- mapply(function(x,y) paste0(x,"NNNNNNNNNN", y), + Funqseq[rescue], Runqseq[rescue], SIMPLIFY=FALSE); + } else { + ups\$sequence[!ups\$accept] <- "" + } + # Add columns from forward/reverse clustering + propagateCol <- propagateCol[propagateCol %in% colnames(dadaF[[i]]\$clustering)] + for(col in propagateCol) { + ups[,paste0("F.",col)] <- dadaF[[i]]\$clustering[ups\$forward,col] + ups[,paste0("R.",col)] <- dadaR[[i]]\$clustering[ups\$reverse,col] + } + # Sort output by abundance and name + ups <- ups[order(ups\$abundance, decreasing=TRUE),] + rownames(ups) <- NULL + if(verbose) { + message(sum(ups\$abundance[ups\$accept]), " paired-reads (in ", sum(ups\$accept), " unique pairings) successfully merged out of ", sum(ups\$abundance), " (in ", nrow(ups), " pairings) input.") + } + if(!returnRejects) { ups <- ups[ups\$accept,] } + + if(any(duplicated(ups\$sequence))) { + message("Duplicate sequences in merged output.") + } + return(ups) + } + }) + if(!is.null(names(dadaF))) names(rval) <- names(dadaF) + if(length(rval) == 1) rval <- rval[[1]] + + return(rval) + } + + # this is to ensure this acts as if in the dada2 package (so uses any functions definted within) + environment(mergePairsRescue) <- asNamespace('dada2') + + filtFs <- list.files('.', pattern="R1.filtered.fastq.gz", full.names = TRUE) + filtRs <- list.files('.', pattern="R2.filtered.fastq.gz", full.names = TRUE) + + # read in denoised reads for both + ddFs <- readRDS("all.dd.R1.RDS") + + if (file.exists("all.dd.R2.RDS")) { + + ddRs <- readRDS("all.dd.R2.RDS") + + mergers <- if(rescuePairs) { + mergePairsRescue(ddFs, filtFs, ddRs, filtRs, + returnRejects = TRUE, + minOverlap = ${params.min_overlap}, + maxMismatch = ${params.max_mismatch}, + trimOverhang = as.logical("${params.trim_overhang}"), + justConcatenate = as.logical("${params.just_concatenate}"), + rescueUnmerged=rescuePairs, + verbose = TRUE + ) + } else { + mergePairs(ddFs, filtFs, ddRs, filtRs, + returnRejects = TRUE, + minOverlap = ${params.min_overlap}, + maxMismatch = ${params.max_mismatch}, + trimOverhang = as.logical("${params.trim_overhang}"), + justConcatenate = as.logical("${params.just_concatenate}"), + verbose = TRUE + ) + } + + # TODO: make 
this a single item list with ID as the name, this is lost + # further on + saveRDS(mergers, "all.merged.RDS") + + # go ahead and make seqtable + seqtab <- makeSequenceTable(mergers) + } else { + # No merging, just make a seqtable off R1 only + seqtab <- makeSequenceTable(ddFs) + } + + saveRDS(seqtab, "seqtab.full.RDS") + + # this is an optional filtering step to remove *merged* sequences based on + # min/max length criteria + if (${params.min_asv_len} > 0) { + seqtab <- seqtab[,nchar(colnames(seqtab)) >= ${params.min_asv_len}] + } + + if (${params.max_asv_len} > 0) { + seqtab <- seqtab[,nchar(colnames(seqtab)) <= ${params.max_asv_len}] + } + + saveRDS(seqtab, "seqtab.filtered.RDS") """ stub: def args = task.ext.args ?: '' - - // TODO nf-core: A stub section should mimic the execution of the original module as best as possible - // Have a look at the following examples: - // Simple example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bcftools/annotate/main.nf#L47-L63 - // Complex example: https://github.com/nf-core/modules/blob/818474a292b4860ae8ff88e149fbcda68814114d/modules/nf-core/bedtools/split/main.nf#L38-L54 """ - touch ${prefix}.bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pooledseqtable: \$(samtools --version |& sed '1!d ; s/samtools //') - END_VERSIONS + # add some real stuff here """ } diff --git a/nextflow.config b/nextflow.config index b86c439..419f5a9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,11 +60,11 @@ params { max_read_len = 5000 // default, this can be coersed in R using as.numeric min_read_len = 50 // default // I think we can make these bool 'false' as above with R coersion (either through as.logical or using optparse in a Rscript) - rmPhiX = false + rmPhiX = false // Error model - qualityBinning = false // false, set to true if using binned qualities (NovaSeq) - errorModel = 'illumina' // NYI, thinking about best way to implement this + quality_binning = false // false, set to true if using binned qualities (NovaSeq) + error_model = 'illumina' // NYI, thinking about best way to implement this // Merging // paired_type = "overlapping" // allowed: 'overlapping' (paired reads overlap), 'separate' (paired reads are non-overlapping), or 'mix' (variable length) @@ -74,7 +74,7 @@ params { just_concatenate = false // TODO: test using false instead of string // CF: this is for rescuing unmerged ITS, should // be off unless really needed, and even then it's questionable. But it is requested sometimes - // rescueUnmerged = false + rescue_unmerged = false dadaOpt = [] max_asv_len = 0 // Only run if set > 1 min_asv_len = 0 // Only run if set > 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index e097502..75c9487 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -182,7 +185,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -330,7 +340,7 @@ "default": 0 }, "max_read_len": { - "type": "string", + "type": "integer", "default": 5000 }, "min_read_len": { @@ -340,13 +350,6 @@ "rmPhiX": { "type": "boolean" }, - "qualityBinning": { - "type": "boolean" - }, - "errorModel": { - "type": "string", - "default": "illumina" - }, "min_overlap": { "type": "integer", "default": 20 @@ -424,6 +427,16 @@ "id_type": { "type": "string", "default": "md5" + }, + "quality_binning": { + "type": "boolean" + }, + "error_model": { + "type": "string", + "default": "illumina" + }, + "rescue_unmerged": { + "type": "boolean" } } -} +} \ No newline at end of file diff --git a/workflows/tada.nf b/workflows/tada.nf index 0f8364c..771a166 100644 --- a/workflows/tada.nf +++ b/workflows/tada.nf @@ -11,6 +11,10 @@ include { PLOTQUALITYPROFILE } from '../modules/local/plotqualityprofile' // should be moved to a subworkflow // include { FILTERANDTRIM } from '../subworkflows/local/' include { FILTERANDTRIM } from '../modules/local/filterandtrim' +include { MERGETRIMTABLES } from '../modules/local/mergetrimtables' +include { LEARNERRORS } from '../modules/local/learnerrors' +include { DADAINFER } from '../modules/local/dadainfer' +include { POOLEDSEQTABLE } from '../modules/local/pooledseqtable' include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -73,6 +77,7 @@ workflow TADA { FASTQC ( ch_samplesheet ) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) @@ -81,12 +86,6 @@ workflow TADA { PLOTQUALITYPROFILE ( ch_samplesheet ) - - FILTERANDTRIM ( - ch_samplesheet - ) - - // ch_multiqc_files = ch_multiqc_files.mix(PLOTQUALITYPROFILE.out.zip.collect{it[1]}) // ch_versions = ch_versions.mix(PLOTQUALITYPROFILE.out.versions.first()) @@ -98,13 +97,203 @@ workflow TADA { // DADA2 filterAndTrim // Alternative est error filtering - // Subworkflows-Denoising: + FILTERANDTRIM ( + ch_samplesheet + ) + + ch_trimmed_reads = FILTERANDTRIM.out.trimmed + ch_reports = FILTERANDTRIM.out.trimmed_report.collect() + + + MERGETRIMTABLES( + ch_reports + ) + + // Channel setup + + // We need to group data depending on which downstream steps are needed. There + // are two combinations possible + + // 1. The immediate downstream QC steps can use the meta info and the read pairs. + // Instead of doing handstands reusing the two channels above, we emit channels + // with the reads paired if needed. + + // 2. LearnErrors and the pooled denoising branch requires all R1 and all R2, but + // the two groups can be processed in parallel. So we set up the channels with + // this in mind. No sample ID info is really needed. 
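+    //
+    //    As an illustrative sketch (hypothetical sample IDs, not output from a real
+    //    run), for paired-end samples s1 and s2 the grouped channel should look like:
+    //      ['R1', [s1.R1.filtered.fastq.gz, s2.R1.filtered.fastq.gz]]
+    //      ['R2', [s1.R2.filtered.fastq.gz, s2.R2.filtered.fastq.gz]]
+    //    i.e. one tuple per read direction, matching the val(readmode) inputs of
+    //    LEARNERRORS and DADAINFER below.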
+ ch_trimmed_infer = FILTERANDTRIM.out.trimmed_R1 + // .concat(filteredReadsR2.ifEmpty([])) + .map { [ 'R1', it[1]] } + .concat(FILTERANDTRIM.out.trimmed_R2.map {['R2', it[1]] } ) + .groupTuple(sort: true) + + // Subworkflows-Denoising: // DADA2 // Pooled // Per-sample // Denoising (proposed alt): VSEARCH/Deblur // Per-sample + // TODO: This can go into a DADA2-specific subworkflow + LEARNERRORS ( + ch_trimmed_infer + ) + + ch_infer = LEARNERRORS.out.error_models.join(ch_trimmed_infer) + + // this is always in pooled mode at the moment, should be adjusted + DADAINFER( + ch_infer + ) + + ch_trimmed = ch_trimmed_infer + .map { it[1] } + .flatten() + .collect() + + POOLEDSEQTABLE( + DADAINFER.out.inferred.collect(), + ch_trimmed + ) + // if (params.pool == "T" || params.pool == 'pseudo') { + + // process DadaInfer { + // tag { "DadaInfer:${readmode}" } + // publishDir "${params.outdir}/dada2-Derep-Pooled", mode: "copy", overwrite: true + + // input: + // // DADA2 step runs on all R1 and/or on all R2 + // tuple val(readmode), file(err), file(reads) from errorModelsPooled + // .join(ReadsInfer) + + // output: + // // Note that the mode ('merged', 'R1', 'R2') can now potentially allow SE read analysis + // file("all.dd.${readmode}.RDS") into dadaMerge,dadaToReadTracking + + // when: + // params.precheck == false + + // script: + // dadaOpt = !params.dadaOpt.isEmpty() ? "'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA' + // template "DadaPooled.R" + // } + + // // This one is a little tricky. We can't know a priori how many instances of reads (R1 and R2) + // // are present outside the process, but we can determine this internally within the process + // // when we collect all of them. + // // So, here we check the size of the collected channel containing the denoised models; if + // // there are two then this is a paired-end run, otherwise it's single-end. Logic is in the R script + + // process PooledSeqTable { + // tag { "PooledSeqTable:${readmode}" } + // publishDir "${params.outdir}/dada2-OriginalSeqTable", mode: "copy", overwrite: true + + // input: + // // we don't care about the mode here, so only get the dds (dada-inferred) RDS files + // file(dds) from dadaMerge.collect() + // // we don't care about the mode here, we only grab the reads + // file(filts) from ReadsMerge + // .map { it[1] } + // .flatten() + // .collect() + + // output: + // tuple val(readmode), file("seqtab.${readmode}.RDS") into seqTable,rawSeqTableToRename + // file "all.merged.RDS" optional true into mergerTracking,mergerQC + // file "seqtab.original.*.RDS" into seqtabQC// we keep this for comparison and possible QC + + // when: + // params.precheck == false + + // script: + // // We could switch this to 'paired' vs 'single-end' as well + // readmode = dds.size() == 2 ? 'merged' : 'R1' + // template "SeqTables.R" + // } + // } else { + + // process PerSampleInferDerepAndMerge { + // tag { "PerSampleInferDerepAndMerge:${meta.id}" } + // publishDir "${params.outdir}/dada2-Derep-Single/Per-Sample", mode: "copy", overwrite: true + + // input: + // tuple val(meta), file(reads) from readsToPerSample + // file(errs) from errorModelsPerSample.collect() + + // output: + // file("${meta.id}.{R1,merged}.RDS") into combinedReads + // tuple val(meta), file("${meta.id}.dd.R{1,2}.RDS") into perSampleDadaToMerge + // val(readmode) into modeSeqTable + + // when: + // params.precheck == false + + // script: + // dadaOpt = !params.dadaOpt.isEmpty() ? 
"'${params.dadaOpt.collect{k,v->"$k=$v"}.join(", ")}'" : 'NA' + // readmode = errs.size() == 2 ? 'merged' : 'R1' + // template "PerSampleDadaInfer.R" + // } + + // process MergeDadaRDS { + // tag { "mergeDadaRDS" } + // publishDir "${params.outdir}/dada2-Derep-Single", mode: "copy", overwrite: true + + // input: + // file(dds) from perSampleDadaToMerge + // .map { it[1] } + // .flatten() + // .collect() + + // output: + // file("all.dd.R{1,2}.RDS") into dadaToReadTracking + + // when: + // params.precheck == false + + // script: + // template "MergePerSampleDada.R" + // } + + // process SequenceTable { + // tag { "SequenceTable:${readmode}" } + // publishDir "${params.outdir}/dada2-Derep-Single", mode: "copy", overwrite: true + + // input: + // file(mr) from combinedReads.collect() + // val(readmode) from modeSeqTable.first() + + // output: + // tuple val(readmode), file("seqtab.${readmode}.RDS") into seqTable,rawSeqTableToRename + // file "all.merged.RDS" optional true into mergerTracking,mergerQC + // file "seqtab.original.${readmode}.RDS" into seqtabQC // we keep this for comparison and possible QC + + // when: + // params.precheck == false + + // script: + // template "PerSampleSeqTable.R" + // } + // } + + // } else if (params.seqTables) { // TODO maybe we should check the channel here + // process MergeSeqTables { + // tag { "MergeSeqTables" } + // publishDir "${params.outdir}/dada2-MergedSeqTable", mode: 'copy' + + // input: + // file(st) from dada2SeqTabs + // .map { it[1] } + // .collect() + + // output: + // tuple val("merged"), file("seqtab.merged.RDS") into seqTable, rawSeqTableToRename + + // script: + // template "MergeSeqTables.R" + // } + // Channel.empty().into { SEChimera;RawSEChimeraToRename;trimmedReadTracking;dadaToReadTracking;mergerTracking;mergerQC } + // } + // Subworkflows-Taxonomic assignment (optional) // Subworkflows-Alignment + Phylogenetic Tree