diff --git a/CHANGELOG.md b/CHANGELOG.md
index a587782b..bdea0e5b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
+- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collisions for CheckM, CAT, GTDB-Tk, and QUAST output (by @maxibor)
+
 ### `Dependencies`
 
 ### `Deprecated`
diff --git a/conf/modules.config b/conf/modules.config
index c87d71d9..d2ebe7b2 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -377,8 +377,8 @@ process {
     }
 
     withName: 'CHECKM_LINEAGEWF' {
-        tag        = { "${meta.assembler}-${meta.binner}-${meta.id}" }
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" }
+        tag        = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" }
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
             mode: params.publish_dir_mode,
@@ -387,7 +387,7 @@ process {
     }
 
     withName: 'CHECKM_QA' {
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }
         ext.args   = "-o 2 --tab_table"
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
@@ -458,6 +458,7 @@ process {
 
     withName: GTDBTK_CLASSIFYWF {
         ext.args   = "--extension fa"
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
         publishDir = [
             path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
             mode: params.publish_dir_mode,
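// Aside — a minimal, plain-Groovy sketch (the meta values below are made up) of how the
// tag/ext.prefix closures above resolve against a task's meta map, and why adding
// meta.domain and meta.refinement keeps published file names from colliding when the same
// assembler/binner/sample combination occurs more than once downstream.
def prefix = { meta -> "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }

def prokaryotic = [assembler: 'SPAdes', binner: 'MaxBin2', domain: 'prokarya', refinement: 'unrefined', id: 'sample1']
def eukaryotic  = [assembler: 'SPAdes', binner: 'MaxBin2', domain: 'eukarya',  refinement: 'unrefined', id: 'sample1']

// Under the old "${meta.assembler}-${meta.binner}-${meta.id}" scheme both tasks would
// publish to the same file name; the two extra keys keep the prefixes distinct.
assert prefix(prokaryotic) == 'SPAdes-MaxBin2-prokarya-unrefined-sample1_qa'
assert prefix(eukaryotic)  == 'SPAdes-MaxBin2-eukarya-unrefined-sample1_qa'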
diff --git a/docs/output.md b/docs/output.md
index 1061870f..c8eeee14 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -476,6 +476,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
   - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format
   - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
 - `GenomeBinning/QC/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition.
   - `quast_summary.tsv`: QUAST output for all bins summarized
@@ -531,9 +532,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place
 Output files
 
 - `GenomeBinning/QC/CheckM/`
-  - `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
-  - `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
-  - `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be the main file you use to evaluate your results.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
   - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
@@ -581,14 +582,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req
 Output files
 
 - `Taxonomy/CAT/[assembler]/[binner]/`
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
-  - `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
 - `Taxonomy/CAT/[assembler]/[binner]/raw/`
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
-  - `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
-  - `[assembler]-[binner]-[sample/group].log`: Log files
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
@@ -609,14 +610,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
 Output files
 
 - `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/`
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html).
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
-- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
+  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
+- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
diff --git a/modules.json b/modules.json
index 73a43c4d..e9162243 100644
--- a/modules.json
+++ b/modules.json
@@ -118,7 +118,7 @@
         },
         "gtdbtk/classifywf": {
             "branch": "master",
-            "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df",
+            "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
             "installed_by": ["modules"]
         },
         "gunc/downloaddb": {
diff --git a/modules/local/cat.nf b/modules/local/cat.nf
index 48af75c0..bda355c6 100644
--- a/modules/local/cat.nf
+++ b/modules/local/cat.nf
@@ -1,39 +1,42 @@
 process CAT {
-    tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}"
+    tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}"
 
-    conda "bioconda::cat=4.6 bioconda::diamond=2.0.6"
+    conda "bioconda::cat=5.2.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' :
-        'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }"
+        'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' :
+        'biocontainers/cat:5.2.3--hdfd78af_1' }"
 
     input:
     tuple val(meta), path("bins/*")
     tuple val(db_name), path("database/*"), path("taxonomy/*")
 
     output:
-    path("*.names.txt.gz")                 , emit: tax_classification
-    path("raw/*.ORF2LCA.txt.gz")           , emit: orf2lca
-    path("raw/*.predicted_proteins.faa.gz"), emit: faa
-    path("raw/*.predicted_proteins.gff.gz"), emit: gff
-    path("raw/*.log")                      , emit: log
-    path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
-    path "versions.yml"                    , emit: versions
+    path("*.ORF2LCA.names.txt.gz")            , emit: orf2lca_classification
+    path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
+    path("raw/*.ORF2LCA.txt.gz")              , emit: orf2lca
+    path("raw/*.predicted_proteins.faa.gz")   , emit: faa
+    path("raw/*.predicted_proteins.gff.gz")   , emit: gff
+    path("raw/*.log")                         , emit: log
+    path("raw/*.bin2classification.txt.gz")   , emit: tax_classification_taxids
+    path "versions.yml"                       , emit: versions
 
     script:
     def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
    """
-    CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing
+    CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
 
     mkdir raw
     mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
 
-    gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt"
+    gzip "raw/${prefix}.ORF2LCA.txt" \
+        "raw/${prefix}.concatenated.predicted_proteins.faa" \
+        "raw/${prefix}.concatenated.predicted_proteins.gff" \
+        "raw/${prefix}.bin2classification.txt" \
+        "${prefix}.ORF2LCA.names.txt" \
+        "${prefix}.bin2classification.names.txt"
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
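// Aside — a stand-alone Nextflow sketch (not the real CAT module; EXAMPLE_TOOL and the
// echo command are stand-ins) of the ext.args / ext.prefix pattern the module above now
// follows: values configured in conf/modules.config arrive via task.ext, the ?: fallback
// only applies when nothing is configured, and `prefix` is set without `def` so the
// output block can reference it (the same trick the GTDB-Tk module below uses).
process EXAMPLE_TOOL {
    input:
    tuple val(meta), path(bins)

    output:
    path("${prefix}.txt")

    script:
    def args = task.ext.args   ?: ''                                             // extra CLI flags from modules.config
    prefix   = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.id}"  // collision-safe default
    """
    echo "would run: example_tool ${args} -o ${prefix}.txt ${bins}" > "${prefix}.txt"
    """
}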
"${meta.assembler}-${meta.binner}-${meta.id}" + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" conda "bioconda::quast=5.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -15,15 +15,16 @@ process QUAST_BINS { path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" """ BINS=\$(echo \"$bins\" | sed 's/[][]//g') IFS=', ' read -r -a bins <<< \"\$BINS\" for bin in \"\${bins[@]}\"; do metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}" - if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then - cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" + if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then + cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv" else - tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" + tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv" fi done diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 0b6b76cc..00da4459 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -1,28 +1,29 @@ process GTDBTK_CLASSIFYWF { - tag "${meta.assembler}-${meta.id}" + tag "${prefix}" label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::gtdbtk=2.1.1" + conda "bioconda::gtdbtk=2.3.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
-        'https://depot.galaxyproject.org/singularity/gtdbtk:2.1.1--pyhdfd78af_1' :
-        'biocontainers/gtdbtk:2.1.1--pyhdfd78af_1' }"
+        'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' :
+        'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }"
 
     input:
    tuple val(meta), path("bins/*")
    tuple val(db_name), path("database/*")
+    path(mash_db)
 
     output:
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.summary.tsv"        , emit: summary
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.classify.tree.gz"   , emit: tree
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.markers_summary.tsv", emit: markers
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.msa.fasta.gz"       , emit: msa
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.user_msa.fasta"     , emit: user_msa
-    path "gtdbtk.${meta.assembler}-${meta.id}.*.filtered.tsv"       , emit: filtered
-    path "gtdbtk.${meta.assembler}-${meta.id}.log"                  , emit: log
-    path "gtdbtk.${meta.assembler}-${meta.id}.warnings.log"         , emit: warnings
-    path "gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv"   , emit: failed
-    path "versions.yml"                                             , emit: versions
+    tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv")         , emit: summary
+    tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz")    , emit: tree, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv") , emit: markers, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz")        , emit: msa, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz")   , emit: user_msa, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv")        , emit: filtered, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv")    , emit: failed, optional: true
+    tuple val(meta), path("gtdbtk.${prefix}.log")                   , emit: log
+    tuple val(meta), path("gtdbtk.${prefix}.warnings.log")          , emit: warnings
+    path("versions.yml")                                            , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -30,6 +31,8 @@ process GTDBTK_CLASSIFYWF {
     script:
     def args = task.ext.args ?: ''
     def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : ""
+    def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen"
+    prefix = task.ext.prefix ?: "${meta.id}"
     """
     export GTDBTK_DATA_PATH="\${PWD}/database"
@@ -40,17 +43,25 @@ process GTDBTK_CLASSIFYWF {
     gtdbtk classify_wf \\
         $args \\
         --genome_dir bins \\
-        --prefix "gtdbtk.${meta.assembler}-${meta.id}" \\
+        --prefix "gtdbtk.${prefix}" \\
         --out_dir "\${PWD}" \\
         --cpus $task.cpus \\
-        --pplacer_cpus $params.gtdbtk_pplacer_cpus \\
+        $mash_mode \\
         $pplacer_scratch \\
        --min_perc_aa $params.gtdbtk_min_perc_aa \\
        --min_af $params.gtdbtk_min_af
 
-    gzip "gtdbtk.${meta.assembler}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.id}".*.msa.fasta
-    mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.id}.log"
-    mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.id}.warnings.log"
+    mv classify/* .
+
+    mv identify/* .
+
+    mv align/* .
+
+    mv gtdbtk.log "gtdbtk.${prefix}.log"
+
+    mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log"
+
+    find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -59,18 +70,18 @@ process GTDBTK_CLASSIFYWF {
     """
 
     stub:
-    def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
-
+    def VERSION = '2.3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    prefix = task.ext.prefix ?: "${meta.id}"
     """
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.summary.tsv
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.classify.tree.gz
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.markers_summary.tsv
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.msa.fasta.gz
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.user_msa.fasta
-    touch gtdbtk.${meta.assembler}-${meta.id}.stub.filtered.tsv
-    touch gtdbtk.${meta.assembler}-${meta.id}.log
-    touch gtdbtk.${meta.assembler}-${meta.id}.warnings.log
-    touch gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv
+    touch gtdbtk.${prefix}.stub.summary.tsv
+    touch gtdbtk.${prefix}.stub.classify.tree.gz
+    touch gtdbtk.${prefix}.stub.markers_summary.tsv
+    touch gtdbtk.${prefix}.stub.msa.fasta.gz
+    touch gtdbtk.${prefix}.stub.user_msa.fasta.gz
+    touch gtdbtk.${prefix}.stub.filtered.tsv
+    touch gtdbtk.${prefix}.log
+    touch gtdbtk.${prefix}.warnings.log
+    touch gtdbtk.${prefix}.failed_genomes.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml
index 4e7ec5f1..4319bc74 100644
--- a/modules/nf-core/gtdbtk/classifywf/meta.yml
+++ b/modules/nf-core/gtdbtk/classifywf/meta.yml
@@ -31,6 +31,10 @@ input:
       type: file
       description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy)
       pattern: "*"
+  - mash_db:
+      type: file
+      description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional)
+      pattern: "*.msh"
 
 output:
   - meta:
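// Aside — a plain-Groovy sketch of how the optional mash_db input declared above behaves:
// the calling workflow passes either a Mash sketch file or an empty list, and the module's
// ternary (`def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen"`) picks
// the flag accordingly. The sketch file name below is hypothetical.
def flagFor = { mash_db -> mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" }

assert flagFor([])              == '--skip_ani_screen'            // params.gtdb_mash not set
assert flagFor('gtdb_r214.msh') == '--mash_db gtdb_r214.msh'      // hypothetical local sketch database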
diff --git a/nextflow.config b/nextflow.config
index 89eb7739..4c7b26a9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -88,6 +88,7 @@ params {
     save_cat_db              = false
     skip_gtdbtk              = false
     gtdb_db                  = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz"
+    gtdb_mash                = null
     gtdbtk_min_completeness  = 50.0
     gtdbtk_max_contamination = 10.0
     gtdbtk_min_perc_aa       = 10
@@ -324,7 +325,7 @@ singularity.registry = 'quay.io'
 
 // Nextflow plugins
 plugins {
-    id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet
+    id 'nf-validation@0.3.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet
 }
 
 // Load igenomes.config if required
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 837b4e20..5488af90 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -525,6 +525,10 @@
             "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.",
             "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz"
         },
+        "gtdb_mash": {
+            "type": "string",
+            "description": "Specify the location of a GTDBTK mash database. If not supplied, GTDB-Tk will skip the ANI screening step."
+        },
         "gtdbtk_min_completeness": {
             "type": "number",
             "default": 50.0,
@@ -808,7 +812,7 @@
             "default": "raw_bins_only",
             "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.",
             "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ `both` option is disabled in v2.4 due a bug that will be fixed in a later release.",
-            "enum": ["raw_bins_only", "refined_bins_only"]
+            "enum": ["raw_bins_only", "refined_bins_only", "both"]
         },
         "run_gunc": {
             "type": "boolean",
diff --git a/subworkflows/local/binning_refinement.nf b/subworkflows/local/binning_refinement.nf
index eea8c76a..360bffaa 100644
--- a/subworkflows/local/binning_refinement.nf
+++ b/subworkflows/local/binning_refinement.nf
@@ -25,7 +25,7 @@ workflow BINNING_REFINEMENT {
     // everything here is either unclassified or a prokaryote
     ch_bins = bins
         .map { meta, bins ->
-            def meta_new = meta - meta.subMap('domain')
+            def meta_new = meta - meta.subMap(['domain','refinement'])
             [meta_new, bins]
         }
         .groupTuple()
@@ -88,7 +88,7 @@ workflow BINNING_REFINEMENT {
         .map { meta, bins ->
             def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified'
-            def meta_new = meta + [domain: domain_class]
+            def meta_new = meta + [refinement: 'dastool_refined', domain: domain_class]
             [ meta_new, bins ]
         }
 
@@ -96,14 +96,21 @@ workflow BINNING_REFINEMENT {
         .map { meta, bins ->
             def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified'
-            def meta_new = meta + [binner: 'DASTool', domain: domain_class]
+            def meta_new = meta + [refinement: 'dastool_refined', binner: 'DASTool', domain: domain_class]
             [ meta_new, bins ]
         }
 
     RENAME_POSTDASTOOL ( ch_input_for_renamedastool )
 
+    refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins
+        .map {
+            meta, bins ->
+                def meta_new = meta + [refinement: 'dastool_refined_unbinned']
+                [meta_new, bins]
+        }
+
     emit:
     refined_bins   = ch_dastool_bins_newmeta
-    refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins
+    refined_unbins = refined_unbins
     versions       = ch_versions
 }
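// Aside — a minimal Groovy sketch (illustrative values) of the meta-map idiom used above:
// `meta + [...]` tags a channel element with its refinement status, and
// `meta - meta.subMap([...])` strips those tags again to rebuild a shared grouping key.
def meta = [id: 'sample1', assembler: 'SPAdes', binner: 'MaxBin2', domain: 'prokarya']

def refined = meta + [refinement: 'dastool_refined', binner: 'DASTool']
def joinKey = refined - refined.subMap(['binner', 'domain', 'refinement'])

assert refined.refinement == 'dastool_refined'
assert joinKey == [id: 'sample1', assembler: 'SPAdes']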
diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf
index ce92efd5..a5c3be8d 100644
--- a/subworkflows/local/busco_qc.nf
+++ b/subworkflows/local/busco_qc.nf
@@ -65,6 +65,10 @@ workflow BUSCO_QC {
         BUSCO_SAVE_DOWNLOAD ( ch_downloads )
     }
 
+    busco_summary_domain   = BUSCO.out.summary_domain.collect()
+    busco_summary_specific = BUSCO.out.summary_specific.collect()
+    busco_failed_bin       = BUSCO.out.failed_bin.collect()
+
     BUSCO_SUMMARY (
         BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]),
         BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]),
diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf
index 81c93c6f..012899ad 100644
--- a/subworkflows/local/depths.nf
+++ b/subworkflows/local/depths.nf
@@ -19,21 +19,32 @@ workflow DEPTHS {
     main:
     ch_versions = Channel.empty()
 
+
+    depths.dump(tag: 'depths', pretty: true)
+
     // Compute bin depths for different samples (according to `binning_map_mode`)
-    // Create a new meta joining key first, but copy meta so that
+    // Create a new meta combine key first, but copy meta so that
     // we retain the information about binners and domain classification
     ch_depth_input = bins_unbins
-        .map { meta, bins ->
-                def meta_join = meta - meta.subMap('binner','domain')
-                [ meta_join, meta, bins ]
-            }
-        .combine( depths, by: 0 )
-        .map { meta_join, meta, bins, contig_depths_file ->
-                def meta_new = meta - meta.subMap('domain')
-                [ meta_new, bins, contig_depths_file ]
+        .map {
+            meta, bins ->
+                def meta_combine = meta - meta.subMap('binner','domain','refinement')
+                [meta_combine, meta, bins]
             }
+        .groupTuple()
+        .combine(depths, by: 0)
         .transpose()
+        .map {
+            meta_combine, meta, bins, depth ->
+                def meta_new = meta - meta.subMap('domain','refinement')
+                [meta_new, bins, depth]
+            }
         .groupTuple(by: [0,2])
+        .map {
+            meta, bins, depth ->
+                [meta, bins.unique().flatten(), depth]
+            }
+
 
     MAG_DEPTHS ( ch_depth_input )
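// Aside — a plain-Groovy analogue (toy values, not Nextflow channel code) of the DEPTHS
// join above: bins carrying different binner/domain/refinement tags must all pair with the
// single contig-depths file produced per assembler/sample, so the combine key is the meta
// map minus those three keys.
def depthFiles = [[[id: 's1', assembler: 'SPAdes'], 'SPAdes-s1-depth.txt.gz']]

def binList = [
    [[id: 's1', assembler: 'SPAdes', binner: 'MaxBin2', domain: 'prokarya', refinement: 'unrefined'],       'bin1.fa'],
    [[id: 's1', assembler: 'SPAdes', binner: 'DASTool', domain: 'prokarya', refinement: 'dastool_refined'], 'bin2.fa'],
]

// rough equivalent of: .map { strip keys } .combine(depths, by: 0)
def paired = binList.collect { meta, bin ->
    def key = meta - meta.subMap(['binner', 'domain', 'refinement'])
    [meta, bin, depthFiles.find { it[0] == key }[1]]
}

assert paired.every { it[2] == 'SPAdes-s1-depth.txt.gz' }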
diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf
index 7586b142..2f110a43 100644
--- a/subworkflows/local/gtdbtk.nf
+++ b/subworkflows/local/gtdbtk.nf
@@ -12,6 +12,7 @@ workflow GTDBTK {
     busco_summary  // channel: path
     checkm_summary // channel: path
     gtdb           // channel: path
+    gtdb_mash      // channel: path
 
     main:
     // Filter bins: classify only medium & high quality MAGs
@@ -46,6 +47,7 @@ workflow GTDBTK {
         }
     }
 
+
     // Filter bins based on collected metrics: completeness, contamination
     ch_filtered_bins = bins
         .transpose()
@@ -76,14 +78,17 @@ workflow GTDBTK {
 
     GTDBTK_CLASSIFYWF (
         ch_filtered_bins.passed.groupTuple(),
-        ch_db_for_gtdbtk
+        ch_db_for_gtdbtk,
+        gtdb_mash
     )
 
     GTDBTK_SUMMARY (
         ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]),
         GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]),
-        GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]),
-        GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
+        [],
+        // GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]),
+        []
+        // GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
     )
 
     emit:
diff --git a/workflows/mag.nf b/workflows/mag.nf
index 11449c8c..d5aff32a 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -207,6 +207,7 @@ gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db
 
 if (gtdb) {
     gtdb = file( "${gtdb}", checkIfExists: true)
+    gtdb_mash = params.gtdb_mash ? file("${params.gtdb_mash}", checkIfExists: true) : []
 } else {
     gtdb = []
 }
@@ -781,6 +782,20 @@ workflow MAG {
     * DAS Tool: binning refinement
     */
 
+    ch_binning_results_bins = ch_binning_results_bins
+        .map { meta, bins ->
+            def meta_new = meta + [refinement:'unrefined']
+            [meta_new, bins]
+        }
+
+    ch_binning_results_unbins = ch_binning_results_unbins
+        .map { meta, bins ->
+            def meta_new = meta + [refinement:'unrefined_unbinned']
+            [meta_new, bins]
+        }
+
+
     // If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one
     if ( params.refine_bins_dastool ) {
         ch_prokarya_bins_dastool = ch_binning_results_bins
@@ -801,7 +816,13 @@ workflow MAG {
         }
 
         BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool )
-        ch_refined_bins = ch_eukarya_bins_dastool.mix(BINNING_REFINEMENT.out.refined_bins)
+        // ch_refined_bins = ch_eukarya_bins_dastool
+        //     .map{ meta, bins ->
+        //         def meta_new = meta + [refinement: 'eukaryote_unrefined']
+        //         [meta_new, bins]
+        //     }.mix( BINNING_REFINEMENT.out.refined_bins)
+
+        ch_refined_bins = BINNING_REFINEMENT.out.refined_bins
         ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins
         ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions)
 
@@ -813,10 +834,10 @@ workflow MAG {
             ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins)
         // TODO REACTIVATE ONCE PR #489 IS READY!
         // TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING
-        // } else if ( params.postbinning_input == 'both' ) {
-        //     ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
-        //     ch_input_for_postbinning_bins = ch_all_bins
-        //     ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins)
+        } else if ( params.postbinning_input == 'both' ) {
+            ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
+            ch_input_for_postbinning_bins = ch_all_bins
+            ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins)
         }
     } else {
         ch_input_for_postbinning_bins        = ch_binning_results_bins
@@ -888,9 +909,9 @@ workflow MAG {
     ch_input_for_quast_bins = ch_input_for_postbinning_bins_unbins
         .groupTuple()
         .map {
-            meta, reads ->
-                def new_reads = reads.flatten()
-                [meta, new_reads]
+            meta, bins ->
+                def new_bins = bins.flatten()
+                [meta, new_bins]
         }
 
     QUAST_BINS ( ch_input_for_quast_bins )
@@ -915,7 +936,7 @@ workflow MAG {
         ch_cat_db
     )
     CAT_SUMMARY(
-        CAT.out.tax_classification.collect()
+        CAT.out.tax_classification_names.collect()
    )
     ch_versions = ch_versions.mix(CAT.out.versions.first())
     ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)
@@ -938,7 +959,8 @@ workflow MAG {
         ch_gtdb_bins,
         ch_busco_summary,
         ch_checkm_summary,
-        gtdb
+        gtdb,
+        gtdb_mash
     )
     ch_versions = ch_versions.mix(GTDBTK.out.versions.first())
     ch_gtdbtk_summary = GTDBTK.out.summary
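// Aside — a hypothetical run configuration (e.g. supplied with -c) exercising the options
// touched by this change set; the Mash sketch path is a placeholder. Leaving gtdb_mash
// unset (null) makes GTDB-Tk fall back to --skip_ani_screen.
params {
    refine_bins_dastool = true
    postbinning_input   = 'both'                    // enum value re-enabled in nextflow_schema.json
    gtdb_mash           = '/path/to/gtdb_r214.msh'  // placeholder path to a GTDB-Tk Mash sketch database
}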