diff --git a/modules.json b/modules.json index baef0602..f4c2c9e7 100644 --- a/modules.json +++ b/modules.json @@ -137,7 +137,7 @@ }, "krakenuniq/preloadedkrakenuniq": { "branch": "master", - "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", + "git_sha": "2512434a59d282cba0fbc53ddb81dd8f244dc428", "installed_by": ["modules"] }, "krona/ktimporttaxonomy": { diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf index d24f75d2..3a31fcb4 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -8,7 +8,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { 'biocontainers/krakenuniq:1.0.4--pl5321h6dccd9a_2' }" input: - tuple val(meta), path(sequences) + // We stage sequencing files in a sub-directory so we don't accidentally gzip them later. + tuple val(meta), path(sequences, name: 'sequences/*'), val(prefixes) val sequence_type path db val ram_chunk_size @@ -28,6 +29,7 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { script: assert sequence_type in ['fasta', 'fastq'] + sequences = sequences instanceof List ? sequences : [sequences] def args = task.ext.args ?: '' def args2 = task.ext.args ?: '' @@ -38,9 +40,20 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { unclassified_option = save_output_reads ? "--unclassified-out \"${unclassified}\"" : '' def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' - compress_reads_command = save_output_reads ? "find . -name '*.${sequence_type}' -print0 | xargs -0 -t -P ${task.cpus} -I % gzip --no-name %" : '' + compress_reads_command = save_output_reads ? "find . 
-maxdepth 1 -name '*.${sequence_type}' -print0 | xargs -0 -t -P ${task.cpus} -I % gzip --no-name %" : '' +    def command_inputs_file = '.inputs.txt' + +    if (meta.single_end) { +        assert sequences.size() == prefixes.size() + +        command_inputs = [sequences, prefixes].transpose().collect { seq, prefix -> "${seq}\t${prefix}" } + +        """ +        # Store the batch of samples for later command input. +        cat <<-END_INPUTS > ${command_inputs_file} +        ${command_inputs.join('\n        ')} +        END_INPUTS + +        # Preload the KrakenUniq database into memory. krakenuniq \\ $args \\ --db $db \\ @@ -48,15 +61,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { --preload-size $ram_chunk_size \\ --threads $task.cpus - strip_suffix() { - local result=\$1 - # Strip any file extensions. - echo "\${result%%.*}" - } - - printf "%s\\n" ${sequences} | while read FASTQ; do \\ - PREFIX="\$(strip_suffix "\${FASTQ}")" - + # Run the KrakenUniq classification on each sample in the batch. + while IFS='\t' read -r SEQ PREFIX; do krakenuniq \\ --db $db \\ --threads $task.cpus \\ @@ -65,8 +71,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $unclassified_option \\ $classified_option \\ $args2 \\ - "\${FASTQ}" - done + "\${SEQ}" + done < ${command_inputs_file} $compress_reads_command @@ -76,7 +82,16 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { END_VERSIONS """ } else { + assert sequences.size() / 2 == prefixes.size() + command_inputs = [sequences.collate(2), prefixes].transpose().collect { pair, prefix -> "${pair[0]}\t${pair[1]}\t${prefix}" } + """ + # Store the batch of samples for later command input. + cat <<-END_INPUTS > ${command_inputs_file} + ${command_inputs.join('\n        ')} + END_INPUTS + + # Preload the KrakenUniq database into memory. krakenuniq \\ $args \\ --db $db \\ @@ -84,18 +99,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { --preload-size $ram_chunk_size \\ --threads $task.cpus - strip_suffix() { - local result - read result - # Strip any trailing dot or underscore. 
- result="\${result%_}" - echo "\${result%.}" - } - - printf "%s %s\\n" ${sequences} | while read FASTQ; do \\ - read -r -a FASTQ <<< "\${FASTQ}" - PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" - + # Run the KrakenUniq classification on each sample in the batch. + while IFS='\t' read -r FIRST_SEQ SECOND_SEQ PREFIX; do krakenuniq \\ --db $db \\ --threads $task.cpus \\ @@ -105,8 +110,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $classified_option \\ --paired \\ $args2 \\ - "\${FASTQ[@]}" - done + "\${FIRST_SEQ}" "\${SECOND_SEQ}" + done < ${command_inputs_file} $compress_reads_command @@ -119,6 +124,7 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { stub: assert sequence_type in ['fasta', 'fastq'] + sequences = sequences instanceof List ? sequences : [sequences] def args = task.ext.args ?: '' def args2 = task.ext.args ?: '' @@ -130,8 +136,19 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' compress_reads_command = save_output_reads ? "find . -name '*.${sequence_type}' -print0 | xargs -0 -t -P ${task.cpus} -I % gzip --no-name %" : '' + def command_inputs_file = '.inputs.txt' + if (meta.single_end) { + assert sequences.size() == prefixes.size() + command_inputs = [sequences, prefixes].transpose().collect { seq, prefix -> "${seq}\t${prefix}" } + """ + # Store the batch of samples for later command input. + cat <<-END_INPUTS > ${command_inputs_file} + ${command_inputs.join('\n ')} + END_INPUTS + + # Preload the KrakenUniq database into memory. echo krakenuniq \\ $args \\ --db $db \\ @@ -139,12 +156,6 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { --preload-size $ram_chunk_size \\ --threads $task.cpus - strip_suffix() { - local result=\$1 - # Strip any file extensions. 
- echo "\${result%%.*}" - } - create_file() { echo '<3 nf-core' > "\$1" } @@ -153,11 +164,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo '<3 nf-core' | gzip -n > "\$1" } - printf "%s\\n" ${sequences} | while read FASTQ; do \\ - echo "\${FASTQ}" - PREFIX="\$(strip_suffix "\${FASTQ}")" - echo "\${PREFIX}" - + # Run the KrakenUniq classification on each sample in the batch. + while IFS='\t' read -r SEQ PREFIX; do echo krakenuniq \\ --db $db \\ --threads $task.cpus \\ @@ -166,13 +174,13 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $unclassified_option \\ $classified_option \\ $args2 \\ - "\${FASTQ}" + "\${SEQ}" create_file "\${PREFIX}.krakenuniq.classified.txt" create_file "\${PREFIX}.krakenuniq.report.txt" create_gzip_file "\${PREFIX}.classified.${sequence_type}.gz" create_gzip_file "\${PREFIX}.unclassified.${sequence_type}.gz" - done + done < ${command_inputs_file} echo "$compress_reads_command" @@ -182,7 +190,16 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { END_VERSIONS """ } else { + assert sequences.size() / 2 == prefixes.size() + command_inputs = [sequences.collate(2), prefixes].transpose().collect { pair, prefix -> "${pair[0]}\t${pair[1]}\t${prefix}" } + """ + # Store the batch of samples for later command input. + cat <<-END_INPUTS > ${command_inputs_file} + ${command_inputs.join('\n ')} + END_INPUTS + + # Preload the KrakenUniq database into memory. echo krakenuniq \\ $args \\ --db $db \\ @@ -190,14 +207,6 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { --preload-size $ram_chunk_size \\ --threads $task.cpus - strip_suffix() { - local result - read result - # Strip any trailing dot or underscore. 
- result="\${result%_}" - echo "\${result%.}" - } - create_file() { echo '<3 nf-core' > "\$1" } @@ -206,12 +215,8 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { echo '<3 nf-core' | gzip -n > "\$1" } - printf "%s %s\\n" ${sequences} | while read FASTQ; do \\ - read -r -a FASTQ <<< "\${FASTQ}" - echo "\${FASTQ[@]}" - PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" - echo "\${PREFIX}" - + # Run the KrakenUniq classification on each sample in the batch. + while IFS='\t' read -r FIRST_SEQ SECOND_SEQ PREFIX; do echo krakenuniq \\ --db $db \\ --threads $task.cpus \\ @@ -221,13 +226,13 @@ process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { $classified_option \\ --paired \\ $args2 \\ - "\${FASTQ[@]}" + "\${FIRST_SEQ}" "\${SECOND_SEQ}" create_file "\${PREFIX}.krakenuniq.classified.txt" create_file "\${PREFIX}.krakenuniq.report.txt" create_gzip_file "\${PREFIX}.merged.classified.${sequence_type}.gz" create_gzip_file "\${PREFIX}.merged.unclassified.${sequence_type}.gz" - done + done < ${command_inputs_file} echo "$compress_reads_command" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml index bb6409a6..8e674504 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -8,77 +8,108 @@ keywords: - db tools: - "krakenuniq": - description: "Metagenomics classifier with unique k-mer counting for more specific results" + description: "Metagenomics classifier with unique k-mer counting for more specific + results" homepage: https://github.com/fbreitwieser/krakenuniq documentation: https://github.com/fbreitwieser/krakenuniq doi: 10.1186/s13059-018-1568-0 licence: ["MIT"] + identifier: biotools:KrakenUniq input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - sequences: - type: file - description: List of input files containing sequences. All of them must be either in FASTA or FASTQ format. - - sequence_type: - type: string - description: Format of all given sequencing files as literal string, either 'fasta' or 'fastq'. - pattern: "{fasta,fastq}" - - db: - type: directory - description: KrakenUniq database - - ram_chunk_size: - type: string - description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time - pattern: "*GB" - - save_output_reads: - type: boolean - description: | - Optionally, commands are added to save classified and unclassified reads - as FASTQ or FASTA files depending on the input format. When the input - is paired-end, the single output FASTQ contains merged reads. - - report_file: - type: boolean - description: Whether to generate a report of relative abundances. - - save_output: - type: boolean - description: Whether to save a file reporting the taxonomic classification of each input read. + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sequences: + type: file + description: List of input files containing sequences. All of them must be either + in FASTA or FASTQ format. + - prefixes: + type: string + description: > + List of sample identifiers or filename prefixes. Must correspond in order and + length to the 'sequences', or to the number of sequencing pairs. + - - sequence_type: + type: string + description: Format of all given sequencing files as literal string, either + 'fasta' or 'fastq'. 
+ pattern: "{fasta,fastq}" + - - db: + type: directory + description: KrakenUniq database + - - ram_chunk_size: + type: string + description: Amount of maximum amount of RAM each chunk of database that should + be loaded at any one time + pattern: "*GB" + - - save_output_reads: + type: boolean + description: | + Optionally, commands are added to save classified and unclassified reads + as FASTQ or FASTA files depending on the input format. When the input + is paired-end, the single output FASTQ contains merged reads. + - - report_file: + type: boolean + description: Whether to generate a report of relative abundances. + - - save_output: + type: boolean + description: Whether to save a file reporting the taxonomic classification of + each input read. output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - classified_reads: - type: file - description: | - Reads classified as belonging to any of the taxa - in the KrakenUniq reference database. - pattern: "*.classified.{fastq,fasta}.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.classified.${sequence_type}.gz": + type: file + description: | + Reads classified as belonging to any of the taxa + in the KrakenUniq reference database. + pattern: "*.classified.{fastq,fasta}.gz" - unclassified_reads: - type: file - description: | - Reads not classified to any of the taxa - in the KrakenUniq reference database. - pattern: "*.unclassified.{fastq,fasta}.gz" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.unclassified.${sequence_type}.gz": + type: file + description: | + Reads not classified to any of the taxa + in the KrakenUniq reference database. 
+ pattern: "*.unclassified.{fastq,fasta}.gz" - classified_assignment: - type: file - description: | - KrakenUniq output file indicating the taxonomic assignment of - each input read ## DOUBLE CHECK!! - pattern: "*.krakenuniq.classified.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.krakenuniq.classified.txt": + type: file + description: | + KrakenUniq output file indicating the taxonomic assignment of + each input read ## DOUBLE CHECK!! + pattern: "*.krakenuniq.classified.txt" - report: - type: file - description: | - KrakenUniq report containing statistics about classified - and unclassified reads. - pattern: "*.krakenuniq.report.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.krakenuniq.report.txt": + type: file + description: | + KrakenUniq report containing statistics about classified + and unclassified reads. + pattern: "*.krakenuniq.report.txt" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@mjamy" - "@Midnighter" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test index 9e1d6700..16da8e45 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test @@ -32,7 +32,11 @@ nextflow_process { """ input[0] = [ [id:'test', single_end:true], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', 
checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fasta' input[2] = UNTAR.out.untar.map { it[1] } @@ -45,13 +49,16 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. - { assert file(process.out.report.get(0).get(1)).name == 'genome.krakenuniq.report.txt' }, - { assert file(process.out.unclassified_reads.get(0).get(1)).name == 'genome.unclassified.fasta.gz' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, + process.out.unclassified_reads, process.out.classified_assignment, process.out.versions ).match('fasta') }, @@ -69,7 +76,11 @@ nextflow_process { """ input[0] = [ [id:'test', single_end:true], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fastq' input[2] = UNTAR.out.untar.map { it[1] } @@ -82,10 +93,13 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. 
- { assert file(process.out.report.get(0).get(1)).name == 'test_interleaved.krakenuniq.report.txt' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, process.out.unclassified_reads, @@ -108,8 +122,11 @@ nextflow_process { [id:'test', single_end:false], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) - ] + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fastq' input[2] = UNTAR.out.untar.map { it[1] } @@ -122,13 +139,16 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. 
- { assert file(process.out.report.get(0).get(1)).name == 'test.krakenuniq.report.txt' }, - { assert file(process.out.unclassified_reads.get(0).get(1)).name == 'test.merged.unclassified.fastq.gz' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, + process.out.unclassified_reads, process.out.classified_assignment, process.out.versions ).match('fastq-paired') }, @@ -148,7 +168,11 @@ nextflow_process { """ input[0] = [ [id:'test', single_end:true], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fasta' input[2] = UNTAR.out.untar.map { it[1] } @@ -161,11 +185,13 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. 
- { assert file(process.out.report.get(0).get(1)).name == 'genome.krakenuniq.report.txt' }, - { assert file(process.out.unclassified_reads.get(0).get(1)).name == 'genome.unclassified.fasta.gz' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, process.out.unclassified_reads, @@ -188,7 +214,11 @@ nextflow_process { """ input[0] = [ [id:'test', single_end:true], - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fastq' input[2] = UNTAR.out.untar.map { it[1] } @@ -201,10 +231,13 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. 
- { assert file(process.out.report.get(0).get(1)).name == 'test_interleaved.krakenuniq.report.txt' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, process.out.unclassified_reads, @@ -229,8 +262,11 @@ nextflow_process { [id:'test', single_end:false], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) - ] + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true), + ], + ['sample_1', 'sample.2'] ] input[1] = 'fastq' input[2] = UNTAR.out.untar.map { it[1] } @@ -243,10 +279,13 @@ nextflow_process { } then { + def reports = process.out.report.get(0).get(1).collect { report -> file(report).name } + def expected = ['sample_1.krakenuniq.report.txt', 'sample.2.krakenuniq.report.txt'] + assertAll ( { assert process.success }, // Report contains a timestamp. 
- { assert file(process.out.report.get(0).get(1)).name == 'test.krakenuniq.report.txt' }, + { assertContainsInAnyOrder(reports, expected) }, { assert snapshot( process.out.classified_reads, process.out.unclassified_reads, diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap index 2a431be8..ca29cf7d 100644 --- a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap @@ -7,7 +7,10 @@ "id": "test", "single_end": true }, - "test_interleaved.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -16,7 +19,10 @@ "id": "test", "single_end": true }, - "test_interleaved.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -25,7 +31,10 @@ "id": "test", "single_end": true }, - "test_interleaved.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -33,30 +42,18 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-06T11:21:36.338887437" + "timestamp": "2024-10-30T14:25:43.618168582" }, "fastq-single": { "content": [ [ - [ - { - "id": "test", - "single_end": true - }, - "test_interleaved.classified.fastq.gz:md5,3bd95021a8fbced1be8039b990b28176" - ] + ], [ - [ - { - "id": "test", - "single_end": true - }, - "test_interleaved.unclassified.fastq.gz:md5,143c7eb70ca93cc2d5ea98767c370424" - ] + ], [ [ @@ 
-64,7 +61,10 @@ "id": "test", "single_end": true }, - "test_interleaved.krakenuniq.classified.txt:md5,88a734a9a9216cb0770a77f36c9f4e78" + [ + "sample.2.krakenuniq.classified.txt:md5,f885fa1cdbfc5460af0772219991bf6d", + "sample_1.krakenuniq.classified.txt:md5,88a734a9a9216cb0770a77f36c9f4e78" + ] ] ], [ @@ -72,21 +72,18 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-06T11:17:43.586414914" + "timestamp": "2024-10-30T15:21:33.941412985" }, "fastq-paired": { "content": [ [ - [ - { - "id": "test", - "single_end": false - }, - "test.merged.classified.fastq.gz:md5,dd7651837cce63e6108e28f4f019aedb" - ] + + ], + [ + ], [ [ @@ -94,7 +91,10 @@ "id": "test", "single_end": false }, - "test.krakenuniq.classified.txt:md5,ed5e19c7a88312cc04e483ac5f2579cd" + [ + "sample.2.krakenuniq.classified.txt:md5,ed5e19c7a88312cc04e483ac5f2579cd", + "sample_1.krakenuniq.classified.txt:md5,ed5e19c7a88312cc04e483ac5f2579cd" + ] ] ], [ @@ -102,10 +102,10 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-06T11:37:46.718293365" + "timestamp": "2024-10-30T15:21:43.622100223" }, "fasta-stub": { "content": [ @@ -115,7 +115,10 @@ "id": "test", "single_end": true }, - "genome.classified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.classified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.classified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -124,7 +127,10 @@ "id": "test", "single_end": true }, - "genome.unclassified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.unclassified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.unclassified.fasta.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -133,7 +139,10 @@ "id": "test", "single_end": true }, - "genome.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + [ + 
"sample.2.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -141,10 +150,10 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-06T11:28:27.729550991" + "timestamp": "2024-10-30T14:25:33.871634213" }, "fastq-paired-stub": { "content": [ @@ -154,7 +163,10 @@ "id": "test", "single_end": false }, - "test.merged.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.merged.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.merged.classified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -163,7 +175,10 @@ "id": "test", "single_end": false }, - "test.merged.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.merged.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.merged.unclassified.fastq.gz:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -172,7 +187,10 @@ "id": "test", "single_end": false }, - "test.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + [ + "sample.2.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975", + "sample_1.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] ] ], [ @@ -180,21 +198,18 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-05T20:06:20.262529457" + "timestamp": "2024-10-30T14:25:54.663232573" }, "fasta": { "content": [ [ - [ - { - "id": "test", - "single_end": true - }, - "genome.classified.fasta.gz:md5,e73599798195a519ba2565c3f0275b93" - ] + + ], + [ + ], [ [ @@ -202,7 +217,10 @@ "id": "test", "single_end": true }, - "genome.krakenuniq.classified.txt:md5,8aafacd89a6aac98aaf512df0a7493d1" + [ + "sample.2.krakenuniq.classified.txt:md5,8aafacd89a6aac98aaf512df0a7493d1", + 
"sample_1.krakenuniq.classified.txt:md5,2bea6c2195c400a909a2d4cca2e3045e" + ] ] ], [ @@ -210,9 +228,9 @@ ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.1", + "nextflow": "24.10.0" }, - "timestamp": "2024-05-06T11:36:00.24752418" + "timestamp": "2024-10-30T14:45:02.199077563" } } \ No newline at end of file diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 8b38baca..c2e0a81d 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -388,16 +388,23 @@ workflow PROFILING { .map { meta, reads, db_meta, db -> def seqtype = (reads[0].name ==~ /.+?\.f\w{0,3}a(\.gz)?$/) ? 'fasta' : 'fastq' - [[id: db_meta.db_name, single_end: meta.single_end, seqtype: seqtype], reads, db_meta, db] + // We bundle the sample identifier with the sequencing files to undergo batching. + def prefix = params.perform_runmerging ? meta.id : "${meta.id}_${meta.run_accession}" + [[id: db_meta.db_name, single_end: meta.single_end, seqtype: seqtype], reads + [prefix], db_meta, db] } .groupTuple(by: [0,2,3]) .flatMap { single_meta, reads, db_meta, db -> def batches = reads.collate(params.krakenuniq_batch_size) - return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]} + return batches.collect { batch -> + // We split the sample identifier from the reads again after batching. + def reads_batch = batch.collect { elements -> elements.take(elements.size() - 1) }.flatten() + def prefixes = batch.collect { elements -> elements[-1] } + return [ single_meta + db_meta, reads_batch, prefixes, db ] + } } .multiMap { - meta, reads, db -> - reads: [ meta, reads ] + meta, reads, prefixes, db -> + reads: [ meta, reads, prefixes ] db: db seqtype: meta.seqtype }