From 528ae354a61d04b4e4309b49d604fdfe04917b15 Mon Sep 17 00:00:00 2001 From: Adam Talbot Date: Wed, 2 Nov 2022 12:57:33 +0000 Subject: [PATCH 1/7] Added support for third UMI fastq file --- assets/samplesheet.csv | 3 ++- bin/check_samplesheet.py | 20 +++++++++++++------- subworkflows/local/input_check.nf | 7 ++++++- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 7d41d51..d53ed0b 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,4 @@ -sample,fastq_1,fastq_2,read_structure +sample,fastq_1,fastq_2,read_structure,fastq_umi SAMPLE_DUPLEX_SEQ,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,10M1S+T 10M1S+T SAMPLE_SINGLE_UMI,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz,12M+T +T +SAMPLE_UMI_FASTQ,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R3_001.fastq.gz,+T +T +M,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index e0c6509..50dbf68 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -73,7 +73,9 @@ def validate_and_transform(self, row): self._validate_first(row) self._validate_second(row) self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col], row[self._second_col])) + self._seen.add( + (row[self._sample_col], row[self._first_col], row[self._second_col]) + ) self.modified.append(row) def _validate_sample(self, row): @@ -95,14 +97,14 @@ def _validate_second(self, row): def _validate_pair(self, row): """Assert that read pairs have the same file extension. 
Report pair status.""" assert ( - Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:] + Path(row[self._first_col]).suffixes[-2:] + == Path(row[self._second_col]).suffixes[-2:] ), "FASTQ pairs must have the same file extensions." def _validate_read_structure(self, row): """Assert that the second FASTQ entry has the right format if it exists.""" - assert len(row[self._read_structure_col].split(' ')) == 2, ( - "Two read structures must be provided." - ) + n_structures = len(row[self._read_structure_col].split(" ")) + assert 2 <= n_structures <= 3, "Two read structures must be provided." def _validate_fastq_format(self, filename): """Assert that a given filename has one of the expected FASTQ extensions.""" @@ -119,7 +121,9 @@ def validate_unique_samples(self): FASTQ file combination exists. """ - assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." + assert len(self._seen) == len( + self.modified + ), "The pair of sample name and FASTQ must be unique." if len({pair[0] for pair in self._seen}) < len(self._seen): counts = Counter(pair[0] for pair in self._seen) seen = Counter() @@ -192,7 +196,9 @@ def check_samplesheet(file_in, file_out): reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) # Validate the existence of the expected header columns. if not required_columns.issubset(reader.fieldnames): - logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.") + logger.critical( + f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}." + ) sys.exit(1) # Validate each row. 
checker = RowChecker() diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index bb11361..774fa2a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -35,6 +35,11 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + if (file(row.fastq_umi).exists()){ + fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ] + } else { + fastq_list = [ file(row.fastq_1), file(row.fastq_2) ] + } + fastq_meta = [ meta, fastq_list ] return fastq_meta } From 3115c32dd41c67adcd5951956ae678f74a62d141 Mon Sep 17 00:00:00 2001 From: Adam Talbot Date: Wed, 2 Nov 2022 13:05:23 +0000 Subject: [PATCH 2/7] Better handling of UMI Fastq existing --- subworkflows/local/input_check.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 774fa2a..a6c4c3c 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -35,7 +35,10 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - if (file(row.fastq_umi).exists()){ + if (row.fastq_umi){ + if (!file(row.fastq_umi).exists()) { + exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}" + } fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ] } else { fastq_list = [ file(row.fastq_1), file(row.fastq_2) ] From 0951eb2cf4a55471ac00269b6ef9d8dbe22e4749 Mon Sep 17 00:00:00 2001 From: Adam Talbot Date: Wed, 2 Nov 2022 13:30:07 +0000 Subject: [PATCH 3/7] bugfix: BWA index correctly interpreted when .64 suffix is on files --- 
modules/local/align_bam/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/align_bam/main.nf b/modules/local/align_bam/main.nf index de6d104..287592f 100644 --- a/modules/local/align_bam/main.nf +++ b/modules/local/align_bam/main.nf @@ -52,7 +52,7 @@ process ALIGN_BAM { """ # The real path to the FASTA - FASTA=`find -L ./ -name "*.amb" | sed 's/.amb//'` + FASTA=`find -L ./ -name "*.amb" | sed -r 's/(.64)?.amb//'` samtools fastq ${samtools_fastq_args} ${unmapped_bam} \\ | bwa mem ${bwa_args} -t $task.cpus -p -K 150000000 -Y \$FASTA - \\ From f510bc2eec412d541fa9d11d93580203f1054268 Mon Sep 17 00:00:00 2001 From: Adam Talbot Date: Wed, 2 Nov 2022 13:30:49 +0000 Subject: [PATCH 4/7] Software versions exported correctly via ch_versions channel --- workflows/fastquorum.nf | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/workflows/fastquorum.nf b/workflows/fastquorum.nf index f724dfb..e0a4330 100644 --- a/workflows/fastquorum.nf +++ b/workflows/fastquorum.nf @@ -120,25 +120,24 @@ workflow FASTQUORUM { ) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - // // MODULE: Run fgbio FastqToBam // FASTQTOBAM(INPUT_CHECK.out.reads) + ch_versions = ch_versions.mix(FASTQTOBAM.out.versions.first()) // // MODULE: Align with bwa mem // grouped_sort = true ALIGN_RAW_BAM(FASTQTOBAM.out.bam, ch_ref_index_dir, grouped_sort) + ch_versions = ch_versions.mix(ALIGN_RAW_BAM.out.versions) // // MODULE: Run fgbio GroupReadsByUmi // GROUPREADSBYUMI(ALIGN_RAW_BAM.out.bam, groupreadsbyumi_strategy, params.groupreadsbyumi_edits) + ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions.first()) // TODO: duplex_seq can be inferred from the read structure, but that's out of scope for now if (params.duplex_seq) { @@ -146,19 +145,24 @@ workflow FASTQUORUM { // MODULE: Run fgbio CallDuplexConsensusReads // 
CALLDDUPLEXCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq) + ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first()) // // MODULE: Run fgbio CollecDuplexSeqMetrics // COLLECTDUPLEXSEQMETRICS(GROUPREADSBYUMI.out.bam) + ch_versions = ch_versions.mix(COLLECTDUPLEXSEQMETRICS.out.versions.first()) // Add the consensus BAM to the channel for downstream processing CALLDDUPLEXCONSENSUSREADS.out.bam.set { ch_consensus_bam } + ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first()) + } else { // // MODULE: Run fgbio CallMolecularConsensusReads // CALLMOLECULARCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq) + ch_versions = ch_versions.mix(CALLMOLECULARCONSENSUSREADS.out.versions.first()) // Add the consensus BAM to the channel for downstream processing CALLMOLECULARCONSENSUSREADS.out.bam.set { ch_consensus_bam } @@ -168,11 +172,17 @@ workflow FASTQUORUM { // MODULE: Align with bwa mem // ALIGN_CONSENSUS_BAM(ch_consensus_bam, ch_ref_index_dir, false) + ch_versions = ch_versions.mix(ALIGN_CONSENSUS_BAM.out.versions.first()) // // MODULE: Run fgbio FilterConsensusReads // FILTERCONSENSUSREADS(ALIGN_CONSENSUS_BAM.out.bam, ch_ref_fasta, filter_min_reads, params.filter_min_baseq, params.filter_max_base_error_rate) + ch_versions = ch_versions.mix(FILTERCONSENSUSREADS.out.versions.first()) + + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) // // MODULE: MultiQC From f672538190250a19502f72d5da6a0bf236805088 Mon Sep 17 00:00:00 2001 From: Adam Talbot Date: Wed, 2 Nov 2022 13:34:25 +0000 Subject: [PATCH 5/7] Raise error when UMI FASTQ file and duplex mode are used together --- subworkflows/local/input_check.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index a6c4c3c..cd0ccde 100644 --- a/subworkflows/local/input_check.nf +++ 
b/subworkflows/local/input_check.nf @@ -39,6 +39,9 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_umi).exists()) { exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}" } + if ( params.duplex_seq ) { + exit 1, "ERROR: Duplex mode is not compatible with a UMI sequencing file. Please use --duplex_seq false when using a UMI fastq file." + } fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ] } else { fastq_list = [ file(row.fastq_1), file(row.fastq_2) ] From d3777d7987354ff6b6a89535d5ef1912a8162e93 Mon Sep 17 00:00:00 2001 From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Date: Wed, 15 May 2024 16:49:07 +0100 Subject: [PATCH 6/7] Support additional FASTQ sequence designed for UMI file Changes: - Parse input subworkflow to support 3rd FASTQ in addition to R1 and R2 - Checks number of FASTQ files matches the number of read structures --- assets/schema_input.json | 9 +++++- .../utils_nfcore_fastquorum_pipeline/main.nf | 31 ++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 2697ff5..bcd607e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -27,6 +27,13 @@ "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, + "fastq_3": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "errorMessage": "FastQ file for reads 3 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + }, "read_structure": { "type": "string", "pattern": "^.*$", @@ -34,6 +41,6 @@ "meta": ["read_structure"] } }, - "required": ["sample", "fastq_1", "fastq_2", "read_structure"] + "required": ["sample", "fastq_1", "read_structure"] } } diff --git a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf index 4d8c056..ee1b6d4 100644 --- a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf @@ -92,13 +92,18 @@ workflow PIPELINE_INITIALISATION { Channel .fromSamplesheet("input") .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { + meta, fastq_1, fastq_2, fastq_3 -> + if (fastq_3) { + return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2, fastq_3 ] ] + } else if (fastq_2) { return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] + } else { + return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] } } + .map { + validateReadStructure(it) + } .groupTuple() .map { validateInputSamplesheet(it) @@ -109,6 +114,8 @@ workflow PIPELINE_INITIALISATION { } .set { ch_samplesheet } + ch_samplesheet.view() + emit: samplesheet = ch_samplesheet versions = ch_versions @@ -163,6 +170,21 @@ def validateInputParameters() { genomeExistsError() } +def validateReadStructure(input) { + def id = input[0] + def meta = input[1] + def fastqs = input[2] + + def num_fastqs = fastqs.size() + def num_structures = meta.read_structure.tokenize(" ").size() + + if (num_fastqs != num_structures) { + error("Please check input samplesheet -> Number of fastq files (${num_fastqs}) does not match the number of read structures (${num_structures}): ${id}, '${meta.read_structure}'") + } + return [ id, meta, fastqs ] +} + + // // Validate channels from input samplesheet // @@ -177,6 +199,7 @@ def validateInputSamplesheet(input) { return [ metas[0], fastqs ] } + // // Get attribute from genome config file e.g. 
fasta // From 7b354701b68f071c4daca5a4a15db921d17e7b1e Mon Sep 17 00:00:00 2001 From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> Date: Wed, 15 May 2024 18:30:34 +0100 Subject: [PATCH 7/7] Drop extra view statement --- subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf index ee1b6d4..ede0bd0 100644 --- a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf @@ -114,8 +114,6 @@ workflow PIPELINE_INITIALISATION { } .set { ch_samplesheet } - ch_samplesheet.view() - emit: samplesheet = ch_samplesheet versions = ch_versions