From 528ae354a61d04b4e4309b49d604fdfe04917b15 Mon Sep 17 00:00:00 2001
From: Adam Talbot <adam.talbot@nonacus.com>
Date: Wed, 2 Nov 2022 12:57:33 +0000
Subject: [PATCH 1/7] Added support for third UMI fastq file

---
 assets/samplesheet.csv            |  3 ++-
 bin/check_samplesheet.py          | 20 +++++++++++++-------
 subworkflows/local/input_check.nf |  7 ++++++-
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 7d41d51..d53ed0b 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,4 @@
-sample,fastq_1,fastq_2,read_structure
+sample,fastq_1,fastq_2,read_structure,fastq_umi
 SAMPLE_DUPLEX_SEQ,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,10M1S+T 10M1S+T
 SAMPLE_SINGLE_UMI,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz,12M+T +T
+SAMPLE_UMI_FASTQ,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R3_001.fastq.gz,+T +T +M,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index e0c6509..50dbf68 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -73,7 +73,9 @@ def validate_and_transform(self, row):
         self._validate_first(row)
         self._validate_second(row)
         self._validate_pair(row)
-        self._seen.add((row[self._sample_col], row[self._first_col], row[self._second_col]))
+        self._seen.add(
+            (row[self._sample_col], row[self._first_col], row[self._second_col])
+        )
         self.modified.append(row)
 
     def _validate_sample(self, row):
@@ -95,14 +97,14 @@ def _validate_second(self, row):
     def _validate_pair(self, row):
         """Assert that read pairs have the same file extension. Report pair status."""
         assert (
-            Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
+            Path(row[self._first_col]).suffixes[-2:]
+            == Path(row[self._second_col]).suffixes[-2:]
         ), "FASTQ pairs must have the same file extensions."
 
     def _validate_read_structure(self, row):
         """Assert that the second FASTQ entry has the right format if it exists."""
-        assert len(row[self._read_structure_col].split(' ')) == 2, (
-            "Two read structures must be provided."
-        )
+        n_structures = len(row[self._read_structure_col].split())
+        assert 2 <= n_structures <= 3, "Two or three read structures must be provided."
 
     def _validate_fastq_format(self, filename):
         """Assert that a given filename has one of the expected FASTQ extensions."""
@@ -119,7 +121,9 @@ def validate_unique_samples(self):
         FASTQ file combination exists.
 
         """
-        assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
+        assert len(self._seen) == len(
+            self.modified
+        ), "The pair of sample name and FASTQ must be unique."
         if len({pair[0] for pair in self._seen}) < len(self._seen):
             counts = Counter(pair[0] for pair in self._seen)
             seen = Counter()
@@ -192,7 +196,9 @@ def check_samplesheet(file_in, file_out):
         reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
         # Validate the existence of the expected header columns.
         if not required_columns.issubset(reader.fieldnames):
-            logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.")
+            logger.critical(
+                f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}."
+            )
             sys.exit(1)
         # Validate each row.
         checker = RowChecker()
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index bb11361..774fa2a 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -35,6 +35,11 @@ def create_fastq_channel(LinkedHashMap row) {
     if (!file(row.fastq_2).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
     }
-    fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+    if (file(row.fastq_umi).exists()){
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
+    } else {
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]
+    }
+    fastq_meta = [ meta, fastq_list ]
     return fastq_meta
 }

From 3115c32dd41c67adcd5951956ae678f74a62d141 Mon Sep 17 00:00:00 2001
From: Adam Talbot <adam.talbot@nonacus.com>
Date: Wed, 2 Nov 2022 13:05:23 +0000
Subject: [PATCH 2/7] Better handling of UMI Fastq existing

---
 subworkflows/local/input_check.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 774fa2a..a6c4c3c 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -35,7 +35,10 @@ def create_fastq_channel(LinkedHashMap row) {
     if (!file(row.fastq_2).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
     }
-    if (file(row.fastq_umi).exists()){
+    if (row.fastq_umi){
+        if (!file(row.fastq_umi).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}"
+        }
         fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
     } else {
         fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]

From 0951eb2cf4a55471ac00269b6ef9d8dbe22e4749 Mon Sep 17 00:00:00 2001
From: Adam Talbot <adam.talbot@nonacus.com>
Date: Wed, 2 Nov 2022 13:30:07 +0000
Subject: [PATCH 3/7] bugfix: BWA index correctly interpreted when .64 suffix
 is on files

---
 modules/local/align_bam/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/align_bam/main.nf b/modules/local/align_bam/main.nf
index de6d104..287592f 100644
--- a/modules/local/align_bam/main.nf
+++ b/modules/local/align_bam/main.nf
@@ -52,7 +52,7 @@ process ALIGN_BAM {
 
     """
     # The real path to the FASTA
-    FASTA=`find -L ./ -name "*.amb" | sed 's/.amb//'`
+    FASTA=`find -L ./ -name "*.amb" | sed -r 's/([.]64)?[.]amb\$//'`
 
     samtools fastq ${samtools_fastq_args} ${unmapped_bam} \\
         | bwa mem ${bwa_args} -t $task.cpus -p -K 150000000 -Y \$FASTA - \\

From f510bc2eec412d541fa9d11d93580203f1054268 Mon Sep 17 00:00:00 2001
From: Adam Talbot <adam.talbot@nonacus.com>
Date: Wed, 2 Nov 2022 13:30:49 +0000
Subject: [PATCH 4/7] Software versions exported correctly via ch_versions
 channel

---
 workflows/fastquorum.nf | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/workflows/fastquorum.nf b/workflows/fastquorum.nf
index f724dfb..e0a4330 100644
--- a/workflows/fastquorum.nf
+++ b/workflows/fastquorum.nf
@@ -120,25 +120,24 @@ workflow FASTQUORUM {
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
-
     //
     // MODULE: Run fgbio FastqToBam
     //
     FASTQTOBAM(INPUT_CHECK.out.reads)
+    ch_versions = ch_versions.mix(FASTQTOBAM.out.versions.first())
 
     //
     // MODULE: Align with bwa mem
     //
     grouped_sort = true
     ALIGN_RAW_BAM(FASTQTOBAM.out.bam, ch_ref_index_dir, grouped_sort)
+    ch_versions = ch_versions.mix(ALIGN_RAW_BAM.out.versions.first())
 
     //
     // MODULE: Run fgbio GroupReadsByUmi
     //
     GROUPREADSBYUMI(ALIGN_RAW_BAM.out.bam, groupreadsbyumi_strategy, params.groupreadsbyumi_edits)
+    ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions.first())
 
     // TODO: duplex_seq can be inferred from the read structure, but that's out of scope for now
     if (params.duplex_seq) {
@@ -146,19 +145,24 @@ workflow FASTQUORUM {
         // MODULE: Run fgbio CallDuplexConsensusReads
         //
         CALLDDUPLEXCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
 
         //
         // MODULE: Run fgbio CollecDuplexSeqMetrics
         //
         COLLECTDUPLEXSEQMETRICS(GROUPREADSBYUMI.out.bam)
+        ch_versions = ch_versions.mix(COLLECTDUPLEXSEQMETRICS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLDDUPLEXCONSENSUSREADS.out.bam.set { ch_consensus_bam }
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
+
     } else {
         //
         // MODULE: Run fgbio CallMolecularConsensusReads
         //
         CALLMOLECULARCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLMOLECULARCONSENSUSREADS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLMOLECULARCONSENSUSREADS.out.bam.set { ch_consensus_bam }
@@ -168,11 +172,17 @@ workflow FASTQUORUM {
     // MODULE: Align with bwa mem
     //
     ALIGN_CONSENSUS_BAM(ch_consensus_bam, ch_ref_index_dir, false)
+    ch_versions = ch_versions.mix(ALIGN_CONSENSUS_BAM.out.versions.first())
 
     //
     // MODULE: Run fgbio FilterConsensusReads
     //
     FILTERCONSENSUSREADS(ALIGN_CONSENSUS_BAM.out.bam, ch_ref_fasta, filter_min_reads, params.filter_min_baseq, params.filter_max_base_error_rate)
+    ch_versions = ch_versions.mix(FILTERCONSENSUSREADS.out.versions.first())
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
 
     //
     // MODULE: MultiQC

From f672538190250a19502f72d5da6a0bf236805088 Mon Sep 17 00:00:00 2001
From: Adam Talbot <adam.talbot@nonacus.com>
Date: Wed, 2 Nov 2022 13:34:25 +0000
Subject: [PATCH 5/7] Raise error when UMI FASTQ file and duplex mode are used
 together

---
 subworkflows/local/input_check.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index a6c4c3c..cd0ccde 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -39,6 +39,9 @@ def create_fastq_channel(LinkedHashMap row) {
         if (!file(row.fastq_umi).exists()) {
-            exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}"
+            exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file specified in samplesheet does not exist!\n${row.fastq_umi}"
         }
+        if ( params.duplex_seq ) {
+            exit 1, "ERROR: Duplex mode is not compatible with a UMI sequencing file. Please use --duplex_seq false when using a UMI fastq file."
+        }
         fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
     } else {
         fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]

From d3777d7987354ff6b6a89535d5ef1912a8162e93 Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Wed, 15 May 2024 16:49:07 +0100
Subject: [PATCH 6/7] Support additional FASTQ sequence designed for UMI file

Changes:
 - Parse input subworkflow to support 3rd FASTQ in addition to R1 and R2
 - Checks number of FASTQ files matches the number of read structures
---
 assets/schema_input.json                      |  9 +++++-
 .../utils_nfcore_fastquorum_pipeline/main.nf  | 31 ++++++++++++++++---
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index 2697ff5..bcd607e 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -27,6 +27,13 @@
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
+            "fastq_3": {
+                "type": "string",
+                "format": "file-path",
+                "exists": true,
+                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
+                "errorMessage": "FastQ file for reads 3 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+            },
             "read_structure": {
                 "type": "string",
                 "pattern": "^.*$",
@@ -34,6 +41,6 @@
                 "meta": ["read_structure"]
             }
         },
-        "required": ["sample", "fastq_1", "fastq_2", "read_structure"]
+        "required": ["sample", "fastq_1", "read_structure"]
     }
 }
diff --git a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
index 4d8c056..ee1b6d4 100644
--- a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
@@ -92,13 +92,18 @@ workflow PIPELINE_INITIALISATION {
     Channel
         .fromSamplesheet("input")
         .map {
-            meta, fastq_1, fastq_2 ->
-                if (!fastq_2) {
-                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
-                } else {
+            meta, fastq_1, fastq_2, fastq_3 ->
+                if (fastq_3) {
+                    return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2, fastq_3 ] ]
+                } else if (fastq_2) {
                     return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ]
+                } else {
+                    return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ]
                 }
         }
+        .map {
+            validateReadStructure(it)
+        }
         .groupTuple()
         .map {
             validateInputSamplesheet(it)
@@ -109,6 +114,8 @@ workflow PIPELINE_INITIALISATION {
         }
         .set { ch_samplesheet }
 
+    ch_samplesheet.view()
+
     emit:
     samplesheet = ch_samplesheet
     versions    = ch_versions
@@ -163,6 +170,21 @@ def validateInputParameters() {
     genomeExistsError()
 }
 
+def validateReadStructure(input) {
+    def id           = input[0]
+    def meta         = input[1]
+    def fastqs       = input[2]
+    // Each FASTQ file needs its own space-separated entry in meta.read_structure
+    def num_fastqs     = fastqs.size()
+    def num_structures = meta.read_structure.tokenize(" ").size()
+    // Raise an error on a count mismatch; otherwise pass the tuple through unchanged
+    if (num_fastqs != num_structures) {
+        error("Please check input samplesheet -> Number of fastq files (${num_fastqs}) does not match the number of read structures (${num_structures}): ${id}, '${meta.read_structure}'")
+    }
+    return [ id, meta, fastqs ]
+}
+
+
 //
 // Validate channels from input samplesheet
 //
@@ -177,6 +199,7 @@ def validateInputSamplesheet(input) {
 
     return [ metas[0], fastqs ]
 }
+
 //
 // Get attribute from genome config file e.g. fasta
 //

From 7b354701b68f071c4daca5a4a15db921d17e7b1e Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Wed, 15 May 2024 18:30:34 +0100
Subject: [PATCH 7/7] Drop extra view statement

---
 subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
index ee1b6d4..ede0bd0 100644
--- a/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_fastquorum_pipeline/main.nf
@@ -114,8 +114,6 @@ workflow PIPELINE_INITIALISATION {
         }
         .set { ch_samplesheet }
 
-    ch_samplesheet.view()
-
     emit:
     samplesheet = ch_samplesheet
     versions    = ch_versions