CCBR · kelly-sovacool · May 23, 2024 · May 23, 2024 · May 23, 2024 · May 23, 2024
diff --git a/.gitignore b/.gitignore
@@ -28,6 +28,7 @@ replay_pid*
 work/
 data/
 results/
+output/
 /params.yaml
 
 # python packaging

diff --git a/assets/dag.png b/assets/dag.png
diff --git a/assets/samplesheet_test_biowulf.csv b/assets/samplesheet_test_biowulf.csv
diff --git a/assets/samplesheet_test_mle.csv b/assets/samplesheet_test_mle.csv
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -8,6 +8,7 @@
 import sys
 import errno
 import argparse
+import pprint
 
 
 def parse_args(args=None):
@@ -51,7 +52,7 @@ def check_samplesheet(file_in, file_out):
     with open(file_in, "r", encoding="utf-8-sig") as fin:
         ## Check header
         MIN_COLS = 2
-        HEADER = ["sample", "fastq_1", "fastq_2", "treat_or_ctrl"]
+        HEADER = ["sample", "rep", "fastq_1", "fastq_2", "control"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         if header[: len(HEADER)] != HEADER:
             print(
@@ -81,7 +82,10 @@ def check_samplesheet(file_in, file_out):
                 )
 
             ## Check sample name entries
-            sample, fastq_1, fastq_2, treat_or_ctrl = lspl[: len(HEADER)]
+            sample_basename, rep, fastq_1, fastq_2, control = lspl[: len(HEADER)]
+            print("lspl")
+            pprint.pprint(lspl)
+            sample = f"{sample_basename}_{rep}" if rep else sample_basename
             if sample.find(" ") != -1:
                 print(
                     f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
@@ -95,21 +99,13 @@ def check_samplesheet(file_in, file_out):
                 if fastq:
                     if fastq.find(" ") != -1:
                         print_error("FastQ file contains spaces!", "Line", line)
-                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                    if not fastq.endswith("fastq.gz") and not fastq.endswith(".fq.gz"):
                         print_error(
-                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            "FastQ file does not have extension 'fastq.gz' or '.fq.gz'!",
                             "Line",
                             line,
                         )
 
-            ## check treatment or control designation
-            if treat_or_ctrl not in {"treatment", "control"}:
-                print_error(
-                    "treat_or_ctrl column can only contain values `treatment` or `control`.",
-                    "Line",
-                    line,
-                )
-
             ## Auto-detect paired-end/single-end
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                 is_single = "0"
@@ -118,7 +114,7 @@ def check_samplesheet(file_in, file_out):
             else:
                 print_error("Invalid combination of columns provided!", "Line", line)
 
-            sample_info = [is_single, fastq_1, fastq_2, treat_or_ctrl]
+            sample_info = [sample_basename, rep, is_single, fastq_1, fastq_2, control]
             ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2,]]}
             if sample not in sample_mapping_dict:
                 sample_mapping_dict[sample] = [sample_info]
@@ -135,7 +131,15 @@ def check_samplesheet(file_in, file_out):
         with open(file_out, "w") as fout:
             fout.write(
                 ",".join(
-                    ["sample", "single_end", "fastq_1", "fastq_2", "treat_or_ctrl"]
+                    [
+                        "sample",
+                        "sample_basename",
+                        "rep",
+                        "single_end",
+                        "fastq_1",
+                        "fastq_2",
+                        "control",
+                    ]
                 )
                 + "\n"
             )

diff --git a/conf/biowulf.config b/conf/biowulf.config
@@ -28,7 +28,7 @@ env.SINGULARITY_CACHEDIR = "/data/CCBR_Pipeliner/SIFS"
 
 
 process.clusterOptions = ' --gres=lscratch:200 '
-process.scratch = '/lscratch/$SLURM_JOBID'
+process.scratch = '/lscratch/$SLURM_JOB_ID'
 process.stageInMode = 'symlink'
 process.stageOutMode = 'rsync'
 // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps

diff --git a/conf/test.config b/conf/test.config
@@ -4,10 +4,10 @@ params {
 
     input = 'assets/samplesheet_test_biowulf.csv'
     library = 'assets/lib/yusa_library.csv'
-    outdir = 'results/test'
+    outdir = 'output/test'
     exp_name = 'test'
 
-    publish_dir_mode = 'symlink'
+    publish_dir_mode = 'link'
 
     mageck.run = true
 

diff --git a/conf/test_mle.config b/conf/test_mle.config
@@ -4,9 +4,11 @@ params {
 
     input = 'assets/samplesheet_test_mle.csv'
     library = 'assets/lib/yusa_library.csv'
-    outdir = 'results/test_mle'
+    outdir = 'output/test_mle'
     exp_name = 'leukemia'
 
+    publish_dir_mode = 'link'
+
     count_table   = '/data/CCBR_Pipeliner/testdata/CRISPIN/leukemia.new.csv'
     design_matrix = '/data/CCBR_Pipeliner/testdata/CRISPIN/designmat.txt'
 

diff --git a/lib/Utils.groovy b/lib/Utils.groovy
@@ -5,17 +5,39 @@ class Utils {
         def command_string = "spooker ${workflow.launchDir} ${pipeline_name}"
         def out = new StringBuilder()
         def err = new StringBuilder()
+        def spooker_in_path = check_command_in_path("spooker")
+        if (spooker_in_path) {
+            try {
+                println "Running spooker"
+                def command = command_string.execute()
+                command.consumeProcessOutput(out, err)
+                command.waitFor()
+            } catch(IOException e) {
+                err = e
+            }
+            // TODO create log/ dir if it doesn't exist
+            new FileWriter("${workflow.launchDir}/log/spooker.log").with {
+                write("${out}\n${err}")
+                flush()
+            }
+        } else {
+            err = "spooker not found, skipping"
+        }
+        return err
+    }
+    // check whether a command is in the path
+    public static Boolean check_command_in_path(cmd) {
+        def command_string = "command -V ${cmd}"
+        def out = new StringBuilder()
+        def err = new StringBuilder()
         try {
             def command = command_string.execute()
             command.consumeProcessOutput(out, err)
             command.waitFor()
         } catch(IOException e) {
             err = e
         }
-        new FileWriter("${workflow.launchDir}/log/spooker.log").with {
-            write("${out}\n${err}")
-            flush()
-        }
-        return err
+        return err.length()==0
+
     }
 }
diff --git a/main.nf b/main.nf
@@ -18,12 +18,13 @@ input        : ${params.input}
 
 // SUBMODULES
 include { INPUT_CHECK } from './subworkflows/local/input_check.nf'
-include { TRIM_COUNT  } from './subworkflows/local/trim_count.nf'
 include { MAGECK      } from './subworkflows/local/mageck.nf'
 include { BAGEL       } from './subworkflows/local/bagel.nf'
 
 // MODULES
-include { DRUGZ } from './modules/local/drugz.nf'
+include { CUTADAPT      } from './modules/CCBR/cutadapt'
+include { MAGECK_COUNT  } from './modules/local/mageck/count'
+include { DRUGZ         } from './modules/local/drugz'
 
 workflow.onComplete {
     if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) {
@@ -37,34 +38,67 @@ workflow.onComplete {
 
 workflow {
     INPUT_CHECK(file(params.input))
-    INPUT_CHECK.out
-        .reads
-        .set { raw_reads }
+    INPUT_CHECK.out.reads
+        | set { raw_reads }
 
-    ch_count = params.count_table ? file(params.count_table, checkIfExists: true) : null
-    if (!ch_count) { // trim reads and run mageck count
-        TRIM_COUNT(raw_reads, file(params.library, checkIfExists: true))
-        ch_count = TRIM_COUNT.out.count
+    if (params.count_table) {
+
+        // if count table exists, re-populate metadata with linked controls
+        ch_branched = raw_reads.branch { meta, fastqs ->
+                treat: meta.control.length() > 0
+                    return [ [control: meta.control], meta.id ]
+                control: meta.control.length() == 0
+                    return [ [control: meta.sample_basename], meta.id ]
+                }
+        ch_count = ch_branched.treat
+                | groupTuple()
+                | map { meta, ids ->
+                    [ meta, ids.flatten().join(',')]
+                    }
+                | join( ch_branched.control )
+                | map { meta, ids, ctrl ->
+                    meta.ids = ids
+                    [ meta ]
+                    }
+                | combine( Channel.fromPath(file(params.count_table, checkIfExists: true)) )
+        ch_count | view
+    } else {
+        // trim reads and run mageck count
+        CUTADAPT(raw_reads)
+
+        // group samples with their controls
+        ch_branched = CUTADAPT.out.reads.branch { meta, fastqs ->
+                treat: meta.control.length() > 0
+                    return [ [control: meta.control], meta.id, fastqs ]
+                control: meta.control.length() == 0
+                    return [ [control: meta.sample_basename], fastqs ]
+                }
+        ch_treat_ctrl_linked = ch_branched.treat
+                | groupTuple()
+                | map { meta, ids, fastqs ->
+                    [ meta, ids.flatten().join(','), fastqs.flatten() ]
+                    }
+                | join( ch_branched.control )
+                | map { meta, ids, sample_fastqs, ctrl_fastqs ->
+                    meta.ids = ids
+                    [ meta, sample_fastqs, ctrl_fastqs ]
+                }
+
+        MAGECK_COUNT(ch_treat_ctrl_linked,
+                     file(params.library, checkIfExists: true)
+                    )
+        ch_count = MAGECK_COUNT.out.count
     }
 
-    raw_reads
-      .branch { meta, fastq ->
-        treat: meta.treat_or_ctrl == 'treatment'
-            return meta.id
-        ctrl: meta.treat_or_ctrl == 'control'
-            return meta.id
-      }
-      .set { treat_meta }
 
-    treat = treat_meta.treat.collect()
-    control = treat_meta.ctrl.collect()
     if (params.mageck.run) {
-        MAGECK(ch_count, treat, control)
+        MAGECK(ch_count)
     }
     if (params.drugz.run) {
-        DRUGZ(ch_count, treat, control)
+        DRUGZ(ch_count)
     }
     if (params.bagel.run) {
-        BAGEL(ch_count, control)
+        BAGEL(ch_count)
     }
+
 }
diff --git a/modules/local/bagel.nf b/modules/local/bagel.nf
diff --git a/modules/local/bagel/bayes_factor/main.nf b/modules/local/bagel/bayes_factor/main.nf
@@ -0,0 +1,27 @@
+
+process BAGEL_BAYES_FACTOR {
+    tag { meta.control }
+    container "${params.containers.bagel}"
+    label 'process_single'
+
+    input:
+        tuple val(meta), path(fold_change)
+
+    output:
+        tuple val(meta), path("*.bf"), emit: bf
+
+    script:
+    """
+    BAGEL.py bf \\
+      -i ${fold_change} \\
+      -o ${fold_change.getBaseName(2)}.bf \\
+      -e ${params.bagel.core_essential_genes} \\
+      -n ${params.bagel.non_essential_genes} \\
+      -c ${params.bagel.test_columns}
+    """
+
+    stub:
+    """
+    touch ${fold_change.getBaseName(2)}.bf
+    """
+}