Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor samplesheet to handle multiple treatments & controls #44

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ replay_pid*
work/
data/
results/
output/
/params.yaml

# python packaging
Expand Down
Binary file modified assets/dag.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions assets/samplesheet_test_biowulf.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions assets/samplesheet_test_mle.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 18 additions & 14 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import errno
import argparse
import pprint


def parse_args(args=None):
Expand Down Expand Up @@ -51,7 +52,7 @@ def check_samplesheet(file_in, file_out):
with open(file_in, "r", encoding="utf-8-sig") as fin:
## Check header
MIN_COLS = 2
HEADER = ["sample", "fastq_1", "fastq_2", "treat_or_ctrl"]
HEADER = ["sample", "rep", "fastq_1", "fastq_2", "control"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print(
Expand Down Expand Up @@ -81,7 +82,10 @@ def check_samplesheet(file_in, file_out):
)

## Check sample name entries
sample, fastq_1, fastq_2, treat_or_ctrl = lspl[: len(HEADER)]
sample_basename, rep, fastq_1, fastq_2, control = lspl[: len(HEADER)]
print("lspl")
pprint.pprint(lspl)
sample = f"{sample_basename}_{rep}" if rep else sample_basename
if sample.find(" ") != -1:
print(
f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
Expand All @@ -95,21 +99,13 @@ def check_samplesheet(file_in, file_out):
if fastq:
if fastq.find(" ") != -1:
print_error("FastQ file contains spaces!", "Line", line)
if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
if not fastq.endswith("fastq.gz") and not fastq.endswith(".fq.gz"):
print_error(
"FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
"FastQ file does not have extension 'fastq.gz' or '.fq.gz'!",
"Line",
line,
)

## check treatment or control designation
if treat_or_ctrl not in {"treatment", "control"}:
print_error(
"treat_or_ctrl column can only contain values `treatment` or `control`.",
"Line",
line,
)

## Auto-detect paired-end/single-end
if sample and fastq_1 and fastq_2: ## Paired-end short reads
is_single = "0"
Expand All @@ -118,7 +114,7 @@ def check_samplesheet(file_in, file_out):
else:
print_error("Invalid combination of columns provided!", "Line", line)

sample_info = [is_single, fastq_1, fastq_2, treat_or_ctrl]
sample_info = [sample_basename, rep, is_single, fastq_1, fastq_2, control]
## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2,]]}
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
Expand All @@ -135,7 +131,15 @@ def check_samplesheet(file_in, file_out):
with open(file_out, "w") as fout:
fout.write(
",".join(
["sample", "single_end", "fastq_1", "fastq_2", "treat_or_ctrl"]
[
"sample",
"sample_basename",
"rep",
"single_end",
"fastq_1",
"fastq_2",
"control",
]
)
+ "\n"
)
Expand Down
2 changes: 1 addition & 1 deletion conf/biowulf.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ env.SINGULARITY_CACHEDIR = "/data/CCBR_Pipeliner/SIFS"


process.clusterOptions = ' --gres=lscratch:200 '
process.scratch = '/lscratch/$SLURM_JOBID'
process.scratch = '/lscratch/$SLURM_JOB_ID'
process.stageInMode = 'symlink'
process.stageOutMode = 'rsync'
// for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps
Expand Down
4 changes: 2 additions & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ params {

input = 'assets/samplesheet_test_biowulf.csv'
library = 'assets/lib/yusa_library.csv'
outdir = 'results/test'
outdir = 'output/test'
exp_name = 'test'

publish_dir_mode = 'symlink'
publish_dir_mode = 'link'

mageck.run = true

Expand Down
4 changes: 3 additions & 1 deletion conf/test_mle.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ params {

input = 'assets/samplesheet_test_mle.csv'
library = 'assets/lib/yusa_library.csv'
outdir = 'results/test_mle'
outdir = 'output/test_mle'
exp_name = 'leukemia'

publish_dir_mode = 'link'

count_table = '/data/CCBR_Pipeliner/testdata/CRISPIN/leukemia.new.csv'
design_matrix = '/data/CCBR_Pipeliner/testdata/CRISPIN/designmat.txt'

Expand Down
32 changes: 27 additions & 5 deletions lib/Utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,39 @@ class Utils {
def command_string = "spooker ${workflow.launchDir} ${pipeline_name}"
def out = new StringBuilder()
def err = new StringBuilder()
def spooker_in_path = check_command_in_path("spooker")
if (spooker_in_path) {
try {
println "Running spooker"
def command = command_string.execute()
command.consumeProcessOutput(out, err)
command.waitFor()
} catch(IOException e) {
err = e
}
// TODO create log/ dir if it doesn't exist
new FileWriter("${workflow.launchDir}/log/spooker.log").with {
write("${out}\n${err}")
flush()
}
} else {
err = "spooker not found, skipping"
}
return err
}
// check whether a command is in the path
public static Boolean check_command_in_path(cmd) {
def command_string = "command -V ${cmd}"
def out = new StringBuilder()
def err = new StringBuilder()
try {
def command = command_string.execute()
command.consumeProcessOutput(out, err)
command.waitFor()
} catch(IOException e) {
err = e
}
new FileWriter("${workflow.launchDir}/log/spooker.log").with {
write("${out}\n${err}")
flush()
}
return err
return err.length()==0

}
}
78 changes: 56 additions & 22 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ input : ${params.input}

// SUBMODULES
include { INPUT_CHECK } from './subworkflows/local/input_check.nf'
include { TRIM_COUNT } from './subworkflows/local/trim_count.nf'
include { MAGECK } from './subworkflows/local/mageck.nf'
include { BAGEL } from './subworkflows/local/bagel.nf'

// MODULES
include { DRUGZ } from './modules/local/drugz.nf'
include { CUTADAPT } from './modules/CCBR/cutadapt'
include { MAGECK_COUNT } from './modules/local/mageck/count'
include { DRUGZ } from './modules/local/drugz'

workflow.onComplete {
if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) {
Expand All @@ -37,34 +38,67 @@ workflow.onComplete {

workflow {
INPUT_CHECK(file(params.input))
INPUT_CHECK.out
.reads
.set { raw_reads }
INPUT_CHECK.out.reads
| set { raw_reads }

ch_count = params.count_table ? file(params.count_table, checkIfExists: true) : null
if (!ch_count) { // trim reads and run mageck count
TRIM_COUNT(raw_reads, file(params.library, checkIfExists: true))
ch_count = TRIM_COUNT.out.count
if (params.count_table) {

// if count table exists, re-populate metadata with linked controls
ch_branched = raw_reads.branch { meta, fastqs ->
treat: meta.control.length() > 0
return [ [control: meta.control], meta.id ]
control: meta.control.length() == 0
return [ [control: meta.sample_basename], meta.id ]
}
ch_count = ch_branched.treat
| groupTuple()
| map { meta, ids ->
[ meta, ids.flatten().join(',')]
}
| join( ch_branched.control )
| map { meta, ids, ctrl ->
meta.ids = ids
[ meta ]
}
| combine( Channel.fromPath(file(params.count_table, checkIfExists: true)) )
ch_count | view
} else {
// trim reads and run mageck count
CUTADAPT(raw_reads)

// group samples with their controls
ch_branched = CUTADAPT.out.reads.branch { meta, fastqs ->
treat: meta.control.length() > 0
return [ [control: meta.control], meta.id, fastqs ]
control: meta.control.length() == 0
return [ [control: meta.sample_basename], fastqs ]
}
ch_treat_ctrl_linked = ch_branched.treat
| groupTuple()
| map { meta, ids, fastqs ->
[ meta, ids.flatten().join(','), fastqs.flatten() ]
}
| join( ch_branched.control )
| map { meta, ids, sample_fastqs, ctrl_fastqs ->
meta.ids = ids
[ meta, sample_fastqs, ctrl_fastqs ]
}

MAGECK_COUNT(ch_treat_ctrl_linked,
file(params.library, checkIfExists: true)
)
ch_count = MAGECK_COUNT.out.count
}

raw_reads
.branch { meta, fastq ->
treat: meta.treat_or_ctrl == 'treatment'
return meta.id
ctrl: meta.treat_or_ctrl == 'control'
return meta.id
}
.set { treat_meta }

treat = treat_meta.treat.collect()
control = treat_meta.ctrl.collect()
if (params.mageck.run) {
MAGECK(ch_count, treat, control)
MAGECK(ch_count)
}
if (params.drugz.run) {
DRUGZ(ch_count, treat, control)
DRUGZ(ch_count)
}
if (params.bagel.run) {
BAGEL(ch_count, control)
BAGEL(ch_count)
}

}
79 changes: 0 additions & 79 deletions modules/local/bagel.nf

This file was deleted.

27 changes: 27 additions & 0 deletions modules/local/bagel/bayes_factor/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

process BAGEL_BAYES_FACTOR {
tag { meta.control }
container "${params.containers.bagel}"
label 'process_single'

input:
tuple val(meta), path(fold_change)

output:
tuple val(meta), path("*.bf"), emit: bf

script:
"""
BAGEL.py bf \\
-i ${fold_change} \\
-o ${fold_change.getBaseName(2)}.bf \\
-e ${params.bagel.core_essential_genes} \\
-n ${params.bagel.non_essential_genes} \\
-c ${params.bagel.test_columns}
"""

stub:
"""
touch ${fold_change.getBaseName(2)}.bf
"""
}
Loading
Loading