Skip to content

Commit

Permalink
#36 Add CD-HIT as an optional process
Browse files Browse the repository at this point in the history
  • Loading branch information
Negin Valizadegan committed Dec 3, 2021
1 parent 9ce491f commit e770302
Showing 1 changed file with 104 additions and 80 deletions.
184 changes: 104 additions & 80 deletions filter.nf
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
#!/usr/bin/env nextflow

/*
LIST OF TOOLS USED IN THIS PIPLINE:
1. blastn --> BLAST+/2.10.1
2. seqkit --> seqkit/0.12.1
3. kraken2 --> Kraken2/2.0.8
4. cd-hit --> CD-HIT/4.8.1
*/

/*Parameters that are specified at the command line or via config file*/
params.genome1 = false /*genome fasta file GRCh38, must specify complete path. Required parameters*/
params.genome2 = false /*genome fasta file GRCh38.p0, must specify complete path. Required parameters*/
params.genome3 = false /*genome fasta file, CHM13, must specify complete path. Required parameters*/
params.krakendb = false /*kraken database file, must specify complete path. Required parameters*/
params.samplePath = false /*input folder, must specify complete path. Required parameters*/
params.taxdbPath = false /*location of blast taxa database. Required parameters*/
params.genome1 = false /*genome fasta file GRCh38, must specify complete path. Required parameter*/
params.genome2 = false /*genome fasta file GRCh38.p0, must specify complete path. Required parameter*/
params.genome3 = false /*genome fasta file, CHM13, must specify complete path. Required parameter*/
params.krakendb = false /*kraken database file, must specify complete path. Required parameter*/
params.samplePath = false /*input folder, must specify complete path. Required parameter*/
params.taxdbPath = false /*location of blast taxa database. Required parameter*/
params.skipcdhit = false /* If true, cdhit would not be run. Optional parameter*/

/*Parameters to be used inside the pipeline */
params.outputDir = "./results" /*output folder, must specify path from current directory. Required parameters*/
params.outputDir = "./results" /*output folder, must specify path from current directory. Required parameter*/
params.assembler = 'masurca' /*options: megahit|masurca. Default is masurca*/

/*Parameters for seqkit */
Expand All @@ -25,9 +34,11 @@ params.blastnt_filter_pident = '60' /*filtering cut off for percentag
params.blastnt_filter_length = '100' /*filtering cut off for alignment length from blast NT. Default is 100*/
params.blastr_filter_pident = '95' /*filtering cut off for percentage of identical matches from blast ref genome. Default is 95*/
params.blastr_filter_qcov = '95' /*filtering cut off for query coverage from blast ref genome. Default is 95*/
params.cdhit_identity = '0.9' /*proportion of idenitity for clustering using cdhit. Default is 0.9*/
params.cdhit_wordsize = '7' /*word size for cdhit. Default is 7*/

/*Stage*/
stage = "annotation"
stage = "filter"

/*Results path*/
resultsPath = "${params.outputDir}/${stage}"
Expand Down Expand Up @@ -152,7 +163,7 @@ process filter_seqkit {
queue params.myQueue
memory "$defaultMemory GB"
module "seqkit/0.12.1"
publishDir "${resultsPath}/seqkit/${params.assembler}/",mode:"copy"
publishDir "${resultsPath}/SeqKit/${params.assembler}/",mode:"copy"

input:
tuple val(id), file(fasta) from fasta_Ch1
Expand All @@ -168,7 +179,7 @@ process filter_seqkit {
# Create seqkit stats before and after filtering ------
seqkit stats ${fasta} > ${id}_filter_stats.txt
seqkit stats ${id}_minLength_${params.min_read_length}.fasta | sed -e '1d' >> ${id}_filter_stats.txt
seqkit stats ${id}_minLength_${params.min_read_length}.fasta | sed -e '1d' >> ${id}_filter_stats.tx
"""
}
Expand All @@ -178,15 +189,15 @@ process filter_seqkit {
/*
2.1: RUN KRAKEN ON FILTERED READS
*/
process kraken {
process kraken2 {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "Kraken2/2.0.8-beta-IGB-gcc-4.9.4"
publishDir "${resultsPath}/kraken/${params.assembler}/",mode:"copy"
publishDir "${resultsPath}/Kraken2/${params.assembler}/",mode:"copy"

input:
tuple val(id), file(filtered_ln1) from filtered_ln_kraken
Expand Down Expand Up @@ -215,29 +226,31 @@ process kraken {
cut -f2 ${id}_kraken2_output_filtered.txt > ${id}_to_keep.txt
"""
}

/*
2.2: RUN BLAST (NT) ON FILTERED READS
*/
process blastncontam {
process blastcontam {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "BLAST+/2.10.1-IGB-gcc-8.2.0","ncbi-blastdb/20201212","seqkit/0.12.1"
publishDir "${resultsPath}/blast-contam/${params.assembler}/",mode:"copy"
publishDir "${resultsPath}/BLAST_Contam/${params.assembler}/",mode:"copy"


input:
tuple val(id), file(keep), file(filtered_ln2) from keep_list_kn.join(filtered_ln_blastnt)

output:
tuple val(id), file('*_kn_filtered.fasta') into kraken_filtered
tuple val(id), file('*_kn_filtered.fasta')
tuple val(id), file('blast_*_nt0.txt') optional true
tuple val(id), file('blast_*_nt.txt')
tuple val(id), file('*_to_remove_nt.txt')
tuple val(id), file('*_ids_remove_nt.txt') into remove_list_bl
tuple val(id), file('*_ids_remove_nt.txt')
tuple val(id), file('*_blst_kn_filtered.fasta') into blast_kn_filtered, blast_kn_filtered2, blast_kn_filtered3

script:
"""
Expand Down Expand Up @@ -268,24 +281,65 @@ process blastncontam {
awk -F"\t" '(\$4 > ${params.blastnt_filter_pident} && \$5 > ${params.blastnt_filter_length})' \
> ${id}_to_remove_nt.txt
cut -f1 ${id}_to_remove_nt.txt > ${id}_ids_remove_nt.txt
# Filter the fasta by the blast keep_list ------
seqkit grep -i -v -f ${id}_ids_remove_nt.txt ${id}_kn_filtered.fasta > ${id}_blst_kn_filtered.fasta
"""
}

/*
STEP 3: RUN CD-HIT ON FILTERED READS
*/
if (!params.skipcdhit) {
process cdhit {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "CD-HIT/4.8.1-IGB-gcc-8.2.0"
publishDir "${resultsPath}/CD-HIT/${params.assembler}/",mode:"copy"

input:
tuple val(id), file(filtered_kn_blst) from blast_kn_filtered

output:
tuple val(id), file('*_blst_kn_cdhit.fasta') into blast_kn_cdhit, blast_kn_cdhit2

script:
"""
# Use cd-hit to cluster and remove redundancy ------
cd-hit-est \
-i ${filtered_kn_blst} \
-o ${id}_blst_kn_cdhit.fasta \
-c ${params.cdhit_identity} \
-n ${params.cdhit_wordsize} \
-T ${task.cpus}
"""
}

} else {
blast_kn_filtered.into {blast_kn_cdhit;blast_kn_cdhit2}
}

/*
STEP 4: RUN BLAST ON FILTERED READS (GRCh38, GRCh38.p0, CHM13)
*/
process blastn {
process blastref {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "BLAST+/2.10.1-IGB-gcc-8.2.0","seqkit/0.12.1"
publishDir "${resultsPath}/blast/${params.assembler}/",mode:"copy"
publishDir "${resultsPath}/BLAST/${params.assembler}/",mode:"copy"

input:
tuple val(id), file(filtered_kn), file(remove_bl) from kraken_filtered.join(remove_list_bl)
tuple val(id), file(blast_kn_cdhit_filtered) from blast_kn_cdhit
file genome1 from genome_file1
file genome2 from genome_file2
file genome3 from genome_file3
Expand All @@ -294,25 +348,21 @@ process blastn {
file "*" from blast_db3_ch

output:
tuple val(id), file('*_blst_kn_filtered.fasta') into blast_kn_filtered
tuple val(id), file('blast_*.GRCh38.decoy.hla0.txt') optional true
tuple val(id), file('blast_*.GRCh38.decoy.hla.txt')
tuple val(id), file('blast_*.GRCh38.p0.no.decoy.hla0.txt') optional true
tuple val(id), file('blast_*.GRCh38.p0.no.decoy.hla.txt')
tuple val(id), file('blast_*.CHM13.v1.1_GRCh38.p13.chrY0.txt') optional true
tuple val(id), file('blast_*.GRCh38.decoy.hla_to_filter.txt')
tuple val(id), file('blast_*.CHM13.v1.1_GRCh38.p13.chrY_to_filter.txt')
tuple val(id), file('blast_*.GRCh38.p0.no.decoy.hla_to_filter.txt')
tuple val(id), file('blast_*.CHM13.v1.1_GRCh38.p13.chrY_to_filter.txt')
tuple val(id), file('blast_*.CHM13.v1.1_GRCh38.p13.chrY.txt')
tuple val(id), file('blast_*.GRCh38.decoy.hla_to_filter.txt') into filter_GRCH38_decoy_hla
tuple val(id), file('blast_*.GRCh38.p0.no.decoy.hla_to_filter.txt') into filter_GRCH38_p0
tuple val(id), file('blast_*.CHM13.v1.1_GRCh38.p13.chrY_to_filter.txt') into filter_CHM13_GRCH38

script:
"""
# Filter the fasta by the blast keep_list ------
seqkit grep -i -v -f ${remove_bl} ${filtered_kn} > ${id}_blst_kn_filtered.fasta
# Run blast (GRCh38) ------
blastn -db ${genome1} \
-query ${id}_blst_kn_filtered.fasta \
-query ${blast_kn_cdhit_filtered} \
-outfmt "6 qseqid sseqid stitle pident \
length evalue qcovs bitscore mismatch \
gapopen qstart qend qlen sstart send slen" \
Expand All @@ -328,13 +378,13 @@ process blastn {
qlen,sstart,send,slen" | tr ',' '\t'| cat - blast_${id}.GRCh38.decoy.hla0.txt > blast_${id}.GRCh38.decoy.hla.txt
rm blast_${id}.GRCh38.decoy.hla0.txt
# Filter the blast (GRCh38) ------
# Filter the blast (GRCh38.p0) ------
awk -F"\t" '(\$4 > ${params.blastr_filter_pident} && \$7 > ${params.blastr_filter_qcov})' \
blast_${id}.GRCh38.decoy.hla.txt > blast_${id}.GRCh38.decoy.hla_to_filter.txt
# Run blast (GRCh38.p0) ------
blastn -db ${genome2} \
-query ${id}_blst_kn_filtered.fasta \
-query ${blast_kn_cdhit_filtered} \
-outfmt "6 qseqid sseqid stitle pident \
length evalue qcovs bitscore mismatch \
gapopen qstart qend qlen sstart send slen" \
Expand All @@ -355,7 +405,7 @@ process blastn {
# Run blast (CHM13) ------
blastn -db ${genome3} \
-query ${id}_blst_kn_filtered.fasta \
-query ${blast_kn_cdhit_filtered} \
-outfmt "6 qseqid sseqid stitle pident \
length evalue qcovs bitscore mismatch \
gapopen qstart qend qlen sstart send slen" \
Expand All @@ -378,65 +428,39 @@ process blastn {
}

/*
STEP 5: RUN REPEAT MASKER ON FILTERED READS
process repeatmasker {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "RepeatMasker/4.1.2-p1-IGB-gcc-8.2.0-Perl-5.28.1"
publishDir "${resultsPath}/RepeatMasker/${params.assembler}/",mode:"copy"
input:
tuple val(id), file(filtered4) from filtered_file4
output:
file '*'
script:
"""
# Run Repeat Masker ------
RepeatMasker -species human ${filtered4} -nolow
"""
}
/*
STEP 6: RUN QUAST ON FILTERED READS
process quast {
STEP 5: FILTER FASTA FILE BASED ON BLAST STEP
*/
process final_filtering {
tag { id }
executor myExecutor
clusterOptions params.clusterAcct
cpus defaultCPU
queue params.myQueue
memory "$defaultMemory GB"
module "quast/5.0.0-IGB-gcc-4.9.4-Python-3.6.1"
publishDir "${resultsPath}/QUAST/${id}/",mode:"copy"
module "seqkit/0.12.1"
publishDir "${resultsPath}/Final-Filtered/${params.assembler}/",mode:"copy"

input:
tuple val(id), file(filtered5) from filtered_file5
tuple val(id), file(blast_kn_cdhit_filtered2) from blast_kn_cdhit2
tuple val(id), file(filter_GRCH38) from filter_GRCH38_decoy_hla
tuple val(id), file(filter_GRCH38p0) from filter_GRCH38_p0
tuple val(id), file(filter_CHM13) from filter_CHM13_GRCH38

output:
file '*'
tuple val(id), file('*_GRCH38_decoys_hla_Final_filter.fasta')
tuple val(id), file('*_GRCH38_p0_Final_filter.fasta')
tuple val(id), file('*_CHM13_Final_filter.fasta')

script:
"""
# Run QUAST ------
quast.py ${filtered5} \
--threads ${task.cpus}
# Filter the fasta using blast output (GRCh38) ------
seqkit grep -i -v -f ${filter_GRCH38} ${blast_kn_cdhit_filtered2} > ${id}_GRCH38_decoys_hla_Final_filter.fasta
# Filter the fasta using blast output (GRCh38.p0) ------
seqkit grep -i -v -f ${filter_GRCH38p0} ${blast_kn_cdhit_filtered2} > ${id}_GRCH38_p0_Final_filter.fasta
# Filter the fasta using blast output (CHM13) ------
seqkit grep -i -v -f ${filter_CHM13} ${blast_kn_cdhit_filtered2} > ${id}_CHM13_Final_filter.fasta
"""
}
*/

/*
LIST OF TOOLS USED IN THIS PIPLINE:
1. blastn --> BLAST+/2.10.1
2. seqkit --> seqkit/0.12.1
2. kraken2 --> Kraken2/2.0.8
4. repeatmasker --> RepeatMasker/4.1.2
5. quast --> quast/5.0.0
*/

}

0 comments on commit e770302

Please sign in to comment.