-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Edited to make it appropriate for one sample, added --remove-gap to s…
…eqkit and edited blastn + added restricting number of hits
- Loading branch information
Negin Valizadegan
committed
Sep 9, 2021
1 parent
696cbcf
commit 627872f
Showing
1 changed file
with
155 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
#!/bin/bash | ||
|
||
#SBATCH --mem 18G | ||
#SBATCH --mem 70G | ||
#SBATCH --job-name annotation | ||
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL | ||
#SBATCH --mail-type ALL | ||
#SBATCH --output slurm-%j.out | ||
#SBATCH -n 2 | ||
#SBATCH -n 1 | ||
#SBATCH -N 1 | ||
#SBATCH -A h3abionet | ||
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out | ||
|
@@ -14,8 +14,6 @@ | |
|
||
# HPCBio UIUC Annotation pipeline; Created by Negin Valizadegan Sep 2, 2021; [email protected] | ||
|
||
|
||
|
||
############################################################################## | ||
## ## | ||
## GENERAL WRAPPER RELATED SCRIPTS ## | ||
|
@@ -32,15 +30,15 @@ function HELP { | |
echo "${BOLD}Help documentation for the HPCBio UIUC Annotation pipeline${NORM}" | ||
echo "" | ||
echo "The following options must be specified:" | ||
echo "${REV}-d${NORM} The full path to the Final Assembly directory${NORM}" | ||
echo "${REV}-d${NORM} The full path to the main results directory${NORM}" | ||
#echo "${REV}-s${NORM} TRUE for single-end and FALSE for paired-end${NORM}" | ||
#echo "${REV}-f${NORM} Adapter/primer sequence to be trimmed from R1${NORM}" | ||
#echo "${REV}-r${NORM} Adapter/primer sequence to be trimmed from R2${NORM}" | ||
#echo "${REV}-l${NORM} DADA2 truncation length for R1${NORM}" | ||
#echo "${REV}-n${NORM} DADA2 truncation length for R2${NORM}" | ||
echo "${REV}-h${NORM} Displays this help message without complaining about lack of arguments" | ||
echo "" | ||
echo "Example: sh annotation.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/assembly/Final-Assembly/" | ||
echo "Example: sbatch annotation.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/" | ||
exit 1 | ||
} | ||
|
||
|
@@ -107,77 +105,185 @@ fi | |
# exit 1 | ||
#fi | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 1: LOAD THE NECESSARY MODULES ## | ||
## ## | ||
############################################################################## | ||
setup () | ||
{ | ||
|
||
# ----- Load necessary modules ------ | ||
|
||
# Unload loaded modules ----- | ||
module purge | ||
|
||
# Load what we need ----- | ||
module load seqkit/0.12.1 | ||
module load BLAST+/2.10.1-IGB-gcc-8.2.0 | ||
|
||
echo "Necessary modules are loaded" | ||
|
||
} | ||
|
||
############################################################################## | ||
## ## | ||
## STEP 1: SET UP DIRECTORY AND LOAD MODULES ## | ||
## STEP 2: FILTER THE ASSEMBLY FILES ## | ||
## ## | ||
############################################################################## | ||
|
||
setup() | ||
filter () | ||
{ | ||
# Set working directory ----- | ||
# command + / for commenting | ||
cd ${OPT_d} | ||
|
||
# ----- Set working directory ----- | ||
cd ${OPT_d}/assembly/Final-Assembly/ | ||
|
||
echo "Input directory is set to" | tr '\n' ' ' && pwd | ||
|
||
# Create a directory for outputs ---- | ||
mkdir -p ../../annotation/seqkit/masurca | ||
mkdir -p ../../annotation/seqkit/megahit | ||
|
||
# Load necessary modules ------ | ||
module purge | ||
module load seqkit/0.12.1 | ||
# ----- Filter sequences < 500bp length ----- | ||
echo "Start of filtering process masurca" | ||
|
||
# Masurca ----- | ||
cd masurca | ||
echo "Now directory is set to" | tr '\n' ' ' && pwd | ||
|
||
seqkit seq --min-len 500 --remove-gaps HG03563.masurca.final.fasta > ../../../annotation/seqkit/masurca/HG03563.filtered.fasta | ||
echo "[Now creating stats on filtered fasta files]" | ||
seqkit stats HG03563.masurca.final.fasta >> ../../../annotation/seqkit/masurca/HG03563.stats_before_filter.masurca.txt | ||
seqkit stats ../../../annotation/seqkit/masurca/HG03563.filtered.fasta >> ../../../annotation/seqkit/masurca/HG03563.stats_after_filter.masurca.txt | ||
|
||
|
||
echo "End of filtering process" | ||
|
||
echo "Necessary modules are loaded" | ||
} | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 2: FILTER THE ASSEMBLY FILES ## | ||
## STEP 3: CREATE BLAST DATABASE (GRCh38) ## | ||
## ## | ||
############################################################################## | ||
|
||
filter () | ||
blastdbGRCh38 () | ||
{ | ||
|
||
# Filter sequences < 500bp length ----- | ||
# ----- Set working directory ----- | ||
cd ${OPT_d} | ||
|
||
echo "Input directory is set to" | tr '\n' ' ' && pwd | ||
|
||
# ----- Creat Blast database from GRC38.decoy.hla ----- | ||
makeblastdb -in ../GRCh38 -parse_seqids -title "GRCh38.decoy.hla" -dbtype nucl | ||
|
||
#Building a new DB, current time: 09/03/2021 18:47:07 | ||
#New DB name: /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa | ||
#New DB title: GRCh38.decoy.hla | ||
#Sequence type: Nucleotide | ||
#Keep MBits: T | ||
#Maximum file size: 1000000000B | ||
#Adding sequences from FASTA; added 3366 sequences in 30.3125 seconds. | ||
|
||
} | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 4: RUN BLAST ON FILTERED READS (GRCh38) ## | ||
## ## | ||
############################################################################## | ||
|
||
blastGRCh38 () | ||
{ | ||
|
||
# ----- Run blastn on filtered sequences from previous step ------ | ||
|
||
# ----- Load necessary modules ------ | ||
module load BLAST+/2.10.1-IGB-gcc-8.2.0 | ||
|
||
# Set working directory ----- | ||
cd ${OPT_d}/annotation | ||
echo "Directory is set to" | tr '\n' ' ' && pwd | ||
|
||
# Make a directory for blast ------ | ||
mkdir -p blast/masurca | ||
mkdir -p blast/megahit | ||
|
||
echo "Start of BLASTN process" | ||
|
||
# Masurca ----- | ||
cd masurca | ||
cd seqkit/masurca | ||
|
||
for i in *.masurca.final.fasta | ||
do idi=$(echo $i | sed 's/.final.fasta//1') | ||
echo "[Now filtering ${i}]" | ||
seqkit seq --min-len 500 ${i} > ../../../annotation/seqkit/masurca/$idi.filtered.fasta | ||
echo "[Now creating stats on filtered fasta files]" | ||
seqkit stats ${i} >> ../../../annotation/seqkit/masurca/stats_before_filter.txt | ||
seqkit stats ../../../annotation/seqkit/masurca/$idi.filtered.fasta >> ../../../annotation/seqkit/masurca/stats_after_filter.masurca.txt | ||
done | ||
# In order for the following codes to run properly without warning, you need to create a hidden file named .ncbirc and | ||
# put it inside seqkit/masurca and seqkit/megahit | ||
# Contents of this file (remove the # from each line): | ||
|
||
#; Start the section for BLAST configuration | ||
#[BLAST] | ||
#; Specifies the path where BLAST databases are installed | ||
#BLASTDB=/home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/GRCh38/ | ||
|
||
# Megahit ----- | ||
cd ../megahit | ||
for j in *.megahit.final.fasta | ||
do idj=$(echo $j | sed 's/.final.fasta//1') | ||
echo "[Now filtering ${j}]" | ||
seqkit seq --min-len 500 ${j} > ../../../annotation/seqkit/megahit/$idj.filtered.fasta | ||
echo "[Now creating stats on filtered fasta files]" | ||
seqkit stats ${j} >> ../../../annotation/seqkit/megahit/stats_before_filter.txt | ||
seqkit stats ../../../annotation/seqkit/megahit/$idj.filtered.fasta >> ../../../annotation/seqkit/megahit/stats_after_filter.megahit.txt | ||
done | ||
# Run blast ------ | ||
|
||
# srun -A h3abionet -n 1 -N 1 --mem 20g -p lowmem --pty /bin/bash | ||
|
||
echo "RUNNING HG03563.masurca.filtered.fasta" | ||
|
||
start=`date +%s` | ||
# Restrict number of hits ----- | ||
blastn -db ../../../../GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa \ | ||
-query HG03563.masurca.filtered.fasta\ | ||
-out ../../blast/masurca/blast_HG03563.masurca.filtered-limit.asn \ | ||
-outfmt 11 \ | ||
-max_target_seqs 50 \ | ||
-max_hsps 10 \ | ||
-evalue 1e-5 \ | ||
-perc_identity 90 | ||
|
||
echo "Convert blast archive .asn to tabular format" | ||
|
||
# Reset to assembly directory ----- | ||
cd .. | ||
echo "Directory is reset back to" | tr '\n' ' ' && pwd | ||
blast_formatter -archive ../../blast/masurca/blast_HG03563.masurca.filtered-limit.asn \ | ||
-outfmt "7 qseqid sseqid stitle pident length evalue qcovs bitscore sblastnames mismatch gapopen qstart qend qlen sstart send slen" > ../../blast/masurca/blast_HG03563.masurca.filtered-limit.tsv | ||
|
||
end=`date +%s` | ||
runtime=$((end-start)) | ||
echo "$runtime" | ||
|
||
|
||
start=`date +%s` | ||
# Without limiting ----- | ||
blastn -db ../../../../GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa \ | ||
-query HG03563.masurca.filtered.fasta\ | ||
-out ../../blast/masurca/blast_HG03563.masurca.filtered.asn \ | ||
-outfmt 11 \ | ||
-evalue 1e-5 \ | ||
-perc_identity 90 | ||
|
||
echo "Convert blast archive .asn to tabular format" | ||
|
||
blast_formatter -archive ../../blast/masurca/blast_HG03563.masurca.filtered.asn \ | ||
-outfmt "7 qseqid sseqid stitle pident length evalue qcovs bitscore sblastnames mismatch gapopen qstart qend qlen sstart send slen" > ../../blast/masurca/blast_HG03563.masurca.filtered.tsv | ||
|
||
end=`date +%s` | ||
runtime=$((end-start)) | ||
echo "$runtime" | ||
|
||
# echo -e "qseqid,sseqid,stitle,pident,length,evalue,qcovs,bitscore,sblastnames,mismatch,gapopen,qstart,qend,qlen sstart send slen" | tr ',' '\t'| cat - ../blast/masurca/blast_HG03563.filtered.0.asn > ../blast/masurca/blast_HG03563.filtered.tsv | ||
# rm ../../blast/masurca/blast_HG03563.filtered.0.tsv | ||
echo "Blasted HG03563.filtered.fasta" | ||
|
||
echo "End of BLASTN process" | ||
} | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## MAIN ## | ||
## ## | ||
## ## | ||
## MAIN ## | ||
## ## | ||
############################################################################## | ||
|
||
# Main function runs each step/function of the pipeline separately so that | ||
|
@@ -186,11 +292,13 @@ echo "Directory is reset back to" | tr '\n' ' ' && pwd | |
main() | ||
{ | ||
# Determine whether running full pipeline or single step | ||
runtype="FULL" | ||
runtype="SINGLE STEP" | ||
|
||
echo "*** RUNNING ${runtype} Annotation Pipeline ***" | ||
setup | ||
filter | ||
echo "*** RUNNING ${runtype} ANNOTATION PIPELINE ***" | ||
setup | ||
#filter | ||
#blastdbGRCh38 | ||
blastGRCh38 | ||
} | ||
|
||
# Run main function | ||
|