Skip to content

Commit

Permalink
Edited to make it appropriate for one sample, added --remove-gap to s…
Browse files Browse the repository at this point in the history
…eqkit and edited blastn + added restricting number of hits
  • Loading branch information
Negin Valizadegan committed Sep 9, 2021
1 parent 696cbcf commit 627872f
Showing 1 changed file with 155 additions and 47 deletions.
202 changes: 155 additions & 47 deletions annotation.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/bash

#SBATCH --mem 18G
#SBATCH --mem 70G
#SBATCH --job-name annotation
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL
#SBATCH --mail-type ALL
#SBATCH --output slurm-%j.out
#SBATCH -n 2
#SBATCH -n 1
#SBATCH -N 1
#SBATCH -A h3abionet
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out
Expand All @@ -14,8 +14,6 @@

# HPCBio UIUC Annotation pipeline; Created by Negin Valizadegan Sep 2, 2021; [email protected]



##############################################################################
## ##
## GENERAL WRAPPER RELATED SCRIPTS ##
Expand All @@ -32,15 +30,15 @@ function HELP {
echo "${BOLD}Help documentation for the HPCBio UIUC Annotation pipeline${NORM}"
echo ""
echo "The following options must be specified:"
echo "${REV}-d${NORM} The full path to the Final Assembly directory${NORM}"
echo "${REV}-d${NORM} The full path to the main results directory${NORM}"
#echo "${REV}-s${NORM} TRUE for single-end and FALSE for paired-end${NORM}"
#echo "${REV}-f${NORM} Adapter/primer sequence to be trimmed from R1${NORM}"
#echo "${REV}-r${NORM} Adapter/primer sequence to be trimmed from R2${NORM}"
#echo "${REV}-l${NORM} DADA2 truncation length for R1${NORM}"
#echo "${REV}-n${NORM} DADA2 truncation length for R2${NORM}"
echo "${REV}-h${NORM} Displays this help message without complaining about lack of arguments"
echo ""
echo "Example: sh annotation.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/assembly/Final-Assembly/"
echo "Example: sbatch annotation.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/"
exit 1
}

Expand Down Expand Up @@ -107,77 +105,185 @@ fi
# exit 1
#fi


##############################################################################
## ##
## STEP 1: LOAD THE NECESSARY MODULES ##
## ##
##############################################################################
setup ()
{

# ----- Load necessary modules ------

# Unload loaded modules -----
module purge

# Load what we need -----
module load seqkit/0.12.1
module load BLAST+/2.10.1-IGB-gcc-8.2.0

echo "Necessary modules are loaded"

}

##############################################################################
## ##
## STEP 1: SET UP DIRECTORY AND LOAD MODULES ##
## STEP 2: FILTER THE ASSEMBLY FILES ##
## ##
##############################################################################

setup()
filter ()
{
# Set working directory -----
# command + / for commenting
cd ${OPT_d}

# ----- Set working directory -----
cd ${OPT_d}/assembly/Final-Assembly/

echo "Input directory is set to" | tr '\n' ' ' && pwd

# Create a directory for outputs ----
mkdir -p ../../annotation/seqkit/masurca
mkdir -p ../../annotation/seqkit/megahit

# Load necessary modules ------
module purge
module load seqkit/0.12.1
# ----- Filter sequences < 500bp length -----
echo "Start of filtering process masurca"

# Masurca -----
cd masurca
echo "Now directory is set to" | tr '\n' ' ' && pwd

seqkit seq --min-len 500 --remove-gaps HG03563.masurca.final.fasta > ../../../annotation/seqkit/masurca/HG03563.filtered.fasta
echo "[Now creating stats on filtered fasta files]"
seqkit stats HG03563.masurca.final.fasta >> ../../../annotation/seqkit/masurca/HG03563.stats_before_filter.masurca.txt
seqkit stats ../../../annotation/seqkit/masurca/HG03563.filtered.fasta >> ../../../annotation/seqkit/masurca/HG03563.stats_after_filter.masurca.txt


echo "End of filtering process"

echo "Necessary modules are loaded"
}


##############################################################################
## ##
## STEP 2: FILTER THE ASSEMBLY FILES ##
## STEP 3: CREATE BLAST DATABASE (GRCh38) ##
## ##
##############################################################################

filter ()
blastdbGRCh38 ()
{

# Filter sequences < 500bp length -----
# ----- Set working directory -----
cd ${OPT_d}

echo "Input directory is set to" | tr '\n' ' ' && pwd

# ----- Creat Blast database from GRC38.decoy.hla -----
makeblastdb -in ../GRCh38 -parse_seqids -title "GRCh38.decoy.hla" -dbtype nucl

#Building a new DB, current time: 09/03/2021 18:47:07
#New DB name: /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa
#New DB title: GRCh38.decoy.hla
#Sequence type: Nucleotide
#Keep MBits: T
#Maximum file size: 1000000000B
#Adding sequences from FASTA; added 3366 sequences in 30.3125 seconds.

}


##############################################################################
## ##
## STEP 4: RUN BLAST ON FILTERED READS (GRCh38) ##
## ##
##############################################################################

blastGRCh38 ()
{

# ----- Run blastn on filtered sequences from previous step ------

# ----- Load necessary modules ------
module load BLAST+/2.10.1-IGB-gcc-8.2.0

# Set working directory -----
cd ${OPT_d}/annotation
echo "Directory is set to" | tr '\n' ' ' && pwd

# Make a directory for blast ------
mkdir -p blast/masurca
mkdir -p blast/megahit

echo "Start of BLASTN process"

# Masurca -----
cd masurca
cd seqkit/masurca

for i in *.masurca.final.fasta
do idi=$(echo $i | sed 's/.final.fasta//1')
echo "[Now filtering ${i}]"
seqkit seq --min-len 500 ${i} > ../../../annotation/seqkit/masurca/$idi.filtered.fasta
echo "[Now creating stats on filtered fasta files]"
seqkit stats ${i} >> ../../../annotation/seqkit/masurca/stats_before_filter.txt
seqkit stats ../../../annotation/seqkit/masurca/$idi.filtered.fasta >> ../../../annotation/seqkit/masurca/stats_after_filter.masurca.txt
done
# In order for the following codes to run properly without warning, you need to create a hidden file named .ncbirc and
# put it inside seqkit/masurca and seqkit/megahit
# Contents of this file (remove the # from each line):

#; Start the section for BLAST configuration
#[BLAST]
#; Specifies the path where BLAST databases are installed
#BLASTDB=/home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/GRCh38/

# Megahit -----
cd ../megahit
for j in *.megahit.final.fasta
do idj=$(echo $j | sed 's/.final.fasta//1')
echo "[Now filtering ${j}]"
seqkit seq --min-len 500 ${j} > ../../../annotation/seqkit/megahit/$idj.filtered.fasta
echo "[Now creating stats on filtered fasta files]"
seqkit stats ${j} >> ../../../annotation/seqkit/megahit/stats_before_filter.txt
seqkit stats ../../../annotation/seqkit/megahit/$idj.filtered.fasta >> ../../../annotation/seqkit/megahit/stats_after_filter.megahit.txt
done
# Run blast ------

# srun -A h3abionet -n 1 -N 1 --mem 20g -p lowmem --pty /bin/bash

echo "RUNNING HG03563.masurca.filtered.fasta"

start=`date +%s`
# Restrict number of hits -----
blastn -db ../../../../GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa \
-query HG03563.masurca.filtered.fasta\
-out ../../blast/masurca/blast_HG03563.masurca.filtered-limit.asn \
-outfmt 11 \
-max_target_seqs 50 \
-max_hsps 10 \
-evalue 1e-5 \
-perc_identity 90

echo "Convert blast archive .asn to tabular format"

# Reset to assembly directory -----
cd ..
echo "Directory is reset back to" | tr '\n' ' ' && pwd
blast_formatter -archive ../../blast/masurca/blast_HG03563.masurca.filtered-limit.asn \
-outfmt "7 qseqid sseqid stitle pident length evalue qcovs bitscore sblastnames mismatch gapopen qstart qend qlen sstart send slen" > ../../blast/masurca/blast_HG03563.masurca.filtered-limit.tsv

end=`date +%s`
runtime=$((end-start))
echo "$runtime"


start=`date +%s`
# Without limiting -----
blastn -db ../../../../GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa \
-query HG03563.masurca.filtered.fasta\
-out ../../blast/masurca/blast_HG03563.masurca.filtered.asn \
-outfmt 11 \
-evalue 1e-5 \
-perc_identity 90

echo "Convert blast archive .asn to tabular format"

blast_formatter -archive ../../blast/masurca/blast_HG03563.masurca.filtered.asn \
-outfmt "7 qseqid sseqid stitle pident length evalue qcovs bitscore sblastnames mismatch gapopen qstart qend qlen sstart send slen" > ../../blast/masurca/blast_HG03563.masurca.filtered.tsv

end=`date +%s`
runtime=$((end-start))
echo "$runtime"

# echo -e "qseqid,sseqid,stitle,pident,length,evalue,qcovs,bitscore,sblastnames,mismatch,gapopen,qstart,qend,qlen sstart send slen" | tr ',' '\t'| cat - ../blast/masurca/blast_HG03563.filtered.0.asn > ../blast/masurca/blast_HG03563.filtered.tsv
# rm ../../blast/masurca/blast_HG03563.filtered.0.tsv
echo "Blasted HG03563.filtered.fasta"

echo "End of BLASTN process"
}


##############################################################################
## ##
## MAIN ##
## ##
## ##
## MAIN ##
## ##
##############################################################################

# Main function runs each step/function of the pipeline separately so that
Expand All @@ -186,11 +292,13 @@ echo "Directory is reset back to" | tr '\n' ' ' && pwd
main()
{
# Determine whether running full pipeline or single step
runtype="FULL"
runtype="SINGLE STEP"

echo "*** RUNNING ${runtype} Annotation Pipeline ***"
setup
filter
echo "*** RUNNING ${runtype} ANNOTATION PIPELINE ***"
setup
#filter
#blastdbGRCh38
blastGRCh38
}

# Run main function
Expand Down

0 comments on commit 627872f

Please sign in to comment.