Skip to content

Commit

Permalink
#36 Comparing test data with RefGraph UIUC Pipeline files at various …
Browse files Browse the repository at this point in the history
…steps
  • Loading branch information
Negin Valizadegan committed Feb 23, 2022
1 parent cc16dc6 commit eef94d0
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 0 deletions.
208 changes: 208 additions & 0 deletions blast_compare_AFR.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
#!/bin/bash

#SBATCH --mem 50G
#SBATCH --job-name compare-test
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL
#SBATCH --mail-type ALL
#SBATCH -n 2
#SBATCH -N 1
#SBATCH -A h3abionet
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out



### HPCBio: This compares test long read insertions with our files -----; Created by Negin Valizadegan Feb 19, 20222; [email protected]

##############################################################################
## ##
## GENERAL WRAPPER RELATED SCRIPTS ##
## ##
##############################################################################

# Set fancy fonts for the help message ------
NORM=`tput sgr0`
BOLD=`tput bold`
REV=`tput smso`

# Help ------
function HELP {
echo ""
echo "${BOLD}Help Documentation for the HPCBio UIUC RefGraph (Testing) Pipeline${NORM}"
echo ""
echo "The Following Options Must Be Specified:"
echo "${REV}-d${NORM} The full path to the results directory${NORM} (Required)"
echo "${REV}-s${NORM} Path to the subject fasta file [file to be tested against] (Required)"
echo "${REV}-q${NORM} Path to the query fasta file [file to be tested] (Required)"
echo "${REV}-i${NORM} percentage of identity for blast (Required)"
echo "${REV}-t${NORM} number of aligned sequences to keep in blast [max_target_seqs] (Required)"
echo "${REV}-p${NORM} maximum number of HSPs (alignments) to keep for any single query-subject pair in blast [max_hsps] (Required)"
echo "${REV}-e${NORM} e-value for blast (Required)"
echo "${REV}-h${NORM} Displays this help message without complaints (Optional)"
echo ""
echo "[ ${NORM}${BOLD}Example:${NORM} sbatch blast_compare_AFR.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/ \
-s /home/groups/h3abionet/RefGraph/results/2022-01-30-HGSVCv2-truthset/data/insertions.fasta \
-q assembly/Final-Assembly/masurca/AFR/merged_sequences_AFR_Final_Assembly.fasta \
-i 100 \
-t 5 \
-p 10 \
-e 1e-5 ]"
echo ""
echo "${BOLD}Other Example Queries:${NORM}"
echo "filter/SeqKit/masurca/AFR/merged_sequences_AFR_Seqkit_500.fasta;"
echo "filter/BLAST_Contam/masurca/AFR/merged_masurca_kn_filtered.fasta;"
echo "filter/BLAST_Contam/masurca/AFR/merged_masurca_blst_kn_filtered.fasta;"
echo "filter/CD-HIT/masurca/AFR/merged_masurca_blst_kn_cdhit.fasta;"
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_GRCH38_p0_filter.final.fasta;"
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_GRCH38_decoys_hla_filter.final.fasta"
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_CHM13_filter.final.fasta;"
echo "annotation/Merged_Reads/masurca/merged_sequences_GRCH38_p0.fasta;"m
echo "annotation/Merged_Reads/masurca/merged_sequences_GRCH38_decoys.fasta"
echo "annotation/Merged_Reads/masurca/merged_sequences_CHM13.fasta;"
echo "annotation/Cluster_CDHIT/masurca/clustered_GRCH38_p0.fasta;"
echo "annotation/Cluster_CDHIT/masurca/clustered_GRCH38_decoys.fasta"
echo "annotation/Cluster_CDHIT/masurca/clustered_CHM13.fasta"
echo ""
exit 1
}


# Check the number of arguments. If none are passed, print message and exit ------
NUMARGS=$#
if [ $NUMARGS -eq 0 ]; then
echo ""
echo "You Did Not Pass Any Arguments. Please Specify the Arguments Below:"
echo ""
HELP
fi


# Parse the inputs
while getopts :d:s:q:i:c:t:p:e:h FLAG; do
case $FLAG in
d) #set option "d"
OPT_d=$OPTARG
;;
s) #set option "s"
OPT_s=$OPTARG
;;
q) #set option "q"
OPT_q=$OPTARG
;;
i) #set option "i"
OPT_i=$OPTARG
;;
t) #set option "t"
OPT_t=$OPTARG
;;
p) #set option "p"
OPT_p=$OPTARG
;;
e) #set option "e"
OPT_e=$OPTARG
;;
h) #set option "h"
OPT_h=$OPTARG
HELP
;;
\?) #unrecognized option - show help
echo "Option -${BOLD}$OPTARG${NORM} not allowed."
exit 1
;;
esac
done


# Exit if necessary options are not passed ------
if [[ -z "$OPT_d" ]]; then
echo "No project directory specified, aborting script"
exit 1
fi

if [[ -z "$OPT_s" ]]; then
echo "No subject sequence path is specified, aborting script"
exit 1
fi

if [[ -z "$OPT_q" ]]; then
echo "No query sequence path is specified, aborting script"
exit 1
fi

if [[ -z "$OPT_i" ]]; then
echo "No percent identity is specified, aborting script"
exit 1
fi

if [[ -z "$OPT_t" ]]; then
echo "No max_target_seqs is specified, aborting script"
exit 1
fi

if [[ -z "$OPT_p" ]]; then
echo "No maximum number of HSPs is specified, aborting script"
exit 1
fi

if [[ -z "$OPT_e" ]]; then
echo "No e-value is specified, aborting script"
exit 1
fi

##############################################################################
## ##
## STEP 1: SET UP ##
## ##
##############################################################################

# Load nextflow ------
module load BLAST+/2.10.1-IGB-gcc-8.2.0

echo "blast module loaded"


# Set working directory -------
cd ${OPT_d}
mkdir -p pipeline_testing

echo "output directory is set"


##############################################################################
## ##
## STEP 2: Blast run ##
## ##
##############################################################################

# blast to compare two sequences ---
echo "blast is starting"
start=`date +%s` # capture start time

news=$(echo $(basename ${OPT_q}) | sed 's/.fasta//g')
blastn \
-query ${OPT_q} \
-subject ${OPT_s} \
-outfmt "6 qseqid sseqid stitle pident \
length evalue qcovs bitscore mismatch \
gapopen qstart qend qlen sstart send slen" \
-perc_identity ${OPT_i} \
-max_target_seqs ${OPT_t} \
-max_hsps ${OPT_p} \
-evalue ${OPT_e} \
-out pipeline_testing/${news}_blast_testdata.0.txt

echo "blast has ended"

# Create headers for the blast file -----
echo -e "qseqid,sseqid,stitle,pident,length,evalue,qcovs,bitscore,mismatch,gapopen,qstart,qend, \
qlen,sstart,send,slen" | tr ',' '\t'| cat - pipeline_testing/${news}_blast_testdata.0.txt > pipeline_testing/${news}_blast_testdata.txt

rm pipeline_testing/${news}_blast_testdata.0.txt

echo "headers are added to blast output"

end=`date +%s`
runtime=$((end-start))
runtime=$( echo "scale=2;$((end-start)) / 60" | bc )
echo "It took $runtime minutes to run maker on ${OPT_s}"


62 changes: 62 additions & 0 deletions compared_counts_AFR.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash

#SBATCH --mem 10G
#SBATCH --job-name compare-test
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL
#SBATCH --mail-type ALL
#SBATCH -n 1
#SBATCH -N 1
#SBATCH -A h3abionet
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out



### HPCBio RefGraph Pipeline Testing Count: This compares test data set's long read insertions with our files; Created by Negin Valizadegan Feb 19, 2022; [email protected]


##############################################################################
## ##
## STEP 1: SET UP ##
## ##
##############################################################################

# Set working directory -------
cd /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/pipeline_testing

echo "output directory is set"


##############################################################################
## ##
## STEP 2: Count number of matches ##
## ##
##############################################################################


# Remove if the file exists -----
rm shared_sequences_pident100_qcovs100*.txt

# Count number of pident of 100 and qcov 100 for each file and put into a table ------
for i in *.txt
do
echo ${i} >> shared_sequences_pident100_qcovs100.0.txt
awk '($4 == 100)' ${i} | cut -f1 | uniq | wc -l >> shared_sequences_pident100_qcovs100.1.txt
awk '($4 == 100 && $7 == 100)' ${i} | cut -f1 | uniq | wc -l >> shared_sequences_pident100_qcovs100.2.txt
done

echo "numbers with pident == 100 and qcovs == 100 added"

# Put the two above into columns of a table -----
paste shared_sequences_pident100_qcovs100.0.txt shared_sequences_pident100_qcovs100.1.txt shared_sequences_pident100_qcovs100.2.txt | sort -r -k2,2 > shared_sequences_pident100_qcovs100.3.txt

# Create headers -----
echo -e "qseqid,pident100,pident100_qcovs100" | tr ',' '\t'| cat - shared_sequences_pident100_qcovs100.3.txt > shared_sequences_pident100_qcovs100.txt

# Remove extra files and view final file -----
rm shared_sequences_pident100_qcovs100.0.txt shared_sequences_pident100_qcovs100.1.txt shared_sequences_pident100_qcovs100.2.txt shared_sequences_pident100_qcovs100.3.txt
cat shared_sequences_pident100_qcovs100.txt





0 comments on commit eef94d0

Please sign in to comment.