-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#36 Comparing test data with RefGraph UIUC Pipeline files at various …
…steps
- Loading branch information
Negin Valizadegan
committed
Feb 23, 2022
1 parent
cc16dc6
commit eef94d0
Showing
2 changed files
with
270 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
#!/bin/bash | ||
|
||
#SBATCH --mem 50G | ||
#SBATCH --job-name compare-test | ||
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL | ||
#SBATCH --mail-type ALL | ||
#SBATCH -n 2 | ||
#SBATCH -N 1 | ||
#SBATCH -A h3abionet | ||
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out | ||
|
||
|
||
|
||
### HPCBio: This compares test long read insertions with our files -----; Created by Negin Valizadegan Feb 19, 20222; [email protected] | ||
|
||
############################################################################## | ||
## ## | ||
## GENERAL WRAPPER RELATED SCRIPTS ## | ||
## ## | ||
############################################################################## | ||
|
||
# Set fancy fonts for the help message ------ | ||
NORM=`tput sgr0` | ||
BOLD=`tput bold` | ||
REV=`tput smso` | ||
|
||
# Help ------ | ||
function HELP { | ||
echo "" | ||
echo "${BOLD}Help Documentation for the HPCBio UIUC RefGraph (Testing) Pipeline${NORM}" | ||
echo "" | ||
echo "The Following Options Must Be Specified:" | ||
echo "${REV}-d${NORM} The full path to the results directory${NORM} (Required)" | ||
echo "${REV}-s${NORM} Path to the subject fasta file [file to be tested against] (Required)" | ||
echo "${REV}-q${NORM} Path to the query fasta file [file to be tested] (Required)" | ||
echo "${REV}-i${NORM} percentage of identity for blast (Required)" | ||
echo "${REV}-t${NORM} number of aligned sequences to keep in blast [max_target_seqs] (Required)" | ||
echo "${REV}-p${NORM} maximum number of HSPs (alignments) to keep for any single query-subject pair in blast [max_hsps] (Required)" | ||
echo "${REV}-e${NORM} e-value for blast (Required)" | ||
echo "${REV}-h${NORM} Displays this help message without complaints (Optional)" | ||
echo "" | ||
echo "[ ${NORM}${BOLD}Example:${NORM} sbatch blast_compare_AFR.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/ \ | ||
-s /home/groups/h3abionet/RefGraph/results/2022-01-30-HGSVCv2-truthset/data/insertions.fasta \ | ||
-q assembly/Final-Assembly/masurca/AFR/merged_sequences_AFR_Final_Assembly.fasta \ | ||
-i 100 \ | ||
-t 5 \ | ||
-p 10 \ | ||
-e 1e-5 ]" | ||
echo "" | ||
echo "${BOLD}Other Example Queries:${NORM}" | ||
echo "filter/SeqKit/masurca/AFR/merged_sequences_AFR_Seqkit_500.fasta;" | ||
echo "filter/BLAST_Contam/masurca/AFR/merged_masurca_kn_filtered.fasta;" | ||
echo "filter/BLAST_Contam/masurca/AFR/merged_masurca_blst_kn_filtered.fasta;" | ||
echo "filter/CD-HIT/masurca/AFR/merged_masurca_blst_kn_cdhit.fasta;" | ||
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_GRCH38_p0_filter.final.fasta;" | ||
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_GRCH38_decoys_hla_filter.final.fasta" | ||
echo "filter/Final-Filtered/masurca/AFR/merged_masurca_CHM13_filter.final.fasta;" | ||
echo "annotation/Merged_Reads/masurca/merged_sequences_GRCH38_p0.fasta;"m | ||
echo "annotation/Merged_Reads/masurca/merged_sequences_GRCH38_decoys.fasta" | ||
echo "annotation/Merged_Reads/masurca/merged_sequences_CHM13.fasta;" | ||
echo "annotation/Cluster_CDHIT/masurca/clustered_GRCH38_p0.fasta;" | ||
echo "annotation/Cluster_CDHIT/masurca/clustered_GRCH38_decoys.fasta" | ||
echo "annotation/Cluster_CDHIT/masurca/clustered_CHM13.fasta" | ||
echo "" | ||
exit 1 | ||
} | ||
|
||
|
||
# Check the number of arguments. If none are passed, print message and exit ------ | ||
NUMARGS=$# | ||
if [ $NUMARGS -eq 0 ]; then | ||
echo "" | ||
echo "You Did Not Pass Any Arguments. Please Specify the Arguments Below:" | ||
echo "" | ||
HELP | ||
fi | ||
|
||
|
||
# Parse the inputs | ||
while getopts :d:s:q:i:c:t:p:e:h FLAG; do | ||
case $FLAG in | ||
d) #set option "d" | ||
OPT_d=$OPTARG | ||
;; | ||
s) #set option "s" | ||
OPT_s=$OPTARG | ||
;; | ||
q) #set option "q" | ||
OPT_q=$OPTARG | ||
;; | ||
i) #set option "i" | ||
OPT_i=$OPTARG | ||
;; | ||
t) #set option "t" | ||
OPT_t=$OPTARG | ||
;; | ||
p) #set option "p" | ||
OPT_p=$OPTARG | ||
;; | ||
e) #set option "e" | ||
OPT_e=$OPTARG | ||
;; | ||
h) #set option "h" | ||
OPT_h=$OPTARG | ||
HELP | ||
;; | ||
\?) #unrecognized option - show help | ||
echo "Option -${BOLD}$OPTARG${NORM} not allowed." | ||
exit 1 | ||
;; | ||
esac | ||
done | ||
|
||
|
||
# Exit if necessary options are not passed ------ | ||
if [[ -z "$OPT_d" ]]; then | ||
echo "No project directory specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_s" ]]; then | ||
echo "No subject sequence path is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_q" ]]; then | ||
echo "No query sequence path is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_i" ]]; then | ||
echo "No percent identity is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_t" ]]; then | ||
echo "No max_target_seqs is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_p" ]]; then | ||
echo "No maximum number of HSPs is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
if [[ -z "$OPT_e" ]]; then | ||
echo "No e-value is specified, aborting script" | ||
exit 1 | ||
fi | ||
|
||
############################################################################## | ||
## ## | ||
## STEP 1: SET UP ## | ||
## ## | ||
############################################################################## | ||
|
||
# Load nextflow ------ | ||
module load BLAST+/2.10.1-IGB-gcc-8.2.0 | ||
|
||
echo "blast module loaded" | ||
|
||
|
||
# Set working directory ------- | ||
cd ${OPT_d} | ||
mkdir -p pipeline_testing | ||
|
||
echo "output directory is set" | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 2: Blast run ## | ||
## ## | ||
############################################################################## | ||
|
||
# blast to compare two sequences --- | ||
echo "blast is starting" | ||
start=`date +%s` # capture start time | ||
|
||
news=$(echo $(basename ${OPT_q}) | sed 's/.fasta//g') | ||
blastn \ | ||
-query ${OPT_q} \ | ||
-subject ${OPT_s} \ | ||
-outfmt "6 qseqid sseqid stitle pident \ | ||
length evalue qcovs bitscore mismatch \ | ||
gapopen qstart qend qlen sstart send slen" \ | ||
-perc_identity ${OPT_i} \ | ||
-max_target_seqs ${OPT_t} \ | ||
-max_hsps ${OPT_p} \ | ||
-evalue ${OPT_e} \ | ||
-out pipeline_testing/${news}_blast_testdata.0.txt | ||
|
||
echo "blast has ended" | ||
|
||
# Create headers for the blast file ----- | ||
echo -e "qseqid,sseqid,stitle,pident,length,evalue,qcovs,bitscore,mismatch,gapopen,qstart,qend, \ | ||
qlen,sstart,send,slen" | tr ',' '\t'| cat - pipeline_testing/${news}_blast_testdata.0.txt > pipeline_testing/${news}_blast_testdata.txt | ||
|
||
rm pipeline_testing/${news}_blast_testdata.0.txt | ||
|
||
echo "headers are added to blast output" | ||
|
||
end=`date +%s` | ||
runtime=$((end-start)) | ||
runtime=$( echo "scale=2;$((end-start)) / 60" | bc ) | ||
echo "It took $runtime minutes to run maker on ${OPT_s}" | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash | ||
|
||
#SBATCH --mem 10G | ||
#SBATCH --job-name compare-test | ||
#SBATCH --mail-user [email protected] ## CHANGE THIS TO YOUR EMAIL | ||
#SBATCH --mail-type ALL | ||
#SBATCH -n 1 | ||
#SBATCH -N 1 | ||
#SBATCH -A h3abionet | ||
#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out | ||
|
||
|
||
|
||
### HPCBio RefGraph Pipeline Testing Count: This compares test data set's long read insertions with our files; Created by Negin Valizadegan Feb 19, 2022; [email protected] | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 1: SET UP ## | ||
## ## | ||
############################################################################## | ||
|
||
# Set working directory ------- | ||
cd /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/pipeline_testing | ||
|
||
echo "output directory is set" | ||
|
||
|
||
############################################################################## | ||
## ## | ||
## STEP 2: Count number of matches ## | ||
## ## | ||
############################################################################## | ||
|
||
|
||
# Remove if the file exists ----- | ||
rm shared_sequences_pident100_qcovs100*.txt | ||
|
||
# Count number of pident of 100 and qcov 100 for each file and put into a table ------ | ||
for i in *.txt | ||
do | ||
echo ${i} >> shared_sequences_pident100_qcovs100.0.txt | ||
awk '($4 == 100)' ${i} | cut -f1 | uniq | wc -l >> shared_sequences_pident100_qcovs100.1.txt | ||
awk '($4 == 100 && $7 == 100)' ${i} | cut -f1 | uniq | wc -l >> shared_sequences_pident100_qcovs100.2.txt | ||
done | ||
|
||
echo "numbers with pident == 100 and qcovs == 100 added" | ||
|
||
# Put the two above into columns of a table ----- | ||
paste shared_sequences_pident100_qcovs100.0.txt shared_sequences_pident100_qcovs100.1.txt shared_sequences_pident100_qcovs100.2.txt | sort -r -k2,2 > shared_sequences_pident100_qcovs100.3.txt | ||
|
||
# Create headers ----- | ||
echo -e "qseqid,pident100,pident100_qcovs100" | tr ',' '\t'| cat - shared_sequences_pident100_qcovs100.3.txt > shared_sequences_pident100_qcovs100.txt | ||
|
||
# Remove extra files and view final file ----- | ||
rm shared_sequences_pident100_qcovs100.0.txt shared_sequences_pident100_qcovs100.1.txt shared_sequences_pident100_qcovs100.2.txt shared_sequences_pident100_qcovs100.3.txt | ||
cat shared_sequences_pident100_qcovs100.txt | ||
|
||
|
||
|
||
|
||
|