diff --git a/MAKER.sh b/MAKER.sh
new file mode 100644
index 0000000..bfc1e31
--- /dev/null
+++ b/MAKER.sh
@@ -0,0 +1,193 @@
+#!/bin/bash
+
+#SBATCH --mem 50G
+#SBATCH --job-name maker
+#SBATCH --mail-user valizad2@illinois.edu ## CHANGE THIS TO YOUR EMAIL
+#SBATCH --mail-type ALL
+#SBATCH -n 24
+#SBATCH -N 1
+#SBATCH -A h3abionet
+#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%j.out
+
+# HPCBio UIUC Gene Annotation pipeline (MAKER + EVidenceModeler); Created by Negin Valizadegan Jan 18, 2022; valizad2@illinois.edu
+# Usage: sbatch MAKER.sh -d <results_dir> -s <input_fasta_name> [-h]
+
+##############################################################################
+##                                                                          ##
+##                     GENERAL WRAPPER RELATED SCRIPTS                      ##
+##                                                                          ##
+##############################################################################
+
+# Set fancy fonts for the help message ------
+# tput fails when TERM is unset (common in batch jobs); fall back to plain text.
+NORM=$(tput sgr0 2>/dev/null || true)
+BOLD=$(tput bold 2>/dev/null || true)
+REV=$(tput smso 2>/dev/null || true)
+
+# Help ------
+# Prints the usage text and exits with status $1 (default 1, a usage error).
+function HELP {
+  echo ""
+  echo "${BOLD}Help Documentation for the HPCBio UIUC Annotation (MAKER) Pipeline${NORM}"
+  echo ""
+  echo "The Following Options Must Be Specified:"
+  echo "${REV}-d${NORM}  The full path to the main results directory${NORM} (Required)"
+  echo "${REV}-s${NORM}  The name of the input sequence (Required)"
+  echo "${REV}-h${NORM}  Displays this help message without complaints (Optional)"
+  echo ""
+  echo "[ ${NORM}${BOLD}Example:${NORM} sbatch MAKER.sh -d /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/results/ -s clustered_GRCH38_p0.fasta ]"
+  echo ""
+  exit "${1:-1}"
+}
+
+# Check the number of arguments. If none are passed, print message and exit ------
+NUMARGS=$#
+if [ "$NUMARGS" -eq 0 ]; then
+  echo ""
+  echo "You Did Not Pass Any Arguments. Please Specify the Arguments Below:"
+  echo ""
+  HELP
+fi
+
+# Parse the inputs
+while getopts :d:s:h FLAG; do
+  case $FLAG in
+    d) # full path to the main results directory
+      OPT_d=$OPTARG
+      ;;
+    s) # name of the input sequence file
+      OPT_s=$OPTARG
+      ;;
+    h) # help was requested explicitly, so exit successfully
+      HELP 0
+      ;;
+    :) # the leading ':' in the optstring reports a missing argument here
+      echo "Option -${BOLD}$OPTARG${NORM} requires an argument." >&2
+      exit 1
+      ;;
+    \?) # unrecognized option
+      echo "Option -${BOLD}$OPTARG${NORM} not allowed." >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Exit if necessary options are not passed ------
+if [[ -z "$OPT_d" ]]; then
+  echo "No project directory specified, aborting script" >&2
+  exit 1
+fi
+
+if [[ -z "$OPT_s" ]]; then
+  echo "No input sequence name is specified, aborting script" >&2
+  exit 1
+fi
+
+# Paths derived from the command-line options ------
+CTL_DIR="${OPT_d}/../HPCBio-Refgraph_pipeline"               # MAKER control files
+GENOME="${OPT_d}/annotation/Cluster_CDHIT/masurca/${OPT_s}"  # input sequence file
+
+##############################################################################
+##                                                                          ##
+##                           STEP 0: LOAD MODULES                           ##
+##                                                                          ##
+##############################################################################
+
+# Loads the MAKER module and checks that every input file is present.
+setup ()
+{
+
+# Load modules ------
+module load MAKER/3.01.03-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded || exit 1
+
+# The 3 control files needed for maker are edited manually, so they are only checked here.
+# To generate fresh templates: cd "${CTL_DIR}" && maker -CTL
+local ctl
+for ctl in maker_opts.ctl maker_bopts.ctl maker_exe.ctl; do
+  if [[ ! -f "${CTL_DIR}/${ctl}" ]]; then
+    echo "Missing MAKER control file: ${CTL_DIR}/${ctl}" >&2
+    exit 1
+  fi
+done
+
+if [[ ! -f "${GENOME}" ]]; then
+  echo "Missing input sequence file: ${GENOME}" >&2
+  exit 1
+fi
+
+echo "Using MAKER control files in ${CTL_DIR}"
+
+}
+
+##############################################################################
+##                                                                          ##
+##                            STEP 1: RUN MAKER                             ##
+##                                                                          ##
+##############################################################################
+
+# Runs MAKER under MPI on the input sequence and reports the elapsed minutes.
+# Named run_maker so the shell function cannot shadow the maker executable.
+run_maker ()
+{
+
+# Set working directory -----
+cd "${OPT_d}/annotation" || exit 1
+
+# Create output directory
+mkdir -p MAKER
+cd MAKER || exit 1
+
+echo "Working directory is set to $(pwd)"
+
+# Create a temp directory (must match TMP in maker_opts.ctl) ------
+mkdir -p /scratch/valizad2/maker # change valizad2 to your username
+
+start=$(date +%s) # capture start time
+echo "Start of maker annotation"
+
+#export AUGUSTUS_CONFIG_PATH=/home/n-z/valizad2/NeginV_Test_Summer2021/augustus/3.2.3-IGB-gcc-4.9.4/config
+#export PATH=$PATH:/home/n-z/valizad2/NeginV_Test_Summer2021/augustus/3.2.3-IGB-gcc-4.9.4/bin
+
+# Run maker ----- (-fix_nucleotides: This will change Ys to Ns)
+if ! mpiexec -n "${SLURM_NPROCS:-1}" maker \
+  "${CTL_DIR}/maker_opts.ctl" \
+  "${CTL_DIR}/maker_bopts.ctl" \
+  "${CTL_DIR}/maker_exe.ctl" \
+  -genome "${GENOME}" \
+  -fix_nucleotides; then
+  echo "Maker gene prediction FAILED for ${OPT_s}" >&2
+  exit 1
+fi
+
+echo "Maker gene prediction is completed for ${OPT_s}"
+
+end=$(date +%s)
+runtime=$( echo "scale=2;$((end-start)) / 60" | bc )
+echo "It took $runtime minutes to run maker on ${OPT_s}"
+
+}
+
+##############################################################################
+##                                                                          ##
+##                                   MAIN                                   ##
+##                                                                          ##
+##############################################################################
+
+# Main function runs each step/function of the pipeline separately so that
+# user can choose to run steps one at a time.
+
+main ()
+{
+  # Determine whether running full pipeline or single step
+  #runtype="PARTIAL"
+  runtype="FULL"
+  echo ""
+  echo "*** RUNNING ${runtype} ANNOTATION PIPELINE ***"
+
+  setup
+  run_maker
+
+}
+
+# Run main function
+main
\ No newline at end of file
diff --git a/_Inline/.lock b/_Inline/.lock new file mode 100644 index 0000000..e69de29 diff --git a/_Inline/config-x86_64-linux-5.026001 b/_Inline/config-x86_64-linux-5.026001 new file mode 100644 index 0000000..b9be61b --- /dev/null +++ b/_Inline/config-x86_64-linux-5.026001 @@ -0,0 +1,14 @@ +version : 0.80 +languages : % + C : C + Foo : Foo + foo : Foo +types : % + C : compiled + Foo : interpreted +modules : % + C : Inline::C + Foo : Inline::Foo +suffixes : % + C : so + Foo : foo diff --git a/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.inl b/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.inl new file mode 100644 index 0000000..eec63e4 --- /dev/null +++ b/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.inl @@ -0,0 +1,22 @@ +md5 : 168b5562b2d3d613c6ee4dee4c45c915 +name : Bio::DB::IndexedBase_168b +version : "" +language : C +language_id : C +installed : 0 +date_compiled : Wed Jan 19 11:50:34 2022 +inline_version : 0.80 +ILSM : % + module : Inline::C + suffix : so + type : compiled +Config : % + apiversion : ?
+ archname : x86_64-linux + cc : gcc + ccflags : -O2 -march=x86-64 -mtune=generic -fPIC -fwrapv -fno-strict-aliasing -pipe -fstack-protector-strong -I/usr/local/include -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_FORTIFY_SOURCE=2 + ld : gcc + osname : linux + osvers : 3.10.0-514.21.1.el7.x86_64 + so : so + version : 5.26.1 diff --git a/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.so b/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.so new file mode 100755 index 0000000..ce666f9 Binary files /dev/null and b/_Inline/lib/auto/Bio/DB/IndexedBase_168b/IndexedBase_168b.so differ diff --git a/annotation-config.conf b/annotation-config.conf index 9ebd768..1ccc975 100644 --- a/annotation-config.conf +++ b/annotation-config.conf @@ -8,7 +8,10 @@ params { genome1 = "./GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa" genome2 = "./GRCh38.p0/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" genome3 = "./CHM13.v1.1_GRCh38.p13.chrY/CHM13.v1.1_GRCh38.p13.chrY.fna" - samplePath = "./results/filter/Final-Filtered/masurca/*_filter.final.fasta" + samplePath = "./results/filter/Final-Filtered/masurca/test/*_filter.final.fasta" + samplePath1 = "./results/filter/Final-Filtered/masurca/test/*_GRCH38_decoys_hla_filter.final.fasta" + samplePath2 = "./results/filter/Final-Filtered/masurca/test/*_GRCH38_p0_filter.final.fasta" + samplePath3 = "./results/filter/Final-Filtered/masurca/test/*_CHM13_filter.final.fasta" myQueue = "normal" clusterAcct = " -A h3abionet " } diff --git a/annotation-run.sh b/annotation-run.sh new file mode 100644 index 0000000..a928bae --- /dev/null +++ b/annotation-run.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +#SBATCH --mem 18G +#SBATCH --job-name annotation +#SBATCH --mail-user valizad2@illinois.edu ## CHANGE THIS TO YOUR EMAIL +#SBATCH --mail-type ALL +#SBATCH -n 2 +#SBATCH -N 1 +#SBATCH -A h3abionet +#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%A.out + +### This Runs Nextflow Annotation UIUC 
pipeline +## Date File Created: Dec 5, 2021 + + +# Set working directory ------- +cd /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021 + +# Load nextflow ------ +module load nextflow/21.04.1-Java-1.8.0_152 + +# Run nextflow UIUC workflow ----- +nextflow run HPCBio-Refgraph_pipeline/annotation.nf \ +-c HPCBio-Refgraph_pipeline/annotation-config.conf \ +-qs 3 -resume \ +-with-report nextflow_reports/nf_report.html \ +-with-timeline nextflow_reports/nf_timeline.html \ +-with-trace nextflow_reports/nf_trace.txt + +# -log custom.log #add this for log not hidden +# -q # Disable the printing of information to the terminal. + +# -with-report nf_exec_report_annotation.html \ +# -with-timeline nf_timeline_annotation.html \ +# -with-trace > nf_trace_annotation.txt \ # this is the same as slurm output, if you use this, slurm output will be empty +# -with-dag nf_flowchart_annotation.pdf + +#if [ echo "wc -l ${keep}" == echo "grep -E ">" ${id}_kn_filtered.fasta | wc -l" ] + # then + # echo "The filtering has not been done correctly. Please check your blastncontam script" + # fi + + + \ No newline at end of file diff --git a/annotation.nf b/annotation.nf index d612613..c0bfae4 100644 --- a/annotation.nf +++ b/annotation.nf @@ -8,10 +8,10 @@ */ /*Parameters that are specified at the command line or via config file*/ -params.genome1 = false /*genome fasta file GRCh38, must specify complete path. Required parameter*/ -params.genome2 = false /*genome fasta file GRCh38.p0, must specify complete path. Required parameter*/ -params.genome3 = false /*genome fasta file, CHM13, must specify complete path. Required parameter*/ params.samplePath = false /*input folder, must specify complete path. Required parameter*/ +params.samplePath1 = false /*input fasta files path for GRCh38 + decoy + alt, must specify complete path. Required parameter*/ +params.samplePath2 = false /*input fasta files path for GRCh38.p0, must specify complete path. 
Required parameter*/ +params.samplePath3 = false /*input fasta files path for CHM13, must specify complete path. Required parameter*/ /*Parameters to be used inside the pipeline */ params.outputDir = "./results" /*output folder, must specify path from current directory. Required parameter*/ @@ -34,24 +34,11 @@ defaultCPU = '9' defaultMemory = '120' params.clusterAcct = " -A h3bionet " -/*Prepare input*/ -genome_file1 = file(params.genome1) -genome_file2 = file(params.genome2) -genome_file3 = file(params.genome3) -genomeStore1 = genome_file1.getParent() -genomeStore2 = genome_file2.getParent() -genomeStore3 = genome_file3.getParent() - -// Sanity checks -if( !genome_file1.exists() ) exit 1, "Missing reference genome file: ${genome_file1}" -if( !genome_file2.exists() ) exit 1, "Missing reference genome file: ${genome_file2}" -if( !genome_file3.exists() ) exit 1, "Missing reference genome file: ${genome_file3}" -//if( params.assembler != "megahit" || params.assembler != "masurca" ) exit 1, "Unknown assembler: ${params.assembler}" - /* Create channcel for input files */ -filtered_fasta_Ch = Channel.fromFilePairs("${params.samplePath}", size: 1) -filtered_fasta_Ch2 = Channel.fromFilePairs("${params.samplePath}", size: 1) -filtered_fasta_Ch3 = Channel.fromFilePairs("${params.samplePath}", size: 1) +Channel.fromFilePairs("${params.samplePath}", size: 1).into { filtered_Ch;filtered_Ch2 } +filtered_GRCH38_decoys_Ch3 = Channel.fromFilePairs("${params.samplePath1}", size: 1) +filtered_GRCH38_p0_Ch3 = Channel.fromFilePairs("${params.samplePath2}", size: 1) +filtered_CHM13_Ch3 = Channel.fromFilePairs("${params.samplePath3}", size: 1) /* STEP 1: RUN REPEAT MASKER ON FILTERED READS @@ -67,7 +54,7 @@ process repeatmasker { publishDir "${resultsPath}/RepeatMasker/${params.assembler}/",mode:"copy" input: - tuple val(id), file(fasta) from filtered_fasta_Ch + tuple val(id), file(fasta) from filtered_Ch output: tuple val(id), file('*.fasta.cat') @@ -97,7 +84,7 @@ process quast { 
publishDir "${resultsPath}/QUAST/${id}/",mode:"copy" input: - tuple val(id), file(fasta2) from filtered_fasta_Ch2 + tuple val(id), file(fasta2) from filtered_Ch2 output: file '*' @@ -124,15 +111,21 @@ process merge_reads { publishDir "${resultsPath}/Merged_Reads/${params.assembler}/",mode:"copy" input: - tuple val(id), file(fasta3) from filtered_fasta_Ch3 + tuple val(id), file(fasta3) from filtered_GRCH38_decoys_Ch3 + tuple val(id), file(fasta33) from filtered_GRCH38_p0_Ch3 + tuple val(id), file(fasta333) from filtered_CHM13_Ch3 output: - file('merged_sequences.fasta') into merged_seqs + file('merged_sequences_GRCH38_decoys.fasta') into merged_seqs_GRCH38_decoys + file('merged_sequences_GRCH38_p0.fasta') into merged_seqs_GRCH38_p0 + file('merged_sequences_CHM13.fasta') into merged_seqs_CHM13 script: """ # Combine all sequences ------ - cat ${fasta3} >> merged_sequences.fasta + cat ${fasta3} >> merged_sequences_GRCH38_decoys.fasta + cat ${fasta33} >> merged_sequences_GRCH38_p0.fasta + cat ${fasta333} >> merged_sequences_CHM13.fasta """ } @@ -151,17 +144,39 @@ process cdhit { publishDir "${resultsPath}/Cluster_CDHIT/${params.assembler}/",mode:"copy" input: - file(merged) from merged_seqs + file merged_GRCH38_decoys from merged_seqs_GRCH38_decoys + file merged_GRCH38_p0 from merged_seqs_GRCH38_p0 + file merged_CHM13 from merged_seqs_CHM13 output: - file('clustered.fasta') into clustered + file('clustered_GRCH38_decoys.fasta') into clustered_GRCH38_decoys + file('clustered_GRCH38_p0.fasta') into clustered_GRCH38_p0 + file('clustered_CHM13.fasta') into clustered_CHM13 script: """ # Use cd-hit to cluster and remove redundancy ------ + + # GRCH38_decoys ----- + cd-hit-est \ + -i ${merged_GRCH38_decoys} \ + -o clustered_GRCH38_decoys.fasta \ + -c ${params.cdhit_identity} \ + -n ${params.cdhit_wordsize} \ + -T ${task.cpus} + + # GRCH38_p0 ----- + cd-hit-est \ + -i ${merged_GRCH38_p0} \ + -o clustered_GRCH38_p0.fasta \ + -c ${params.cdhit_identity} \ + -n 
${params.cdhit_wordsize} \ + -T ${task.cpus} + + # CHM13 ----- cd-hit-est \ - -i ${merged} \ - -o clustered.fasta \ + -i ${merged_CHM13} \ + -o clustered_CHM13.fasta \ -c ${params.cdhit_identity} \ -n ${params.cdhit_wordsize} \ -T ${task.cpus}
diff --git a/assembly-run.sh b/assembly-run.sh
new file mode 100644
index 0000000..805ba9e
--- /dev/null
+++ b/assembly-run.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+#SBATCH --mem 18G
+#SBATCH --job-name assembly
+#SBATCH --mail-user valizad2@illinois.edu ## CHANGE THIS TO YOUR EMAIL
+#SBATCH --mail-type ALL
+#SBATCH --output slurm-%j.out
+#SBATCH -n 2
+#SBATCH -N 1
+#SBATCH -A h3abionet
+
+
+### This Runs Nextflow assembly UIUC pipeline
+## Date File Created: July 28th, 2021
+
+
+# Set working directory -------
+cd /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021 || exit 1
+
+# Load nextflow ------
+module load nextflow/21.04.1-Java-1.8.0_152
+
+# Run nextflow UIUC workflow -----
+# -log custom.log #add this for log not hidden
+# -q # Disable the printing of information to the terminal.
+#
+# Put stderr and stdout into a log ------
+# The redirection must be part of the nextflow command itself; on a line of
+# its own, tee reads empty input and assembly.log ends up empty.
+nextflow run HPCBio-Refgraph_pipeline/assemble.nf \
+  -c HPCBio-Refgraph_pipeline/test-config.conf \
+  -qs 1 -resume \
+  2>&1 | tee ./assembly.log
+
+# Exit with nextflow's status rather than tee's so SLURM sees failures.
+exit "${PIPESTATUS[0]}"
diff --git a/filter-run.sh b/filter-run.sh new file mode 100644 index 0000000..45faf44 --- /dev/null +++ b/filter-run.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +#SBATCH --mem 18G +#SBATCH --job-name filtering +#SBATCH --mail-user valizad2@illinois.edu ## CHANGE THIS TO YOUR EMAIL +#SBATCH --mail-type ALL +#SBATCH -n 2 +#SBATCH -N 1 +#SBATCH -A h3abionet +#SBATCH -o /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021/slurm_output/slurm-%A.out + +### This Runs Nextflow Filtering UIUC pipeline +## Date File Created: Nov 1, 2021 + + +# Set working directory ------- +cd /home/groups/h3abionet/RefGraph/results/NeginV_Test_Summer2021 + +# Load nextflow ------ +module load nextflow/21.04.1-Java-1.8.0_152 + +# Run nextflow UIUC workflow ----- +nextflow run HPCBio-Refgraph_pipeline/filter.nf \ +-c HPCBio-Refgraph_pipeline/filter-config.conf \ +-qs 3 -resume \ +-with-report nextflow_reports/nf_report.html \ + -with-timeline nextflow_reports/nf_timeline.html \ + -with-trace nextflow_reports/nf_trace.txt + +# -log custom.log #add this for log not hidden +# -q # Disable the printing of information to the terminal. + +# -with-report nf_exec_report_annotation.html \ +# -with-timeline nf_timeline_annotation.html \ +# -with-trace > nf_trace_annotation.txt \ # this is the same as slurm output, if you use this, slurm output will be empty +# -with-dag nf_flowchart_annotation.pdf + + + +#if [ echo "wc -l ${keep}" == echo "grep -E ">" ${id}_kn_filtered.fasta | wc -l" ] + # then + # echo "The filtering has not been done correctly. Please check your blastncontam script" + # fi + + + \ No newline at end of file diff --git a/filter.nf b/filter.nf index b95c6cf..24894b5 100644 --- a/filter.nf +++ b/filter.nf @@ -29,7 +29,7 @@ params.max_target_seqs = '5' /*number of aligned sequences to params.max_hsps = '10' /*maximum number of HSPs (alignments) to keep for any single query-subject pair in blast.
Default is 10*/ params.evalue = '1e-5' /*expect value (E) for saving hits in blast. Default is 1e-5*/ params.blastnt_pident = '60' /*percentage of identical matches in blast NT. Default is 60*/ -params.blastr_pident = '90' /*percentage of identical matches in blast huma reference. Default is 90*/ +params.blastr_pident = '90' /*percentage of identical matches in blast human reference. Default is 90*/ params.blastnt_filter_pident = '60' /*filtering cut off for percentage of identical matches from blast NT. Default is 60*/ params.blastnt_filter_length = '100' /*filtering cut off for alignment length from blast NT. Default is 100*/ params.blastr_filter_pident = '95' /*filtering cut off for percentage of identical matches from blast ref genome. Default is 95*/ @@ -95,6 +95,8 @@ process blastdbGRCh38 { script: """ + start=`date +%s` # capture start time + echo "Start of makeblastdb for GRCh38" makeblastdb -in ${genome1} -parse_seqids -title "GRCh38.decoy.hla" -dbtype nucl """ @@ -395,7 +397,6 @@ process blastref { -max_target_seqs ${params.max_target_seqs} \ -max_hsps ${params.max_hsps} \ -evalue ${params.evalue} \ - -perc_identity ${params.blastr_pident} \ -num_threads ${task.cpus} diff --git a/filter.sh b/filter.sh index 1158751..d6456d4 100644 --- a/filter.sh +++ b/filter.sh @@ -89,7 +89,7 @@ startall=`date +%s` # record start time ############################################################################## ## ## -## STEP 1: LOAD THE NECESSARY MODULES ## +## STEP 1: SET UP ## ## ## ############################################################################## diff --git a/maker_bopts.ctl b/maker_bopts.ctl new file mode 100644 index 0000000..fd9e757 --- /dev/null +++ b/maker_bopts.ctl @@ -0,0 +1,29 @@ +#-----BLAST and Exonerate Statistics Thresholds +blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast' +use_rapsearch=0 #use rapsearch instead of blastx, 1 = yes, 0 = no + +pcov_blastn=0.8 #Blastn Percent Coverage Threhold EST-Genome Alignments +pid_blastn=0.85 #Blastn 
Percent Identity Threshold EST-Genome Aligments +eval_blastn=1e-10 #Blastn eval cutoff +bit_blastn=40 #Blastn bit cutoff +depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff) + +pcov_blastx=0.5 #Blastx Percent Coverage Threhold Protein-Genome Alignments +pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Aligments +eval_blastx=1e-06 #Blastx eval cutoff +bit_blastx=30 #Blastx bit cutoff +depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff) + +pcov_tblastx=0.8 #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments +pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Aligments +eval_tblastx=1e-10 #tBlastx eval cutoff +bit_tblastx=40 #tBlastx bit cutoff +depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff) + +pcov_rm_blastx=0.5 #Blastx Percent Coverage Threhold For Transposable Element Masking +pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposbale Element Masking +eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking +bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking + +ep_score_limit=20 #Exonerate protein percent of maximal score threshold +en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold diff --git a/maker_evm.ctl b/maker_evm.ctl new file mode 100644 index 0000000..b0027d9 --- /dev/null +++ b/maker_evm.ctl @@ -0,0 +1,18 @@ +#-----Transcript weights +evmtrans=10 #default weight for source unspecified est/alt_est alignments +evmtrans:blastn=0 #weight for blastn sourced alignments +evmtrans:est2genome=10 #weight for est2genome sourced alignments +evmtrans:tblastx=0 #weight for tblastx sourced alignments +evmtrans:cdna2genome=7 #weight for cdna2genome sourced alignments + +#-----Protein weights +evmprot=10 #default weight for source unspecified protein alignments +evmprot:blastx=2 #weight for blastx sourced alignments +evmprot:protein2genome=10 #weight for protein2genome sourced alignments + +#-----Abinitio Prediction weights +evmab=10 
#default weight for source unspecified ab initio predictions +evmab:snap=10 #weight for snap sourced predictions +evmab:augustus=10 #weight for augustus sourced predictions +evmab:fgenesh=10 #weight for fgenesh sourced predictions +evmab:genemark=7 #weight for genemark sourced predictions diff --git a/maker_exe.ctl b/maker_exe.ctl new file mode 100644 index 0000000..a772458 --- /dev/null +++ b/maker_exe.ctl @@ -0,0 +1,26 @@ +#-----Location of Executables Used by MAKER/EVALUATOR +makeblastdb=/home/apps/software/RMBlast/2.6.0-IGB-gcc-4.9.4/bin/makeblastdb #location of NCBI+ makeblastdb executable +blastn=/home/apps/software/RMBlast/2.6.0-IGB-gcc-4.9.4/bin/blastn #location of NCBI+ blastn executable +blastx=/home/apps/software/RMBlast/2.6.0-IGB-gcc-4.9.4/bin/blastx #location of NCBI+ blastx executable +tblastx=/home/apps/software/RMBlast/2.6.0-IGB-gcc-4.9.4/bin/tblastx #location of NCBI+ tblastx executable +formatdb= #location of NCBI formatdb executable +blastall= #location of NCBI blastall executable +xdformat= #location of WUBLAST xdformat executable +blasta= #location of WUBLAST blasta executable +prerapsearch= #location of prerapsearch executable +rapsearch= #location of rapsearch executable +RepeatMasker=/home/apps/software/RepeatMasker/4.0.7-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/RepeatMasker #location of RepeatMasker executable +exonerate=/home/apps/software/exonerate/2.2.0-IGB-gcc-4.9.4/bin/exonerate #location of exonerate executable + +#-----Ab-initio Gene Prediction Algorithms +snap=/home/apps/software/SNAP/2013-11-29-IGB-gcc-4.9.4/snap #location of snap executable +gmhmme3=/home/apps/software/GeneMark-ES/4.33-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/gmhmme3 #location of eukaryotic genemark executable +gmhmmp=/home/apps/software/GeneMarkS/4.30-IGB-gcc-4.9.4/gmhmmp #location of prokaryotic genemark executable +augustus=/home/n-z/valizad2/NeginV_Test_Summer2021/augustus/3.2.3-IGB-gcc-4.9.4/bin/augustus #location of augustus executable +fgenesh= #location of fgenesh 
executable +evm=/home/apps/software/EVidenceModeler/1.1.1-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/evidence_modeler.pl #location of EvidenceModeler executable +tRNAscan-SE=/home/apps/software/tRNAscan-SE/1.3.1-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/bin/tRNAscan-SE #location of trnascan executable +snoscan=/home/apps/software/snoscan/0.9.1-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/snoscan #location of snoscan executable + +#-----Other Algorithms +probuild=/home/apps/software/GeneMark-ES/4.33-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/probuild #location of probuild executable (required for genemark) diff --git a/maker_opts.ctl b/maker_opts.ctl new file mode 100644 index 0000000..256156c --- /dev/null +++ b/maker_opts.ctl @@ -0,0 +1,78 @@ +#-----Genome (these are always required) +genome= #genome sequence (fasta file or fasta embeded in GFF3 file) +organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic + +#-----Re-annotation Using MAKER Derived GFF3 +maker_gff= #MAKER derived GFF3 file +est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no +altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no +protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no +rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no +model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no +pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no +other_pass=0 #passthrough anyything else in maker_gff: 1 = yes, 0 = no + +#-----EST Evidence (for best results provide a file for at least one) +est=/home/n-z/valizad2/NeginV_Test_Summer2021/human_maker_inputs/human-est.fa,/home/n-z/valizad2/NeginV_Test_Summer2021/human_maker_inputs/human-mrna.fa #set of ESTs or assembled mRNA-seq in fasta format +altest= #EST/cDNA sequence file in fasta format from an alternate organism +est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file +altest_gff= #aligned ESTs from a closly relate species in GFF3 format + +#-----Protein Homology Evidence (for best results 
provide a file for at least one) +protein=/home/n-z/valizad2/NeginV_Test_Summer2021/human_maker_inputs/human-protein.fa #protein sequence file in fasta format (i.e. from mutiple organisms) +protein_gff= #aligned protein homology evidence from an external GFF3 file + +#-----Repeat Masking (leave values blank to skip repeat masking) +model_org=Human #select a model organism for RepBase masking in RepeatMasker +rmlib= #provide an organism specific repeat library in fasta format for RepeatMasker +repeat_protein=/home/apps/software/MAKER/3.01.03-IGB-gcc-4.9.4-Perl-5.26.1-unthreaded/data/te_proteins.fasta #provide a fasta file of transposable element proteins for RepeatRunner +rm_gff= #pre-identified repeat elements from an external GFF3 file +prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no +softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering) + +#-----Gene Prediction +snaphmm=/home/apps/software/SNAP/2013-11-29-IGB-gcc-4.9.4/HMM/mam54-ro.hmm #SNAP HMM file +gmhmm= #GeneMark HMM file +augustus_species=human #Augustus gene prediction species model +fgenesh_par_file= #FGENESH parameter file +pred_gff= #ab-initio predictions from an external GFF3 file +model_gff= #annotated gene models from an external GFF3 file (annotation pass-through) +run_evm=1 #run EvidenceModeler, 1 = yes, 0 = no +est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no +protein2genome=1 #infer predictions from protein homology, 1 = yes, 0 = no +trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no +snoscan_rrna= #rRNA file to have Snoscan find snoRNAs +snoscan_meth= #-O-methylation site fileto have Snoscan find snoRNAs +unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no +allow_overlap= #allowed gene overlap fraction (value from 0 to 1, blank for default) + +#-----Other Annotation Feature Types (features MAKER doesn't recognize) +other_gff= #extra features to 
pass-through to final MAKER generated GFF3 file + +#-----External Application Behavior Options +alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases +cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI) + +#-----MAKER Behavior Options +max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage) +min_contig=1 #skip genome contigs below this length (under 10kb are often useless) + +pred_flank=200 #flank for extending evidence clusters sent to gene predictors +pred_stats=0 #report AED and QI statistics for all predictions as well as models +AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1) +min_protein=0 #require at least this many amino acids in predicted proteins +alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no +always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no +map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no +keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1) + +split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments) +min_intron=20 #minimum intron length (used for alignment polishing) +single_exon=1 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no +single_length=250 #min length required for single exon ESTs if 'single_exon is enabled' +correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes + +tries=5 #number of times to try a contig if there is a failure for some reason +clean_try=1 #remove all data from previous run before retrying, 1 = yes, 0 = no +clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no +TMP=/scratch/valizad2/maker #specify a directory other than the system default temporary directory for temporary files