Skip to content

Commit

Permalink
Added in a VcfFuncotationFactory. (broadinstitute#4593)
Browse files Browse the repository at this point in the history
* Added in a VcfFuncotationFactory.

Fixes broadinstitute#4357 

(in conjunction with a new data sources release)
  • Loading branch information
jonn-smith authored and cwhelan committed May 25, 2018
1 parent 585c69b commit 10c59ec
Show file tree
Hide file tree
Showing 23 changed files with 1,346 additions and 125 deletions.
278 changes: 278 additions & 0 deletions scripts/funcotator/data_sources/getDbSNP.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#!/usr/bin/env bash

################################################################################

# Creates a set of files that allows for creating annotations based off dbSNP
# Pulled directly from the ncbi FTP site.

################################################################################

#Setup variables for the script:
SCRIPTDIR="$( cd -P "$( dirname "$0" )" && pwd )"
SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
MINARGS=0
MAXARGS=0

FTP_BASE_URL='ftp://ftp.ncbi.nih.gov/snp/organisms/'
BUILD_NUMBER='150'

DATA_SOURCE_NAME="dbSNP"
OUT_DIR_NAME='dbsnp'

#SRC_FILE_BASE_NAME="All_"
SRC_FILE_BASE_NAME="common_all_"

SRC_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz\\s*\$"
MD5_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz.md5\\s*\$"
TBI_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz.tbi\\s*\$"

################################################################################

function simpleUsage()
{
echo -e "Usage: $SCRIPTNAME [OPTIONS] ..."
echo -e "Creates the data sources folder for dbSnp for the GATK Funcotator tool."
}

#Define a usage function:
function usage()
{
simpleUsage
echo -e ""
echo -e "Will download all data sources directly from the NCBI website:"
echo -e " ${FTP_BASE_URL}"
echo -e ""
echo -e "Return values:"
echo -e " 0 NORMAL"
echo -e " 1 TOO MANY ARGUMENTS"
echo -e " 2 TOO FEW ARGUMENTS"
echo -e " 3 UNKNOWN ARGUMENT"
echo -e " 4 BAD CHECKSUM"
echo -e " 5 OUTPUT DIRECTORY ALREADY EXISTS"
echo -e ""
}

#Display a message to std error:
function error()
{
echo "$1" 2>&1
}

TMPFILELIST=''
function makeTemp()
{
local f
f=$( mktemp )
TMPFILELIST="${TMPFILELIST} $f"
echo $f
}

function cleanTempVars()
{
rm -f ${TMPFILELIST}
}

function at_exit()
{
cleanTempVars
}

################################################################################


function createConfigFile() {

local dataSourceName=$1
local version=$2
local srcFile=$3
local originLocation=$4

echo "name = ${dataSourceName}"
echo "version = ${version}"
echo "src_file = ${srcFile}"
echo "origin_location = ${originLocation}"
echo "preprocessing_script = ${SCRIPTNAME}"
echo ""
echo "# Supported types:"
echo "# simpleXSV -- Arbitrary separated value table (e.g. CSV), keyed off Gene Name OR Transcript ID"
echo "# locatableXSV -- Arbitrary separated value table (e.g. CSV), keyed off a genome location"
echo "# gencode -- Custom datasource class for GENCODE"
echo "# cosmic -- Custom datasource class for COSMIC"
echo "# vcf -- Custom datasource class for Variant Call Format (VCF) files"
echo "type = vcf"
echo ""
echo "# Required field for GENCODE files."
echo "# Path to the FASTA file from which to load the sequences for GENCODE transcripts:"
echo "gencode_fasta_path ="
echo ""
echo "# Required field for simpleXSV files."
echo "# Valid values:"
echo "# GENE_NAME"
echo "# TRANSCRIPT_ID"
echo "xsv_key ="
echo ""
echo "# Required field for simpleXSV files."
echo "# The 0-based index of the column containing the key on which to match"
echo "xsv_key_column ="
echo ""
echo "# Required field for simpleXSV AND locatableXSV files."
echo "# The delimiter by which to split the XSV file into columns."
echo "xsv_delimiter ="
echo ""
echo "# Required field for simpleXSV files."
echo "# Whether to permissively match the number of columns in the header and data rows"
echo "# Valid values:"
echo "# true"
echo "# false"
echo "xsv_permissive_cols ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the contig for each row"
echo "contig_column ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the start position for each row"
echo "start_column ="
echo ""
echo "# Required field for locatableXSV files."
echo "# The 0-based index of the column containing the end position for each row"
echo "end_column ="
echo ""

}

function downloadAndVerifyVcfFiles() {

local remoteFolder=$1
local outputFolder=$2
local filePrefix=$3

local listingFile=$( makeTemp )
local indentSpace=" "
local version=$( echo "${remoteFolder}" | sed "s#.*human_\\(.*b${BUILD_NUMBER}\\).*#\\1#g" )

curl ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/ 2>/dev/null > ${listingFile}

vcfFile=$( cat ${listingFile} | awk '{print $9}' | grep -E ${SRC_FILE_REGEX} )
tbiFile=$( cat ${listingFile} | awk '{print $9}' | grep -E ${TBI_FILE_REGEX} )
md5File=$( cat ${listingFile} | awk '{print $9}' | grep -E ${MD5_FILE_REGEX} )

echo "${indentSpace}Retrieving MD5 sum file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File}

echo "${indentSpace}Retrieving VCF file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}

echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}

echo "${indentSpace}Verifying VCF checksum ..."
if [[ "$(uname)" == "Darwin" ]] ; then
which md5sum-lite &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution."
fi
else
which md5sum &> /dev/null
r=$?
if [ $r == 0 ] ; then
checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )

if [[ "${checksum}" != "${expected}" ]] ; then
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
error "FAILING"
exit 4
fi
else
error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution."
fi
fi

# Now put it in the right place and clean up:
echo "${indentSpace}Creating output directory ..."
mkdir -p ${outputFolder}

echo "${indentSpace}Moving files to output directory ..."
mv ${vcfFile} ${outputFolder}/${filePrefix}_${vcfFile}
mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
rm ${md5File}

echo "${indentSpace}Creating Config File ... "
createConfigFile "${DATA_SOURCE_NAME}" "${version}" ${filePrefix}_${vcfFile} "ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}" > ${outputFolder}/${DATA_SOURCE_NAME}.config
}

################################################################################

trap at_exit EXIT

################################################################################

#Check given arguments:
if [[ $# -gt $MAXARGS ]] ; then
usage
exit 1
elif [[ $# -lt $MINARGS ]] ; then
usage
exit 2
fi

################################################################################

#Read args:
while [ $# -gt 0 ] ; do

case "$1" in
-h|--h|--help|-help|help)
usage;
exit 0;
;;
*)
error "${SCRIPTNAME}: invalid option: $1"
simpleUsage
echo "Try \`$SCRIPTNAME --help' for more information."
exit 3;
;;
esac

#Get next argument in $1:
shift
done

################################################################################
# Do the work here:

# Make sure we don't have anything in our out folder already:
if [[ -d ${OUT_DIR_NAME} ]] ; then
error "Output directory already exists: ${OUT_DIR_NAME} - aborting!"
exit 5
fi

echo "Querying NCBI listing..."
tmpListing="$( makeTemp )"
curl ftp://ftp.ncbi.nih.gov/snp/organisms/ 2>/dev/null > ${tmpListing}

# Get the link for HG19:
echo "Processing HG19 ..."
hg19Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh37' | grep "b${BUILD_NUMBER}" )

downloadAndVerifyVcfFiles ${hg19Version} ${OUT_DIR_NAME}/hg19 "hg19"


# Get the link for HG38:
echo "Processing HG38 ..."
hg38Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh38' | grep "b${BUILD_NUMBER}" )

downloadAndVerifyVcfFiles ${hg38Version} ${OUT_DIR_NAME}/hg38 "hg38"

exit 0
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
Expand Down Expand Up @@ -812,7 +811,7 @@ protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(fi
final String name,
final Class<? extends Feature> featureType) {

final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(name + ":" + filePath);
final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(name + FeatureInput.FEATURE_ARGUMENT_TAG_DELIMITER + filePath);

//Add datasource to the feature manager too so that it can be queried. Setting lookahead to 0 to avoid caching.
//Note: we are disabling lookahead here because of windowed queries that need to "look behind" as well.
Expand Down
Loading

0 comments on commit 10c59ec

Please sign in to comment.