Added in a VcfFuncotationFactory. (broadinstitute#4593)

* Added in a VcfFuncotationFactory. Fixes broadinstitute#4357 (in conjunction with a new data sources release)
cwhelan · May 25, 2018 · 10c59ec · 10c59ec
1 parent 585c69b
commit 10c59ec
Show file tree

Hide file tree

Showing 23 changed files with 1,346 additions and 125 deletions.
diff --git a/scripts/funcotator/data_sources/getDbSNP.sh b/scripts/funcotator/data_sources/getDbSNP.sh
@@ -0,0 +1,278 @@
+#!/usr/bin/env bash
+
+################################################################################
+
+# Creates a set of files that allows for creating annotations based off dbSNP
+# Pulled directly from the ncbi FTP site.
+
+################################################################################
+
+#Setup variables for the script:
+SCRIPTDIR="$( cd -P "$( dirname "$0" )" && pwd )"
+SCRIPTNAME=$( echo $0 | sed 's#.*/##g' )
+MINARGS=0
+MAXARGS=0
+
+FTP_BASE_URL='ftp://ftp.ncbi.nih.gov/snp/organisms/'
+BUILD_NUMBER='150'
+
+DATA_SOURCE_NAME="dbSNP"
+OUT_DIR_NAME='dbsnp'
+
+#SRC_FILE_BASE_NAME="All_"
+SRC_FILE_BASE_NAME="common_all_"
+
+SRC_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz\\s*\$"
+MD5_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz.md5\\s*\$"
+TBI_FILE_REGEX="${SRC_FILE_BASE_NAME}\\d+.vcf.gz.tbi\\s*\$"
+
+################################################################################
+
+function simpleUsage()
+{
+  echo -e "Usage: $SCRIPTNAME [OPTIONS] ..."
+  echo -e "Creates the data sources folder for dbSnp for the GATK Funcotator tool."
+}
+
+#Define a usage function:
+function usage()
+{
+  simpleUsage
+  echo -e ""
+  echo -e "Will download all data sources directly from the NCBI website:"
+  echo -e "    ${FTP_BASE_URL}"
+  echo -e ""
+  echo -e "Return values:"
+  echo -e "  0  NORMAL"
+  echo -e "  1  TOO MANY ARGUMENTS"
+  echo -e "  2  TOO FEW ARGUMENTS"
+  echo -e "  3  UNKNOWN ARGUMENT"
+  echo -e "  4  BAD CHECKSUM"
+  echo -e "  5  OUTPUT DIRECTORY ALREADY EXISTS"
+  echo -e ""
+}
+
+#Display a message to std error:
+function error()
+{
+  echo "$1" 2>&1
+}
+
+TMPFILELIST=''
+function makeTemp()
+{
+  local f
+  f=$( mktemp )
+  TMPFILELIST="${TMPFILELIST} $f"
+  echo $f
+}
+
+function cleanTempVars()
+{
+  rm -f ${TMPFILELIST}
+}
+
+function at_exit()
+{
+  cleanTempVars
+}
+
+################################################################################
+
+
+function createConfigFile() {
+
+    local dataSourceName=$1
+    local version=$2
+    local srcFile=$3
+    local originLocation=$4
+
+    echo "name = ${dataSourceName}"
+    echo "version = ${version}"
+    echo "src_file = ${srcFile}"
+    echo "origin_location = ${originLocation}"
+    echo "preprocessing_script = ${SCRIPTNAME}"
+    echo ""
+    echo "# Supported types:"
+    echo "# simpleXSV    -- Arbitrary separated value table (e.g. CSV), keyed off Gene Name OR Transcript ID"
+    echo "# locatableXSV -- Arbitrary separated value table (e.g. CSV), keyed off a genome location"
+    echo "# gencode      -- Custom datasource class for GENCODE"
+    echo "# cosmic       -- Custom datasource class for COSMIC"
+    echo "# vcf          -- Custom datasource class for Variant Call Format (VCF) files"
+    echo "type = vcf"
+    echo ""
+    echo "# Required field for GENCODE files."
+    echo "# Path to the FASTA file from which to load the sequences for GENCODE transcripts:"
+    echo "gencode_fasta_path ="
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# Valid values:"
+    echo "#     GENE_NAME"
+    echo "#     TRANSCRIPT_ID"
+    echo "xsv_key ="
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# The 0-based index of the column containing the key on which to match"
+    echo "xsv_key_column ="
+    echo ""
+    echo "# Required field for simpleXSV AND locatableXSV files."
+    echo "# The delimiter by which to split the XSV file into columns."
+    echo "xsv_delimiter ="
+    echo ""
+    echo "# Required field for simpleXSV files."
+    echo "# Whether to permissively match the number of columns in the header and data rows"
+    echo "# Valid values:"
+    echo "#     true"
+    echo "#     false"
+    echo "xsv_permissive_cols ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the contig for each row"
+    echo "contig_column ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the start position for each row"
+    echo "start_column ="
+    echo ""
+    echo "# Required field for locatableXSV files."
+    echo "# The 0-based index of the column containing the end position for each row"
+    echo "end_column ="
+    echo ""
+
+}
+
+function downloadAndVerifyVcfFiles() {
+
+    local remoteFolder=$1
+    local outputFolder=$2
+    local filePrefix=$3
+
+    local listingFile=$( makeTemp )
+    local indentSpace="    "
+    local version=$( echo "${remoteFolder}" | sed "s#.*human_\\(.*b${BUILD_NUMBER}\\).*#\\1#g" )
+
+    curl ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/ 2>/dev/null > ${listingFile}
+
+    vcfFile=$( cat ${listingFile} | awk '{print $9}' | grep -E ${SRC_FILE_REGEX} )
+    tbiFile=$( cat ${listingFile} | awk '{print $9}' | grep -E ${TBI_FILE_REGEX} )
+    md5File=$( cat ${listingFile} | awk '{print $9}' | grep -E ${MD5_FILE_REGEX} )
+
+    echo "${indentSpace}Retrieving MD5 sum file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File} ... "
+    wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File}
+
+    echo "${indentSpace}Retrieving VCF file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} ... "
+    wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}
+
+    echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
+    wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}
+
+    echo "${indentSpace}Verifying VCF checksum ..."
+    if [[ "$(uname)" == "Darwin" ]] ; then
+        which md5sum-lite &> /dev/null
+        r=$?
+        if [ $r == 0 ] ; then
+            checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
+            expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
+
+            if [[ "${checksum}" != "${expected}" ]] ; then
+                error "DOWNLOADED FILE IS CORRUPT!  (${checksum} != ${expected})"
+                error "FAILING"
+                exit 4
+            fi
+        else
+            error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'.  Use these data with caution."
+        fi
+    else
+        which md5sum &> /dev/null
+        r=$?
+        if [ $r == 0 ] ; then
+            checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
+            expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
+
+            if [[ "${checksum}" != "${expected}" ]] ; then
+                error "DOWNLOADED FILE IS CORRUPT!  (${checksum} != ${expected})"
+                error "FAILING"
+                exit 4
+            fi
+        else
+            error "Unable to validate md5sum of file: cannot locate 'md5sum'.  Use these data with caution."
+        fi
+    fi
+
+    # Now put it in the right place and clean up:
+    echo "${indentSpace}Creating output directory ..."
+    mkdir -p ${outputFolder}
+
+    echo "${indentSpace}Moving files to output directory ..."
+    mv ${vcfFile} ${outputFolder}/${filePrefix}_${vcfFile}
+    mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
+    rm ${md5File}
+
+    echo "${indentSpace}Creating Config File ... "
+    createConfigFile "${DATA_SOURCE_NAME}" "${version}" ${filePrefix}_${vcfFile} "ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}" > ${outputFolder}/${DATA_SOURCE_NAME}.config
+}
+
+################################################################################
+
+trap at_exit EXIT
+
+################################################################################
+
+#Check given arguments:
+if [[ $# -gt $MAXARGS ]] ; then
+  usage
+  exit 1
+elif [[ $# -lt $MINARGS ]] ; then
+  usage
+  exit 2
+fi
+
+################################################################################
+
+#Read args:
+while [ $# -gt 0 ] ; do
+
+  case "$1" in
+    -h|--h|--help|-help|help)
+      usage;
+      exit 0;
+      ;;
+    *)
+      error "${SCRIPTNAME}: invalid option: $1"
+      simpleUsage
+      echo "Try \`$SCRIPTNAME --help' for more information."
+      exit 3;
+    ;;
+  esac
+
+  #Get next argument in $1:
+  shift
+done
+
+################################################################################
+# Do the work here:
+
+# Make sure we don't have anything in our out folder already:
+if [[ -d ${OUT_DIR_NAME} ]] ; then
+    error "Output directory already exists: ${OUT_DIR_NAME} - aborting!"
+    exit 5
+fi
+
+echo "Querying NCBI listing..."
+tmpListing="$( makeTemp )"
+curl ftp://ftp.ncbi.nih.gov/snp/organisms/ 2>/dev/null > ${tmpListing}
+
+# Get the link for HG19:
+echo "Processing HG19 ..."
+hg19Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh37' | grep "b${BUILD_NUMBER}" )
+
+downloadAndVerifyVcfFiles ${hg19Version} ${OUT_DIR_NAME}/hg19 "hg19"
+
+
+# Get the link for HG38:
+echo "Processing HG38 ..."
+hg38Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh38' | grep "b${BUILD_NUMBER}" )
+
+downloadAndVerifyVcfFiles ${hg38Version} ${OUT_DIR_NAME}/hg38 "hg38"
+
+exit 0
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -21,7 +21,6 @@
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
 import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
-import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.transformers.ReadTransformer;
 import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
@@ -812,7 +811,7 @@ protected FeatureInput<? extends Feature> addFeatureInputsAfterInitialization(fi
                                                                                   final String name,
                                                                                   final Class<? extends Feature> featureType) {
 
-        final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(name + ":" + filePath);
+        final FeatureInput<? extends Feature> featureInput = new FeatureInput<>(name + FeatureInput.FEATURE_ARGUMENT_TAG_DELIMITER + filePath);
 
         //Add datasource to the feature manager too so that it can be queried. Setting lookahead to 0 to avoid caching.
         //Note: we are disabling lookahead here because of windowed queries that need to "look behind" as well.