broadinstitute · jonn-smith · May 29, 2018 · May 15, 2018 · May 29, 2018 · LeeTL1220
diff --git a/scripts/funcotator/data_sources/getDnaRepairGenes.py b/scripts/funcotator/data_sources/getDnaRepairGenes.py
@@ -53,23 +53,35 @@
         writer = csv.writer(f, delimiter='|', lineterminator="\n")
 
         isFirstRow = True
+        headerRow = []
+        prevRow = []
 
         for row in dna_repair_table:
 
-            strippedRow = map(lambda x: x.strip(), row)
+            # Strip leading/trailing whitespace and replace newlines:
+            strippedRow = map(lambda x: x.strip().replace('\n', ' '), row)
 
             # Need to fix the first header:
             if isFirstRow:
                 strippedRow[0] = "Gene Name"
+                headerRow = strippedRow
                 isFirstRow = False
 
             # Skip empty rows:
             if len(filter(lambda x: x is not '', strippedRow)) == 0:
                 continue
+
             # Skip section header rows:
             if len(filter(lambda x: x.lower() == 'top of page', strippedRow)) >= 1:
                 continue
-
+
+            # If we have fewer fields than our header row, we have to grab the
+            # "Activity" from the previous row (due to how the table parser works).
+            # "Activity" is the second column.
+            if len(strippedRow) < len(headerRow):
+                strippedRow.insert(1, prevRow[1])
+
+            prevRow = strippedRow
             writer.writerow(strippedRow)
 
     print 'DONE!'
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorUtils.java
@@ -327,15 +327,20 @@ public static int getAlignedEndPosition(final int alleleEndPosition) {
     /**
      * Gets the sequence aligned position (1-based, inclusive) for the given coding sequence position.
      * This will produce the next lowest position evenly divisible by 3, such that a codon starting at this returned
-     * position would include the given position.
-     * @param position A sequence starting coordinate for which to produce an coding-aligned position.  Must be > 0.
+     * position would include the given position.  This can be a negative number, in which case the codon would start
+     * at the given position relative to the normal starting position (1).
+     * @param position A sequence starting coordinate for which to produce an coding-aligned position.
      * @return A coding-aligned position (1-based, inclusive) corresponding to the given {@code position}.
      */
     public static int getAlignedPosition(final int position) {
 
-        ParamUtils.isPositive( position, "Genomic positions must be > 0." );
-
-        return position - ((position - 1) % 3);
+        if ( position > 0 ) {
+            return position - ((position - 1) % 3);
+        }
+        else {
+            final int adjustedPos = 1 - position;
+            return -(adjustedPos - ((adjustedPos - 1) % 3) + 1);
+        }
     }
 
     /**
@@ -346,10 +351,21 @@ public static int getAlignedPosition(final int position) {
      */
     public static boolean isInFrameWithEndOfRegion(final int startPosition, final int regionLength) {
 
-        ParamUtils.isPositive( startPosition, "Genomic positions must be > 0." );
         ParamUtils.isPositiveOrZero( regionLength, "Region length must be >= 0." );
 
-        return (((regionLength - startPosition + 1) % 3) == 0);
+        // Micro-optimization to split the return statements based on
+        // the if statement so we don't have to do unnecessary math in the "normal" case:
+        if ( startPosition > 0 ) {
+            return (((regionLength - startPosition + 1) % 3) == 0);
+        }
+        else {
+            // If we have a position before our region starts, we must calculate an offset
+            // between the position and the actual start.
+            // We can then add this offset to the start and region length to simplify the calculation.
+            final int preFlankOffset = 1 - startPosition;
+
+            return ((((regionLength + preFlankOffset) - (startPosition + preFlankOffset) + 1) % 3) == 0);
+        }
     }
 
     /**
@@ -1387,11 +1403,12 @@ public static String getAlignedCodingSequenceAllele(final String codingSequence,
 
     /**
      * Get the aligned coding sequence for the given reference allele.
+     * NOTE: alignedRefAlleleStart must be less than or equal to codingSequenceRefAlleleStart.
      * @param referenceSnippet {@link String} containing a short excerpt of the reference sequence around the given reference allele.  Must not be {@code null}.
      * @param referencePadding Number of bases in {@code referenceSnippet} before the reference allele starts.  This padding exists at the end as well (plus some other bases to account for the length of the alternate allele if it is longer than the reference).  Must be >= 0.
      * @param refAllele The reference {@link Allele}.  Must not be {@code null}.
-     * @param codingSequenceRefAlleleStart The position (1-based, inclusive) in the coding sequence where the {@code refAllele} starts.  Must be > 0.
-     * @param alignedRefAlleleStart The in-frame position (1-based, inclusive) of the first base of the codon containing the reference allele.  Must be > 0.
+     * @param codingSequenceRefAlleleStart The position (1-based, inclusive) in the coding sequence where the {@code refAllele} starts.
+     * @param alignedRefAlleleStart The in-frame position (1-based, inclusive) of the first base of the codon containing the reference allele.
      * @return A {@link String} of in-frame codons that contain the entire reference allele.
      */
     public static String getAlignedRefAllele(final String referenceSnippet,
@@ -1402,14 +1419,14 @@ public static String getAlignedRefAllele(final String referenceSnippet,
 
         Utils.nonNull(referenceSnippet);
         Utils.nonNull(refAllele);
-
         ParamUtils.isPositiveOrZero( referencePadding, "Padding must be >= 0." );
-        ParamUtils.isPositive( codingSequenceRefAlleleStart, "Genome positions must be > 0." );
-        ParamUtils.isPositive( alignedRefAlleleStart, "Genome positions must be > 0." );
+        Utils.validate( alignedRefAlleleStart <= codingSequenceRefAlleleStart, "The alignedRefAlleleStart must be less than or equal to codingSequenceRefAlleleStart!" );
+
 
         final int extraBasesNeeded = (codingSequenceRefAlleleStart - alignedRefAlleleStart);
         int refStartPos = referencePadding - extraBasesNeeded;
 
+        // TODO: This should probably be an error condition:
         if ( refStartPos < 0 ) {
             refStartPos = 0;
         }
@@ -1489,6 +1506,7 @@ public static int getProteinChangePosition(final Integer alignedCodingSequenceAl
 
         Utils.nonNull(alignedCodingSequenceAlleleStart);
         ParamUtils.isPositive( alignedCodingSequenceAlleleStart, "Genome positions must be > 0." );
+
         return ((alignedCodingSequenceAlleleStart-1) / 3) + 1; // Add 1 because we're 1-based.
     }
 

diff --git a/...oadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java b/...oadinstitute/hellbender/tools/funcotator/dataSources/cosmic/CosmicFuncotationFactory.java
@@ -252,9 +252,9 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var
                     try ( final ResultSet resultSet = statement.executeQuery(RESULT_QUERY_TEMPLATE + "\"" + geneName + "\";") ) {
                         // iterate through our results:
                         while ( resultSet.next() ) {
-                            // Get the genome position and protein position:
+
+                            // Get the genome position:
                             final SimpleInterval cosmicGenomePosition = getGenomePositionFromResults(resultSet);
-                            final SimpleInterval cosmicProteinPosition = getProteinPositionFromResults(resultSet);
 
                             // Try to match on genome position first:
                             if ( cosmicGenomePosition != null ) {
@@ -265,6 +265,9 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var
                                 }
                             }
 
+                            // Get the protein position:
+                            final SimpleInterval cosmicProteinPosition = getProteinPositionFromResults(resultSet);
+
                             // Now try to match on protein position:
                             if ( proteinPosition != null ) {
                                 // If we overlap the records, we update the counter:
@@ -332,7 +335,15 @@ private final SimpleInterval getGenomePositionFromResults(final ResultSet result
                 final int start = Integer.valueOf(matcher.group(2));
                 final int end = Integer.valueOf(matcher.group(3));
 
-                return new SimpleInterval(contig, start, end);
+                try {
+                    return new SimpleInterval(contig, start, end);
+                }
+                catch (final IllegalArgumentException ex) {
+                    // If we have poorly bounded genomic positions, we need to warn the user and move on.
+                    // These may occur occasionally in the data.
+                    logger.warn("Warning - unable to parse genome position string due to invalid position information.  Ignoring potential COSMIC match with genome position: " + rawPosition);
+                    return null;
+                }
             }
         }
         catch (final SQLException ex) {
@@ -369,18 +380,51 @@ private final SimpleInterval parseProteinString(final String proteinPositionStri
 
         final Matcher matcher = PROTEIN_POSITION_REGEX.matcher(proteinPositionString);
         if ( matcher.matches() ) {
-            // We have a position, so we should parse it.
-            // If we have an end position, we should use it:
-            if ( matcher.group(2) != null ) {
-                return new SimpleInterval(PROTEIN_CONTIG, Integer.valueOf(matcher.group(1)), Integer.valueOf(matcher.group(2)));
+            try {
+
+                // We have a position, so we should parse it.
+                // If we have an end position, we should use it:
+                if ( matcher.group(2) != null ) {
+                    return new SimpleInterval(PROTEIN_CONTIG, Integer.valueOf(matcher.group(1)), Integer.valueOf(matcher.group(2)));
+                }
+                else {
+                    return new SimpleInterval(PROTEIN_CONTIG, Integer.valueOf(matcher.group(1)), Integer.valueOf(matcher.group(1)));
+                }
             }
-            else {
-                return new SimpleInterval(PROTEIN_CONTIG, Integer.valueOf(matcher.group(1)), Integer.valueOf(matcher.group(1)));
+            catch (final IllegalArgumentException ex) {
+                // If we have poorly bounded protein changes, we need to warn the user and move on.
+                // These occur occasionally in the data.
+                logger.warn("Warning - unable to parse protein string due to invalid position information.  Ignoring potential COSMIC match with protein sequence: " + proteinPositionString);
+                return null;
             }
         }
         return null;
     }
 
+    /**
+     * Print the given {@link ResultSet} to stdout.
+     * @param resultSet The {@link ResultSet} to print.
+     */
+    private final void printResultSet(final ResultSet resultSet) {
+        try {
+            final ResultSetMetaData metadata    = resultSet.getMetaData();
+            final int               columnCount = metadata.getColumnCount();
+            for (int i = 1; i <= columnCount; i++) {
+                System.out.print(metadata.getColumnName(i) + ", ");
+            }
+            System.out.println();
+            while ( resultSet.next() ) {
+                for ( int i = 1; i <= columnCount; i++ ) {
+                    System.out.print(resultSet.getString(i) + ", ");
+                }
+                System.out.println();
+            }
+        }
+        catch (final SQLException ex) {
+            throw new GATKException("Cannot print resultSet!");
+        }
+    }
+
     //==================================================================================================================
     // Helper Data Types: