(SV) Overhaul tests on assembly-based non-complex breakpoint and type inference code

Also

  * expand test cases on BND format variants
  * change how VariantContexts are extracted from NARL and SvType
  * refactor the tool DiscoverVariantsFromContigAlignmentsSAMSpark into a new worker class ContigChimericAlignmentIterativeInterpreter
SHuang-Broad committed Jun 22, 2018
1 parent 3a8885b commit aee9931
Showing 56 changed files with 4,724 additions and 3,839 deletions.
@@ -239,12 +239,6 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
@Argument(doc = "Maximum size deletion to call based on imprecise evidence without corroborating read depth evidence",
fullName = "max-callable-imprecise-deletion-size", optional=true)
public int maxCallableImpreciseVariantDeletionSize = DEFAULT_MAX_CALLABLE_IMPRECISE_DELETION_SIZE;

- @Advanced
- @Hidden
- @Argument(doc = "output cpx variants in a format that is more human friendly, primarily for debugging purposes",
- fullName = "cpx-for-human-eye", optional = true)
- public boolean outputCpxResultsInHumanReadableFormat = false;
}

}
@@ -27,6 +27,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContigGenerator;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.ContigAlignmentsModifier;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ImpreciseVariantDetector;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.AlignedAssemblyOrExcuse;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
@@ -176,10 +177,8 @@ protected void runTool( final JavaSparkContext ctx ) {
final SvDiscoveryInputMetaData svDiscoveryInputMetaData = getSvDiscoveryInputData(ctx, headerForReads, assembledEvidenceResults);

// TODO: 1/14/18 this is to be phased-out: old way of calling precise variants
// assembled breakpoints
@SuppressWarnings("deprecation")
List<VariantContext> assemblyBasedVariants =
- org.broadinstitute.hellbender.tools.spark.sv.discovery.DiscoverVariantsFromContigAlignmentsSAMSpark
+ ContigChimericAlignmentIterativeInterpreter
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedAlignments);

final List<VariantContext> annotatedVariants = processEvidenceTargetLinks(assemblyBasedVariants, svDiscoveryInputMetaData);

Large diffs in 4 files are not rendered by default.

@@ -20,6 +20,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryPipelineSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.*;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter;
@@ -34,6 +35,8 @@
import scala.Tuple2;

import java.io.Serializable;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -138,7 +141,7 @@ protected void runTool(final JavaSparkContext ctx) {
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
discoverStageArgs.cnvCallsFile);
- final String outputPrefixWithSampleName = outputPrefix + SVUtils.getSampleId(getHeaderForReads()) + "_";
+ final String outputPrefixWithSampleName = getOutputPrefix();
final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
null, null, null,
@@ -152,6 +155,20 @@ protected void runTool(final JavaSparkContext ctx) {
dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles);
}

/**
* @return prefix of outputs, with {@link #outputPrefix} decorated with sample name and trailing underscore
*/
private String getOutputPrefix() {
if ( Files.exists(Paths.get(outputPrefix)) ) {
if (Files.isDirectory(Paths.get(outputPrefix))) // existing directory
return outputPrefix + (outputPrefix.endsWith("/") ? "" : "/") + SVUtils.getSampleId(getHeaderForReads()) + "_";
else
throw new UserException("Provided prefix for output is pointing to an existing file: " + outputPrefix); // to avoid accidentally overwriting the file
} else { // outputPrefix doesn't point to an existing file or directory
return outputPrefix + (outputPrefix.endsWith("/") ? "" : "_") + SVUtils.getSampleId(getHeaderForReads()) + "_";
}
}

//==================================================================================================================

public static final class AssemblyContigsClassifiedByAlignmentSignatures {
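For reference, the decision rule in the new getOutputPrefix() above can be exercised in isolation. A minimal, self-contained sketch (not GATK code): the sample id is hard-coded in place of SVUtils.getSampleId(getHeaderForReads()), and IllegalArgumentException stands in for GATK's UserException.

import java.nio.file.Files;
import java.nio.file.Paths;

public class OutputPrefixDemo {

    // Mirrors the branching of getOutputPrefix(); sampleId stands in for
    // SVUtils.getSampleId(getHeaderForReads()).
    static String decorate(final String outputPrefix, final String sampleId) {
        if (Files.exists(Paths.get(outputPrefix))) {
            if (Files.isDirectory(Paths.get(outputPrefix))) {
                // existing directory: place sample-named outputs inside it
                return outputPrefix + (outputPrefix.endsWith("/") ? "" : "/") + sampleId + "_";
            }
            // existing regular file: refuse, to avoid overwriting it
            throw new IllegalArgumentException("prefix points to an existing file: " + outputPrefix);
        }
        // path doesn't exist yet: treat it as a plain name prefix
        return outputPrefix + (outputPrefix.endsWith("/") ? "" : "_") + sampleId + "_";
    }

    public static void main(final String[] args) {
        System.out.println(decorate("/tmp", "NA12878"));      // "/tmp/NA12878_" where /tmp is an existing directory
        System.out.println(decorate("/tmp/run1", "NA12878")); // "/tmp/run1_NA12878_" when /tmp/run1 does not exist
    }
}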
@@ -1,41 +1,169 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.EnumUtils;
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.utils.SimpleInterval;

- import java.util.Collections;
- import java.util.Map;
+ import java.io.IOException;
+ import java.util.*;

import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.INTERVAL_VARIANT_ID_FIELD_SEPARATOR;


/**
- * Various types of structural variations
+ * Various types of structural variations.
+ * Holding the minimum information a VariantContext can hold:
+ * CHR, POS, ID, REF, ALT, SVTYPE, END (optional), SVLEN (optional)
*/
public abstract class SvType {
public static final int NO_APPLICABLE_END = -1;
public static final int NO_APPLICABLE_LEN = -1;

// fields whose names start with "variant" follow the VCF spec, i.e. they can be used directly to construct a VariantContextBuilder
protected final String variantChromosome;
protected final int variantStart;
protected final int variantStop;
protected final String variantId;
protected final Allele refAllele;
protected final Allele altAllele;
protected final int svLen;
- protected final Map<String, String> extraAttributes;
+ protected final Map<String, Object> typeSpecificAttributes;

- protected static final Map<String, String> noExtraAttributes = Collections.emptyMap();
+ protected static final Map<String, Object> noExtraAttributes = Collections.emptyMap();

- protected SvType(final String id, final Allele altAllele, final int len, final Map<String, String> typeSpecificExtraAttributes) {
- variantId = id;
+ public SvType(final String variantChromosome, final int variantStart, final int variantStop, final String variantId,
+ final Allele refAllele, final Allele altAllele, final int svLen, final Map<String, Object> typeSpecificAttributes) {
+ this.variantChromosome = variantChromosome;
+ this.variantStart = variantStart;
+ this.variantStop = variantStop;
+ this.variantId = variantId;
+ this.refAllele = refAllele;
this.altAllele = altAllele;
- svLen = len;
- extraAttributes = typeSpecificExtraAttributes;
+ this.svLen = svLen;
+ this.typeSpecificAttributes = typeSpecificAttributes;
}

public final VariantContextBuilder getBasicInformation() {
if ( ! hasApplicableEnd() ) {
VariantContextBuilder builder = new VariantContextBuilder()
.chr(variantChromosome).start(variantStart).stop(variantStart)
.id(variantId)
.alleles(new ArrayList<>(Arrays.asList(refAllele, altAllele)))
.attribute(GATKSVVCFConstants.SVTYPE, toString());
typeSpecificAttributes.forEach(builder::attribute);
return builder;
} else { // assuming that if there is a valid END, there must be a valid SVLEN
VariantContextBuilder builder = new VariantContextBuilder()
.chr(variantChromosome).start(variantStart).stop(variantStop)
.id(variantId)
.alleles(new ArrayList<>(Arrays.asList(refAllele, altAllele)))
.attribute(VCFConstants.END_KEY, variantStop)
.attribute(GATKSVVCFConstants.SVTYPE, toString())
.attribute(GATKSVVCFConstants.SVLEN, svLen);
typeSpecificAttributes.forEach(builder::attribute);
return builder;
}
}

public String getVariantChromosome() {
return variantChromosome;
}
public int getVariantStart() {
return variantStart;
}
public int getVariantStop() {
return variantStop;
}
public final String getInternalVariantId() {
return variantId;
}
Allele getRefAllele() {
return refAllele;
}
public final Allele getAltAllele() {
return altAllele;
}
public final int getSVLength() {
return svLen;
}
- public final Map<String, String> getTypeSpecificAttributes() {
- return extraAttributes;
+ public final Map<String, Object> getTypeSpecificAttributes() {
+ return typeSpecificAttributes;
}

public abstract boolean hasApplicableEnd();
public abstract boolean hasApplicableLength();

@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;

final SvType svType = (SvType) o;

if (svLen != svType.svLen) return false;
if (!variantId.equals(svType.variantId)) return false;
if (!altAllele.equals(svType.altAllele)) return false;
if (typeSpecificAttributes.size() != svType.typeSpecificAttributes.size()) return false;

for ( final Map.Entry<String, Object> act : typeSpecificAttributes.entrySet() ) {
if ( svType.typeSpecificAttributes.containsKey(act.getKey())) {
if ( ! act.getValue().equals(svType.typeSpecificAttributes.get(act.getKey())))
return false;
} else
return false;
}
return true;
}

@Override
public int hashCode() {
int result = variantId.hashCode();
result = 31 * result + altAllele.hashCode();
result = 31 * result + svLen;
result = 31 * result + typeSpecificAttributes.hashCode();
return result;
}

// TODO: 5/23/18 any better way to do this?
public static SortedSet<String> getKnownTypes() {
final SortedSet<String> knownTypes = new TreeSet<>( EnumUtils.getEnumMap(SimpleSVType.SupportedType.class).keySet() );

knownTypes.add(GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR);

for (final BreakEndVariantType.SupportedType supportedType : BreakEndVariantType.SupportedType.values()) {
knownTypes.add(supportedType.name());
}

return Collections.unmodifiableSortedSet(knownTypes);
}

public static String makeLocationString(final String chr1, final int pos1, final String chr2, final int pos2) {
return chr1 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR
+ pos1 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR
+ (chr2.equals(chr1) ? "" : chr2 + INTERVAL_VARIANT_ID_FIELD_SEPARATOR)
+ pos2;
}

public static String makeLocationString(final NovelAdjacencyAndAltHaplotype novelAdjacencyAndAltHaplotype) {
String leftContig = novelAdjacencyAndAltHaplotype.getLeftJustifiedLeftRefLoc().getContig();
String rightContig = novelAdjacencyAndAltHaplotype.getLeftJustifiedRightRefLoc().getContig();
int pos1 = novelAdjacencyAndAltHaplotype.getLeftJustifiedLeftRefLoc().getStart();
int pos2 = novelAdjacencyAndAltHaplotype.getLeftJustifiedRightRefLoc().getEnd();
return makeLocationString(leftContig, pos1, rightContig, pos2);
}

static byte[] extractRefBases(final SimpleInterval interval, final ReferenceMultiSource reference) {
try {
return reference.getReferenceBases(interval).getBases();
} catch (final IOException ioex) {
throw new GATKException("Failed to extract bases from region: " + interval.toString());
}
}
public abstract boolean isBreakEndOnly();
}
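To illustrate how the reworked SvType API fits together, here is a hedged sketch of a minimal concrete subclass driving getBasicInformation(). ToyDeletion, its ID scheme, and its allele choices are hypothetical (the real simple types live in SimpleSVType); only the constructor signature, the three abstract methods, makeLocationString(), and the use of toString() as the SVTYPE attribute come from the diff above, and the underscore separator in the expected ID is an assumption about INTERVAL_VARIANT_ID_FIELD_SEPARATOR.

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvType;

import java.util.Collections;

public class SvTypeDemo {

    // Hypothetical subclass, for illustration only.
    static final class ToyDeletion extends SvType {
        ToyDeletion(final String chr, final int start, final int stop) {
            super(chr, start, stop,
                    "DEL_" + makeLocationString(chr, start, chr, stop), // e.g. "DEL_chr1_1000_2000" if the separator is "_"
                    Allele.create("N", true),       // placeholder reference allele
                    Allele.create("<DEL>", false),  // symbolic alt allele
                    start - stop,                   // deletions carry a negative SVLEN by VCF convention
                    Collections.emptyMap());
        }
        @Override public boolean hasApplicableEnd()    { return true; }
        @Override public boolean hasApplicableLength() { return true; }
        @Override public boolean isBreakEndOnly()      { return false; }
        @Override public String toString()             { return "DEL"; } // getBasicInformation() stores this as SVTYPE
    }

    public static void main(final String[] args) {
        final VariantContext vc = new ToyDeletion("chr1", 1000, 2000).getBasicInformation().make();
        System.out.println(vc); // CHROM=chr1, POS=1000, END=2000, SVTYPE=DEL, SVLEN=-1000
    }
}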
@@ -546,7 +546,7 @@ static boolean gappedAlignmentOffersBetterCoverage(final AlignmentInterval gappe
* then this function returns all the original {@code differentRepresentationsForOneContig}
*/
@VisibleForTesting
- static List<GoodAndBadMappings> filterSecondaryConfigurationsByMappingQualityThreshold(
+ public static List<GoodAndBadMappings> filterSecondaryConfigurationsByMappingQualityThreshold(
final List<GoodAndBadMappings> differentRepresentationsForOneContig,
final int threshold) {

@@ -278,10 +278,10 @@ public static boolean hasIncompletePictureFromTwoAlignments(final AlignmentInter
* head and tail alignment contain each other in terms of their reference span
* </li>
* <li>
- head and tail alignment mapped to different chromosome
+ head and tail alignment mapped to different chromosomes
* </li>
* <li>
- head and tail alignment mapped to different strands
+ head and tail alignment mapped to opposite strands of the same chromosome
* </li>
* <li>
* head and tail alignment have reference order switch, indicating we have likely
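The conditions listed in this (truncated) javadoc translate naturally into interval checks. Below is a simplified sketch covering only the conditions visible above, with a hypothetical Span record standing in for GATK's AlignmentInterval/SimpleInterval; it is not the actual implementation of hasIncompletePictureFromTwoAlignments.

public class IncompletePictureSketch {

    // Hypothetical stand-in for an alignment's reference span plus strand.
    record Span(String contig, int start, int end, boolean forwardStrand) {
        boolean contains(final Span other) {
            return contig.equals(other.contig) && start <= other.start && end >= other.end;
        }
    }

    static boolean hasIncompletePicture(final Span head, final Span tail) {
        if (head.contains(tail) || tail.contains(head)) return true;     // one reference span contains the other
        if (!head.contig().equals(tail.contig()))       return true;     // mapped to different chromosomes
        if (head.forwardStrand() != tail.forwardStrand()) return true;   // opposite strands of the same chromosome
        // the remaining condition (reference order switch) is elided in the truncated javadoc above
        return false;
    }

    public static void main(final String[] args) {
        final Span head = new Span("chr1", 100, 500, true);
        final Span tail = new Span("chr2", 700, 900, true);
        System.out.println(hasIncompletePicture(head, tail)); // true: different chromosomes
    }
}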
@@ -229,7 +229,7 @@ private static Cigar constructNewCigar(final List<CigarElement> left, final List
return new Cigar(SvCigarUtils.compactifyNeighboringSoftClippings(cigar.getCigarElements()));
}

- public static enum AlnModType {
+ public enum AlnModType {
NONE, UNDERGONE_OVERLAP_REMOVAL, EXTRACTED_FROM_LARGER_ALIGNMENT, FROM_SPLIT_GAPPED_ALIGNMENT;

public enum ModTypeString {
