Skip to content

Commit

Permalink
Trim per-allele FORMAT annotations and optionally retain raw AS annotations (#5833)
Browse files Browse the repository at this point in the history

* Update A-type and R-type FORMAT annotations in GenotypeGVCFs
* Add option to keep combined AS annotations, as requested in #5698
  • Loading branch information
ldgauthier authored Mar 28, 2019
1 parent 18e4fc5 commit 2c066f5
Show file tree
Hide file tree
Showing 23 changed files with 969 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,15 @@
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.engine.AssemblyRegionEvaluator;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ShardBoundary;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.*;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.SparkSharder;
import org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSink;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
Expand All @@ -41,7 +37,6 @@
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;

import java.io.IOException;
import java.nio.file.Path;
Expand All @@ -51,9 +46,6 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* ********************************************************************************
Expand Down Expand Up @@ -181,7 +173,7 @@ private static void processAssemblyRegions(
final Logger logger,
final boolean createOutputVariantIndex) {

final VariantAnnotatorEngine variantannotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine variantannotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);

final Path referencePath = IOUtils.getPath(reference);
final ReferenceSequenceFile driverReferenceSequenceFile = new CachingIndexedFastaSequenceFile(referencePath);
Expand Down Expand Up @@ -237,7 +229,7 @@ protected Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSu
final String pathOnExecutor = SparkFiles.get(referenceFileName);
final ReferenceSequenceFile taskReferenceSequenceFile = new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
final Collection<Annotation> annotations = makeVariantAnnotations();
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, getHeaderForReads(), taskReferenceSequenceFile, annotatorEngine);
}

Expand All @@ -250,7 +242,7 @@ private static Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvalua
final Path referencePath = IOUtils.getPath(reference);
final String referenceFileName = referencePath.getFileName().toString();
final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, header, taskReferenceSequenceFile, annotatorEngine);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ public void onTraversalStart() {
}

// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false);
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, false);

vcfWriter = getVCFWriter();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -94,6 +93,8 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
public static final String ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME = "only-output-calls-starting-in-intervals";
public static final String ALL_SITES_LONG_NAME = "include-non-variant-sites";
public static final String ALL_SITES_SHORT_NAME = "all-sites";
public static final String KEEP_COMBINED_LONG_NAME = "keep-combined-raw-annotations";
public static final String KEEP_COMBINED_SHORT_NAME = "keep-combined";
private static final String GVCF_BLOCK = "GVCFBlock";
private VCFHeader outputHeader;

Expand Down Expand Up @@ -127,6 +128,12 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic")
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score

/**
* If specified, keep the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations
*/
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations")
protected boolean keepCombined = false;

@ArgumentCollection
private GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();

Expand Down Expand Up @@ -199,7 +206,7 @@ public void onTraversalStart() {

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder?

annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false);
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);

// Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list
for ( final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations() ) {
Expand Down Expand Up @@ -251,6 +258,9 @@ private void setupVCFWriter(final VCFHeader inputVCFHeader, final SampleList sam
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if (keepCombined) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_QUAL_KEY));
}
if ( dbsnp.dbsnp != null ) {
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
}
Expand Down Expand Up @@ -325,7 +335,10 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
// it difficult to recover the data mapping due to the keyed alleles no longer being present in the variant context.
final VariantContext withGenotypingAnnotations = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
final VariantContext withAnnotations = annotationEngine.finalizeAnnotations(withGenotypingAnnotations, originalVC);
result = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations);
final int[] relevantIndices = regenotypedVC.getAlleles().stream().mapToInt(a -> originalVC.getAlleles().indexOf(a)).toArray();
final VariantContext trimmed = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations);
final GenotypesContext updatedGTs = subsetAlleleSpecificFormatFields(outputHeader, trimmed.getGenotypes(), relevantIndices);
result = new VariantContextBuilder(trimmed).genotypes(updatedGTs).make();
} else if (includeNonVariants) {
result = originalVC;
} else {
Expand Down Expand Up @@ -395,6 +408,28 @@ private VariantContext removeNonRefAlleles(final VariantContext vc) {
}
}

/**
 * Rebuilds each genotype's extended FORMAT attributes so that per-allele (A-type / R-type)
 * values are subset down to the alleles that survived re-genotyping.
 *
 * @param outputHeader    header used to look up the FORMAT line (count type/number) for each attribute key
 * @param originalGs      genotypes whose extended attributes still refer to the original allele list
 * @param relevantIndices for each retained allele, its index in the original VariantContext's allele list;
 *                        used to pick the matching entries out of each per-allele attribute vector
 * @return a new GenotypesContext with the same genotypes but subset extended attributes
 */
private GenotypesContext subsetAlleleSpecificFormatFields(final VCFHeader outputHeader, final GenotypesContext originalGs, final int[] relevantIndices) {
final GenotypesContext newGTs = GenotypesContext.create(originalGs.size());
for (final Genotype g : originalGs) {
// start from a copy of the genotype; only extended attributes are rewritten below
final GenotypeBuilder gb = new GenotypeBuilder(g);
final Set<String> keys = g.getExtendedAttributes().keySet();
for (final String key : keys) {
// NOTE(review): getFormatHeaderLine can return null for a key absent from the output
// header, which would NPE below — confirm all emitted FORMAT keys have header lines
final VCFFormatHeaderLine headerLine = outputHeader.getFormatHeaderLine(key);
final Object attribute;
if (headerLine.getCountType().equals(VCFHeaderLineCount.INTEGER) && headerLine.getCount() == 1) {
// scalar (Number=1) attributes are allele-independent; copy through unchanged
attribute = g.getAnyAttribute(key);
}
else {
// per-allele vectors (A/R/G-typed counts) are subset to the retained allele indices
attribute = ReferenceConfidenceVariantContextMerger.generateAnnotationValueVector(headerLine.getCountType(),
GATKProtectedVariantContextUtils.attributeToList(g.getAnyAttribute(key)), relevantIndices);
}
gb.attribute(key, attribute);
}
newGTs.add(gb.make());
}
return newGTs;
}

/**
* Re-genotype (and re-annotate) a combined genomic VC
* @return a new VariantContext or null if the site turned monomorphic and we don't want such sites
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,26 @@ protected int[][] getTableFromSamples( final GenotypesContext genotypes, final i
return foundData ? decodeSBBS(sbArray) : null ;
}

@SuppressWarnings("unchecked")
public static int[] getStrandCounts(final Genotype g) {
int[] data;
if ( g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(String.class)) {
final String sbbsString = (String)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
data = encodeSBBS(sbbsString);
} else if (g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(ArrayList.class)) {
@SuppressWarnings("unchecked")
final List<Integer> sbbsList = (ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
data = encodeSBBS(sbbsList);
if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof Integer) {
data = encodeSBBS((ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY));
}
else if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof String) {
final List<Integer> sbbsList = new ArrayList<>();
for (final Object o : (ArrayList<Object>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
sbbsList.add(Integer.parseInt(o.toString()));
}
data = encodeSBBS(sbbsList);
} else {
throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
}

} else {
throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ public void onTraversalStart() {
final List<String> samples = getHeaderForVariants().getGenotypeSamples();
variantSamples = new IndexedSampleList(samples);

annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, comps, false);
annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, comps, false, false);
annotatorEngine.addExpressions(expressionsToUse, resources, expressionAlleleConcordance );

// setup the header fields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,28 @@ public final class VariantAnnotatorEngine {
private final VariantOverlapAnnotator variantOverlapAnnotator;
private boolean expressionAlleleConcordance;
private final boolean useRawAnnotations;
private final boolean keepRawCombinedAnnotations;

private final static Logger logger = LogManager.getLogger(VariantAnnotatorEngine.class);

/**
* Creates an annotation engine from a list of selected annotations output from command line parsing
* @param annotationList list of annotation objects (with any parameters already filled) to include
* @param dbSNPInput input for variants from a known set from DbSNP or null if not provided.
* The annotation engine will mark variants overlapping anything in this set using {@link htsjdk.variant.vcf.VCFConstants#DBSNP_KEY}.
* The annotation engine will mark variants overlapping anything in this set using {@link VCFConstants#DBSNP_KEY}.
* @param featureInputs list of inputs with known variants.
* The annotation engine will mark variants overlapping anything in those sets using the name given by {@link FeatureInput#getName()}.
* Note: the DBSNP FeatureInput should be passed in separately, and not as part of this List - an GATKException will be thrown otherwise.
* Note: there are no non-DBSNP comparison FeatureInputs an empty List should be passed in here, rather than null.
* The annotation engine will mark variants overlapping anything in those sets using the name given by {@link FeatureInput#getName()}.
* Note: the DBSNP FeatureInput should be passed in separately, and not as part of this List - an GATKException will be thrown otherwise.
* Note: there are no non-DBSNP comparison FeatureInputs an empty List should be passed in here, rather than null.
* @param useRaw When this is set to true, the annotation engine will call {@link ReducibleAnnotation#annotateRawData(ReferenceContext, VariantContext, ReadLikelihoods)}
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, ReadLikelihoods)},
* which is the default for all annotations.
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, ReadLikelihoods)},
* @param keepCombined If true, retain the combined raw annotation values instead of removing them after finalizing
*/
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw){
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw,
boolean keepCombined){
Utils.nonNull(featureInputs, "comparisonFeatureInputs is null");
infoAnnotations = new ArrayList<>();
genotypeAnnotations = new ArrayList<>();
Expand All @@ -69,6 +71,7 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
variantOverlapAnnotator = initializeOverlapAnnotator(dbSNPInput, featureInputs);
reducibleKeys = new HashSet<>();
useRawAnnotations = useRaw;
keepRawCombinedAnnotations = keepCombined;
for (InfoFieldAnnotation annot : infoAnnotations) {
if (annot instanceof ReducibleAnnotation) {
reducibleKeys.add(((ReducibleAnnotation) annot).getRawKeyName());
Expand Down Expand Up @@ -118,12 +121,17 @@ public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
* Returns the set of descriptions to be added to the VCFHeader line (for all annotations in this engine).
* @param useRaw Whether to prefer reducible annotation raw key descriptions over their normal descriptions
*/
public Set<VCFHeaderLine> getVCFAnnotationDescriptions(boolean useRaw) {
public Set<VCFHeaderLine> getVCFAnnotationDescriptions(final boolean useRaw) {
final Set<VCFHeaderLine> descriptions = new LinkedHashSet<>();

for ( final InfoFieldAnnotation annotation : infoAnnotations) {
if (annotation instanceof ReducibleAnnotation && useRaw) {
descriptions.addAll(((ReducibleAnnotation)annotation).getRawDescriptions());
if (annotation instanceof ReducibleAnnotation) {
if (useRaw || keepRawCombinedAnnotations) {
descriptions.addAll(((ReducibleAnnotation) annotation).getRawDescriptions());
}
if (!useRaw) {
descriptions.addAll(annotation.getDescriptions());
}
} else {
descriptions.addAll(annotation.getDescriptions());
}
Expand Down Expand Up @@ -221,7 +229,9 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
variantAnnotations.putAll(annotationsFromCurrentType);
}
//clean up raw annotation data after annotations are finalized
variantAnnotations.remove(currentASannotation.getRawKeyName());
if (!keepRawCombinedAnnotations) {
variantAnnotations.remove(currentASannotation.getRawKeyName());
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down Expand Up @@ -221,7 +220,7 @@ public void onTraversalStart() {
}

final VariantAnnotatorEngine variantAnnotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(),
hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
hcEngine = new HaplotypeCallerEngine(hcArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), getReferenceReader(referenceArguments), variantAnnotatorEngine);

// The HC engine will make the right kind (VCF or GVCF) of writer for us
Expand Down
Loading

0 comments on commit 2c066f5

Please sign in to comment.