Skip to content

Commit

Permalink
Trim per-allele FORMAT annotations and optionally retain raw AS annotations (#5833)
Browse files Browse the repository at this point in the history

* Update A-type and R-type FORMAT annotations in GenotypeGVCFs
* Add option to keep combined AS annotations, as requested in #5698
  • Loading branch information
ldgauthier authored Mar 28, 2019
1 parent 18e4fc5 commit 2c066f5
Show file tree
Hide file tree
Showing 23 changed files with 969 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,15 @@
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.engine.AssemblyRegionEvaluator;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ShardBoundary;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.spark.*;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.engine.spark.SparkSharder;
import org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSink;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
Expand All @@ -41,7 +37,6 @@
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import scala.Tuple2;

import java.io.IOException;
import java.nio.file.Path;
Expand All @@ -51,9 +46,6 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
* ********************************************************************************
Expand Down Expand Up @@ -181,7 +173,7 @@ private static void processAssemblyRegions(
final Logger logger,
final boolean createOutputVariantIndex) {

final VariantAnnotatorEngine variantannotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine variantannotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);

final Path referencePath = IOUtils.getPath(reference);
final ReferenceSequenceFile driverReferenceSequenceFile = new CachingIndexedFastaSequenceFile(referencePath);
Expand Down Expand Up @@ -237,7 +229,7 @@ protected Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSu
final String pathOnExecutor = SparkFiles.get(referenceFileName);
final ReferenceSequenceFile taskReferenceSequenceFile = new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
final Collection<Annotation> annotations = makeVariantAnnotations();
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, getHeaderForReads(), taskReferenceSequenceFile, annotatorEngine);
}

Expand All @@ -250,7 +242,7 @@ private static Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvalua
final Path referencePath = IOUtils.getPath(reference);
final String referenceFileName = referencePath.getFileName().toString();
final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, header, taskReferenceSequenceFile, annotatorEngine);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ public void onTraversalStart() {
}

// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false);
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, false);

vcfWriter = getVCFWriter();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.M2FiltersArgumentCollection;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -94,6 +93,8 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
public static final String ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME = "only-output-calls-starting-in-intervals";
public static final String ALL_SITES_LONG_NAME = "include-non-variant-sites";
public static final String ALL_SITES_SHORT_NAME = "all-sites";
public static final String KEEP_COMBINED_LONG_NAME = "keep-combined-raw-annotations";
public static final String KEEP_COMBINED_SHORT_NAME = "keep-combined";
private static final String GVCF_BLOCK = "GVCFBlock";
private VCFHeader outputHeader;

Expand Down Expand Up @@ -127,6 +128,12 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic")
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score

/**
* If specified, keep the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations
*/
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations")
protected boolean keepCombined = false;

@ArgumentCollection
private GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();

Expand Down Expand Up @@ -199,7 +206,7 @@ public void onTraversalStart() {

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder?

annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false);
annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);

// Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list
for ( final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations() ) {
Expand Down Expand Up @@ -251,6 +258,9 @@ private void setupVCFWriter(final VCFHeader inputVCFHeader, final SampleList sam
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags
if (keepCombined) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_QUAL_KEY));
}
if ( dbsnp.dbsnp != null ) {
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
}
Expand Down Expand Up @@ -325,7 +335,10 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
// it difficult to recover the data mapping due to the keyed alleles no longer being present in the variant context.
final VariantContext withGenotypingAnnotations = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
final VariantContext withAnnotations = annotationEngine.finalizeAnnotations(withGenotypingAnnotations, originalVC);
result = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations);
final int[] relevantIndices = regenotypedVC.getAlleles().stream().mapToInt(a -> originalVC.getAlleles().indexOf(a)).toArray();
final VariantContext trimmed = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations);
final GenotypesContext updatedGTs = subsetAlleleSpecificFormatFields(outputHeader, trimmed.getGenotypes(), relevantIndices);
result = new VariantContextBuilder(trimmed).genotypes(updatedGTs).make();
} else if (includeNonVariants) {
result = originalVC;
} else {
Expand Down Expand Up @@ -395,6 +408,28 @@ private VariantContext removeNonRefAlleles(final VariantContext vc) {
}
}

/**
 * Rebuilds each genotype's extended FORMAT attributes so that per-allele (A-type / R-type)
 * values are subset down to the alleles that survived re-genotyping.
 *
 * @param outputHeader    header used to look up the FORMAT line (count type/number) for each attribute key
 * @param originalGs      genotypes whose extended attributes still refer to the original allele list
 * @param relevantIndices for each retained allele, its index in the original VariantContext's allele list;
 *                        used to pick the matching entries out of each per-allele attribute vector
 * @return a new GenotypesContext with the same genotypes but subset extended attributes
 */
private GenotypesContext subsetAlleleSpecificFormatFields(final VCFHeader outputHeader, final GenotypesContext originalGs, final int[] relevantIndices) {
final GenotypesContext newGTs = GenotypesContext.create(originalGs.size());
for (final Genotype g : originalGs) {
// start from a copy of the genotype; only extended attributes are rewritten below
final GenotypeBuilder gb = new GenotypeBuilder(g);
final Set<String> keys = g.getExtendedAttributes().keySet();
for (final String key : keys) {
// NOTE(review): getFormatHeaderLine can return null for a key absent from the output
// header, which would NPE below — confirm all emitted FORMAT keys have header lines
final VCFFormatHeaderLine headerLine = outputHeader.getFormatHeaderLine(key);
final Object attribute;
if (headerLine.getCountType().equals(VCFHeaderLineCount.INTEGER) && headerLine.getCount() == 1) {
// scalar (Number=1) attributes are allele-independent; copy through unchanged
attribute = g.getAnyAttribute(key);
}
else {
// per-allele vectors (A/R/G-typed counts) are subset to the retained allele indices
attribute = ReferenceConfidenceVariantContextMerger.generateAnnotationValueVector(headerLine.getCountType(),
GATKProtectedVariantContextUtils.attributeToList(g.getAnyAttribute(key)), relevantIndices);
}
gb.attribute(key, attribute);
}
newGTs.add(gb.make());
}
return newGTs;
}

/**
* Re-genotype (and re-annotate) a combined genomic VC
* @return a new VariantContext or null if the site turned monomorphic and we don't want such sites
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,26 @@ protected int[][] getTableFromSamples( final GenotypesContext genotypes, final i
return foundData ? decodeSBBS(sbArray) : null ;
}

@SuppressWarnings("unchecked")
public static int[] getStrandCounts(final Genotype g) {
int[] data;
if ( g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(String.class)) {
final String sbbsString = (String)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
data = encodeSBBS(sbbsString);
} else if (g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY).getClass().equals(ArrayList.class)) {
@SuppressWarnings("unchecked")
final List<Integer> sbbsList = (ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY);
data = encodeSBBS(sbbsList);
if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof Integer) {
data = encodeSBBS((ArrayList<Integer>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY));
}
else if (((ArrayList<Object>)g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)).get(0) instanceof String) {
final List<Integer> sbbsList = new ArrayList<>();
for (final Object o : (ArrayList<Object>) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) {
sbbsList.add(Integer.parseInt(o.toString()));
}
data = encodeSBBS(sbbsList);
} else {
throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
}

} else {
throw new GATKException("Unexpected " + GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY + " type");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ public void onTraversalStart() {
final List<String> samples = getHeaderForVariants().getGenotypeSamples();
variantSamples = new IndexedSampleList(samples);

annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, comps, false);
annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, comps, false, false);
annotatorEngine.addExpressions(expressionsToUse, resources, expressionAlleleConcordance );

// setup the header fields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,28 @@ public final class VariantAnnotatorEngine {
private final VariantOverlapAnnotator variantOverlapAnnotator;
private boolean expressionAlleleConcordance;
private final boolean useRawAnnotations;
private final boolean keepRawCombinedAnnotations;

private final static Logger logger = LogManager.getLogger(VariantAnnotatorEngine.class);

/**
* Creates an annotation engine from a list of selected annotations output from command line parsing
* @param annotationList list of annotation objects (with any parameters already filled) to include
* @param dbSNPInput input for variants from a known set from DbSNP or null if not provided.
* The annotation engine will mark variants overlapping anything in this set using {@link htsjdk.variant.vcf.VCFConstants#DBSNP_KEY}.
* The annotation engine will mark variants overlapping anything in this set using {@link VCFConstants#DBSNP_KEY}.
* @param featureInputs list of inputs with known variants.
* The annotation engine will mark variants overlapping anything in those sets using the name given by {@link FeatureInput#getName()}.
* Note: the DBSNP FeatureInput should be passed in separately, and not as part of this List - an GATKException will be thrown otherwise.
* Note: there are no non-DBSNP comparison FeatureInputs an empty List should be passed in here, rather than null.
* The annotation engine will mark variants overlapping anything in those sets using the name given by {@link FeatureInput#getName()}.
* Note: the DBSNP FeatureInput should be passed in separately, and not as part of this List - an GATKException will be thrown otherwise.
* Note: there are no non-DBSNP comparison FeatureInputs an empty List should be passed in here, rather than null.
* @param useRaw When this is set to true, the annotation engine will call {@link ReducibleAnnotation#annotateRawData(ReferenceContext, VariantContext, ReadLikelihoods)}
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, ReadLikelihoods)},
* which is the default for all annotations.
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, ReadLikelihoods)},
* @param keepCombined If true, retain the combined raw annotation values instead of removing them after finalizing
*/
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw){
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw,
boolean keepCombined){
Utils.nonNull(featureInputs, "comparisonFeatureInputs is null");
infoAnnotations = new ArrayList<>();
genotypeAnnotations = new ArrayList<>();
Expand All @@ -69,6 +71,7 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
variantOverlapAnnotator = initializeOverlapAnnotator(dbSNPInput, featureInputs);
reducibleKeys = new HashSet<>();
useRawAnnotations = useRaw;
keepRawCombinedAnnotations = keepCombined;
for (InfoFieldAnnotation annot : infoAnnotations) {
if (annot instanceof ReducibleAnnotation) {
reducibleKeys.add(((ReducibleAnnotation) annot).getRawKeyName());
Expand Down Expand Up @@ -118,12 +121,17 @@ public Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
* Returns the set of descriptions to be added to the VCFHeader line (for all annotations in this engine).
* @param useRaw Whether to prefer reducible annotation raw key descriptions over their normal descriptions
*/
public Set<VCFHeaderLine> getVCFAnnotationDescriptions(boolean useRaw) {
public Set<VCFHeaderLine> getVCFAnnotationDescriptions(final boolean useRaw) {
final Set<VCFHeaderLine> descriptions = new LinkedHashSet<>();

for ( final InfoFieldAnnotation annotation : infoAnnotations) {
if (annotation instanceof ReducibleAnnotation && useRaw) {
descriptions.addAll(((ReducibleAnnotation)annotation).getRawDescriptions());
if (annotation instanceof ReducibleAnnotation) {
if (useRaw || keepRawCombinedAnnotations) {
descriptions.addAll(((ReducibleAnnotation) annotation).getRawDescriptions());
}
if (!useRaw) {
descriptions.addAll(annotation.getDescriptions());
}
} else {
descriptions.addAll(annotation.getDescriptions());
}
Expand Down Expand Up @@ -221,7 +229,9 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
variantAnnotations.putAll(annotationsFromCurrentType);
}
//clean up raw annotation data after annotations are finalized
variantAnnotations.remove(currentASannotation.getRawKeyName());
if (!keepRawCombinedAnnotations) {
variantAnnotations.remove(currentASannotation.getRawKeyName());
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down Expand Up @@ -221,7 +220,7 @@ public void onTraversalStart() {
}

final VariantAnnotatorEngine variantAnnotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(),
hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
hcEngine = new HaplotypeCallerEngine(hcArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), getReferenceReader(referenceArguments), variantAnnotatorEngine);

// The HC engine will make the right kind (VCF or GVCF) of writer for us
Expand Down
Loading

0 comments on commit 2c066f5

Please sign in to comment.