Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding argument to GenotypeGVCFs to keep only RAW_GT_COUNT #7996

Merged
merged 13 commits into from
Oct 24, 2022
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.broadinstitute.hellbender.cmdline.GATKPlugin;

import com.google.common.collect.Lists;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;

import java.io.File;
import java.lang.reflect.Modifier;
import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -485,6 +484,16 @@ public List<Annotation> getResolvedInstances() {
return resolvedInstances;
}

/**
 * Returns the entries of the discovered-annotation map whose Annotation is also present in the
 * resolved instances, keyed by the same String used in the discovered-annotation map.
 *
 * @return a Map of Strings to Annotations of resolved instances
 */
public Map<String, Annotation> getResolvedInstancesMap() {
    // Fetch the resolved instances once instead of once per discovered annotation.
    final List<Annotation> resolved = getResolvedInstances();
    return allDiscoveredAnnotations.entrySet().stream()
            .filter(entry -> resolved.contains(entry.getValue()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

/**
* Return the class representing the instance of the plugin specified by {@code pluginName}
Expand All @@ -496,5 +505,4 @@ public List<Annotation> getResolvedInstances() {
public Class<?> getClassForPluginHelp(final String pluginName) {
    // A name that was never discovered has no class to report.
    if (!allDiscoveredAnnotations.containsKey(pluginName)) {
        return null;
    }
    return allDiscoveredAnnotations.get(pluginName).getClass();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
Expand All @@ -17,6 +19,7 @@
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantLocusWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBImport;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
Expand All @@ -28,6 +31,7 @@
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -132,17 +136,17 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
doc = "LOD threshold to emit variant to VCF.")
protected double tlodThreshold = 3.5; //allow for some lower quality variants


/**
* Margin of error in allele fraction to consider a somatic variant homoplasmic, i.e. if there is less than a 0.1% reference allele fraction, those reads are likely errors
*/
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic")
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score

/**
* If specified, keep the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations
* If specified, keep all the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations. See {@link ReducibleAnnotation}
*/
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations")
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations",
mutex = {GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME})
protected boolean keepCombined = false;

@ArgumentCollection
Expand Down Expand Up @@ -172,6 +176,9 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@ArgumentCollection
private final DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

// @ArgumentCollection deliberately omitted since this is passed to the annotation plugin
final GenotypeGVCFsAnnotationArgumentCollection genotypeGVCFsAnnotationArgs = new GenotypeGVCFsAnnotationArgumentCollection();

// the annotation engine
private VariantAnnotatorEngine annotationEngine;

Expand Down Expand Up @@ -221,6 +228,16 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
@Override
public boolean useVariantAnnotations() {
    // This tool unconditionally opts in to variant annotations.
    return true;
}

/**
 * Builds the plugin descriptors for this tool. A read filter descriptor is always included; when
 * {@link #useVariantAnnotations()} is true, an annotation descriptor constructed from this tool's
 * own annotation argument collection is added after it.
 *
 * @return the list of plugin descriptors
 */
@Override
public List<? extends CommandLinePluginDescriptor<?>> getPluginDescriptors() {
    final GATKReadFilterPluginDescriptor readFilters = new GATKReadFilterPluginDescriptor(getDefaultReadFilters());
    if (!useVariantAnnotations()) {
        return Collections.singletonList(readFilters);
    }
    // Hand the tool-specific argument collection to the annotation plugin.
    final GATKAnnotationPluginDescriptor annotations = new GATKAnnotationPluginDescriptor(
            genotypeGVCFsAnnotationArgs,
            getDefaultVariantAnnotations(),
            getDefaultVariantAnnotationGroups());
    return Arrays.asList(readFilters, annotations);
}

@Override
public List<Class<? extends Annotation>> getDefaultVariantAnnotationGroups() {
return Arrays.asList(StandardAnnotation.class);
Expand Down Expand Up @@ -261,8 +278,9 @@ public void onTraversalStart() {
intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

Collection<Annotation> variantAnnotations = makeVariantAnnotations();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);
final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Set<Annotation> annotationsToKeep = getAnnotationsToKeep();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);

merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);

Expand All @@ -279,6 +297,17 @@ public void onTraversalStart() {

}

/**
 * Resolves the annotation names requested via the keep-specific-combined-raw-annotation argument
 * to the Annotation instances held by the annotation plugin descriptor.
 *
 * @return the set of Annotation instances matching the requested names
 * @throws UserException if a requested name is not among the plugin's resolved instances
 */
private Set<Annotation> getAnnotationsToKeep() {
    final GATKAnnotationPluginDescriptor pluginDescriptor = getCommandLineParser().getPluginDescriptor(GATKAnnotationPluginDescriptor.class);
    final List<String> annotationStringsToKeep = genotypeGVCFsAnnotationArgs.getKeepSpecifiedCombinedAnnotationNames();
    final Map<String, Annotation> resolvedInstancesMap = pluginDescriptor.getResolvedInstancesMap();
    // Validate and resolve in one mapping step; Stream.peek is meant for debugging, not validation.
    return annotationStringsToKeep.stream()
            .map(s -> {
                final Annotation annotation = resolvedInstancesMap.get(s);
                if (annotation == null) {
                    throw new UserException("Requested --" + GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME + ": " + s + " was not found in annotation list. Was it excluded with --" + StandardArgumentDefinitions.ANNOTATIONS_TO_EXCLUDE_LONG_NAME + " or not provided with --" + StandardArgumentDefinitions.ANNOTATION_LONG_NAME + "?");
                }
                return annotation;
            })
            .collect(Collectors.toSet());
}

@Override
public void apply(final Locatable loc, List<VariantContext> variants, ReadsContext reads, ReferenceContext ref, FeatureContext features) {

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.broadinstitute.hellbender.tools.walkers;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.DefaultGATKVariantAnnotationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Annotation argument collection that adds an argument naming the specific combined raw
 * annotations to keep, on top of the default variant annotation arguments.
 */
public class GenotypeGVCFsAnnotationArgumentCollection extends DefaultGATKVariantAnnotationArgumentCollection {
    private static final long serialVersionUID = 1L;

    public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME = "keep-specific-combined-raw-annotation";
    public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME = "keep-specific-combined";

    /**
     * Keep only the specific combined raw annotations specified. Cannot be used with --keep-combined-raw-annotations,
     * which saves all raw annotations. Duplicate values will be ignored.
     * See {@link ReducibleAnnotation} for more information on raw annotations.
     */
    @Argument(
            fullName = KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME,
            shortName = KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME,
            optional = true,
            mutex = {GenotypeGVCFs.KEEP_COMBINED_LONG_NAME},
            doc = "Keep only the specific combined raw annotations specified (removing the other raw annotations). Duplicate values will be ignored.")
    protected List<String> keepSpecifiedCombined = new ArrayList<>();

    /**
     * @return an unmodifiable view of the annotation names held in {@code keepSpecifiedCombined}
     */
    public List<String> getKeepSpecifiedCombinedAnnotationNames() {
        return Collections.unmodifiableList(keepSpecifiedCombined);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import htsjdk.variant.vcf.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFsAnnotationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -43,6 +45,7 @@ public final class VariantAnnotatorEngine {
private boolean expressionAlleleConcordance;
private final boolean useRawAnnotations;
private final boolean keepRawCombinedAnnotations;
private final List<String> rawAnnotationsToKeep;

private final static Logger logger = LogManager.getLogger(VariantAnnotatorEngine.class);
private final static OneShotLogger jumboAnnotationsLogger = new OneShotLogger(VariantAnnotatorEngine.class);
Expand All @@ -59,17 +62,21 @@ public final class VariantAnnotatorEngine {
* @param useRaw When this is set to true, the annotation engine will call {@link ReducibleAnnotation#annotateRawData(ReferenceContext, VariantContext, AlleleLikelihoods)}
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, AlleleLikelihoods)},
* @param keepCombined If true, retain the combined raw annotation values instead of removing them after finalizing
* @param rawAnnotationsToKeep List of raw annotations to keep even when others are removed
*/
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw,
boolean keepCombined){
final boolean keepCombined,
final Collection<Annotation> rawAnnotationsToKeep){
Utils.nonNull(featureInputs, "comparisonFeatureInputs is null");
infoAnnotations = new ArrayList<>();
genotypeAnnotations = new ArrayList<>();
jumboInfoAnnotations = new ArrayList<>();
jumboGenotypeAnnotations = new ArrayList<>();
final List<String> variantAnnotationKeys = new ArrayList<>();
final List<String> rawVariantAnnotationKeysToKeep = new ArrayList<>();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One other minor nit: why introduce this intermediate alias? To me, having the additional alias with a slightly different name just obfuscates what's going on - I'd suggest just allocating and using the rawAnnotationsToKeep instance variable directly, since this winds up being assigned there anyway.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, good catch. I fixed this and removed variantAnnotationKeys since it wasn't being used anymore (after moving the error message to GenotypeGVCFs)

for (Annotation annot : annotationList) {
if (annot instanceof InfoFieldAnnotation) {
infoAnnotations.add((InfoFieldAnnotation) annot);
Expand All @@ -82,11 +89,16 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
} else {
throw new GATKException.ShouldNeverReachHereException("Unexpected annotation type: " + annot.getClass().getName());
}
variantAnnotationKeys.addAll(((VariantAnnotation) annot).getKeyNames());
}
variantOverlapAnnotator = initializeOverlapAnnotator(dbSNPInput, featureInputs);
reducibleKeys = new LinkedHashSet<>();
useRawAnnotations = useRaw;
keepRawCombinedAnnotations = keepCombined;
for (final Annotation rawAnnot : rawAnnotationsToKeep) {
rawVariantAnnotationKeysToKeep.addAll(((VariantAnnotation) rawAnnot).getKeyNames());
}
this.rawAnnotationsToKeep = rawVariantAnnotationKeysToKeep;
for (InfoFieldAnnotation annot : infoAnnotations) {
if (annot instanceof ReducibleAnnotation) {
for (final String rawKey : ((ReducibleAnnotation) annot).getRawKeyNames()) {
Expand All @@ -96,6 +108,14 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
}
}

/**
 * Convenience overload that delegates to the six-argument constructor, passing an empty list
 * as the raw annotations to keep.
 *
 * @param annotationList the annotations handed to the engine
 * @param dbSNPInput the dbSNP feature input, passed through unchanged
 * @param featureInputs the comparison feature inputs, passed through unchanged
 * @param useRaw passed through as the engine's raw-annotation flag
 * @param keepCombined passed through as the engine's keep-combined-raw-annotations flag
 */
public VariantAnnotatorEngine(
        final Collection<Annotation> annotationList,
        final FeatureInput<VariantContext> dbSNPInput,
        final List<FeatureInput<VariantContext>> featureInputs,
        final boolean useRaw,
        final boolean keepCombined) {
    this(annotationList, dbSNPInput, featureInputs, useRaw, keepCombined, Collections.emptyList());
}

private VariantOverlapAnnotator initializeOverlapAnnotator(final FeatureInput<VariantContext> dbSNPInput, final List<FeatureInput<VariantContext>> featureInputs) {
final Map<FeatureInput<VariantContext>, String> overlaps = new LinkedHashMap<>();
for ( final FeatureInput<VariantContext> fi : featureInputs) {
Expand Down Expand Up @@ -253,6 +273,14 @@ public Map<String, Object> combineAnnotations(final List<Allele> allelesList, Ma
public VariantContext finalizeAnnotations(VariantContext vc, VariantContext originalVC) {
final Map<String, Object> variantAnnotations = new LinkedHashMap<>(vc.getAttributes());

//save annotations that have been requested to be kept
final Map<String, Object> savedRawAnnotations = new LinkedHashMap<>();
for(final String rawAnnot : rawAnnotationsToKeep) {
if (variantAnnotations.containsKey(rawAnnot)) {
savedRawAnnotations.put(rawAnnot, variantAnnotations.get(rawAnnot));
}
}

// go through all the requested info annotationTypes
for (final InfoFieldAnnotation annotationType : infoAnnotations) {
if (annotationType instanceof ReducibleAnnotation) {
Expand Down Expand Up @@ -280,6 +308,8 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
variantAnnotations.remove(GATKVCFConstants.VARIANT_DEPTH_KEY);
variantAnnotations.remove(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY);
}
//add back raw annotations that have specifically been requested to keep
variantAnnotations.putAll(savedRawAnnotations);

// generate a new annotated VC
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(variantAnnotations);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
* An interface for annotations that are calculated using raw data across samples, rather than the median (or median of median) of samples values
* The Raw annotation keeps some summary (one example might be a histogram of the raw values for each sample) of the individual sample (or allele)
* level annotation. As the annotations are combined across multiple samples the raw annotation continues to contain individual values while
* the final reduced annotation will typically be a summary statistic from these raw values.
*
*/
public interface ReducibleAnnotation extends Annotation {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addInfoLine(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
addInfoLine(new VCFInfoHeaderLine(AS_INBREEDING_COEFFICIENT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
addInfoLine(new VCFInfoHeaderLine(EXCESS_HET_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value for exact test of excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(LIKELIHOOD_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods"));
addInfoLine(new VCFInfoHeaderLine(MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score From Wilcoxon rank sum test of each Alt vs. Ref read mapping qualities"));
Expand Down
Loading