ReferenceConfidenceVariantContextMerger fixes for spanning deletions, attribute merging. #4680

Merged 1 commit on Apr 23, 2018

@@ -211,7 +211,7 @@ public void onTraversalStart() {

         vcfWriter = getVCFWriter();

-        referenceConfidenceVariantContextMerger = new ReferenceConfidenceVariantContextMerger(annotationEngine);
+        referenceConfidenceVariantContextMerger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants());

         //now that we have all the VCF headers, initialize the annotations (this is particularly important to turn off RankSumTest dithering in integration tests)
         sequenceDictionary = getBestAvailableSequenceDictionary();

@@ -159,7 +159,7 @@ public void onTraversalStart() {
         // We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations.
         genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY));

-        merger = new ReferenceConfidenceVariantContextMerger(annotationEngine);
+        merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants());

         setupVCFWriter(inputVCFHeader, samples);
     }

@@ -4,6 +4,8 @@
 import htsjdk.samtools.util.Locatable;
 import htsjdk.variant.variantcontext.*;
 import htsjdk.variant.vcf.VCFConstants;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotationData;
@@ -28,12 +30,17 @@
 public final class ReferenceConfidenceVariantContextMerger {

     private final GenotypeLikelihoodCalculators calculators;
+    private final VCFHeader vcfInputHeader;
     protected final VariantAnnotatorEngine annotatorEngine;
-    protected final OneShotLogger warning = new OneShotLogger(this.getClass());
+    protected final OneShotLogger oneShotAnnotationLogger = new OneShotLogger(this.getClass());
+    protected final OneShotLogger oneShotHeaderLineLogger = new OneShotLogger(this.getClass());

-    public ReferenceConfidenceVariantContextMerger(VariantAnnotatorEngine engine){
+    public ReferenceConfidenceVariantContextMerger(VariantAnnotatorEngine engine, final VCFHeader inputHeader) {
+        Utils.nonNull(inputHeader, "A VCF header must be provided");
+
         calculators = new GenotypeLikelihoodCalculators();
         annotatorEngine = engine;
+        vcfInputHeader = inputHeader;
     }
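
With this change every caller must hand the merger the header describing its input variants. A rough sketch of what that looks like outside a GATK walker (the class name and file path are hypothetical, and the empty-engine configuration merely mirrors the concordance test at the bottom of this diff):

    import java.io.File;
    import java.util.Collections;

    import htsjdk.variant.vcf.VCFFileReader;
    import htsjdk.variant.vcf.VCFHeader;
    import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
    import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;

    public class MergerSetupSketch {
        public static void main(String[] args) {
            // Read the header from the input GVCF; this is what the merger will
            // consult for the declared types of INFO attributes.
            try (final VCFFileReader reader = new VCFFileReader(new File("input.g.vcf"), false)) {
                final VCFHeader inputHeader = reader.getFileHeader();
                final VariantAnnotatorEngine engine = VariantAnnotatorEngine.ofSelectedMinusExcluded(
                        Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
                        null, Collections.emptyList());
                final ReferenceConfidenceVariantContextMerger merger =
                        new ReferenceConfidenceVariantContextMerger(engine, inputHeader);
            }
        }
    }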
Expand Down Expand Up @@ -174,6 +181,9 @@ static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele
for (final Allele a : vc.getAlternateAlleles()) {
if (a.isSymbolic()) {
result.add(a);
} else if ( a == Allele.SPAN_DEL ) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really gross but it looks like the best we can do. It really seems like a bug to me that an Allele.SPAN_DEL is not symbolic but i guess thats an HTSJDK issue isn't it.

Copy link
Collaborator Author

@cmnbroad cmnbroad Apr 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it really does seem like it should be symbolic. And there is a proposal/discussion about that here. Not sure how disruptive that change would be, but for now, this matches GATK3.

// add SPAN_DEL directly so we don't try to extend the bases
result.add(a);
} else if (a.isCalled()) {
result.add(extendAllele(a, extraBaseCount, refBases));
} else { // NO_CALL and strange miscellanea
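
A minimal sketch (not part of the PR; the class name is hypothetical) of the htsjdk behavior the thread is discussing: the spanning-deletion allele is called but not symbolic, so without the new branch it would fall through to extendAllele() and get reference bases appended:

    import htsjdk.variant.variantcontext.Allele;

    public class SpanDelCheck {
        public static void main(String[] args) {
            final Allele spanDel = Allele.SPAN_DEL; // the VCF "*" allele
            // Not symbolic, so the first branch in remapAlleles() does not catch it...
            System.out.println(spanDel.isSymbolic()); // false
            // ...but it is called, so it would otherwise be base-extended.
            System.out.println(spanDel.isCalled());   // true
        }
    }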

@@ -379,19 +389,41 @@ private void addReferenceConfidenceAttributes(final VCWithNewAlleles vcPair, fin
                 annotationMap.put(key, values);
             }
             try {
-                values.add(parseNumber(value.toString()));
+                values.add(parseNumericInfoAttributeValue(vcfInputHeader, key, value.toString()));
             } catch (final NumberFormatException e) {
-                warning.warn(String.format("Detected invalid annotations: When trying to merge variant contexts at location %s:%d the annotation %s was not a numerical value and was ignored",vcPair.getVc().getContig(),vcPair.getVc().getStart(),p.toString()));
+                oneShotAnnotationLogger.warn(String.format("Detected invalid annotations: When trying to merge variant contexts at location %s:%d the annotation %s was not a numerical value and was ignored",vcPair.getVc().getContig(),vcPair.getVc().getStart(),p.toString()));
             }
         }
     }
 }

-    private Comparable<?> parseNumber(String stringValue) {
-        if (stringValue.contains(".")) {
-            return Double.parseDouble(stringValue);
-        } else {
-            return Integer.parseInt(stringValue);
-        }
-    }
+    // Use the VCF header's declared type for the given attribute to ensure that all the values for that attribute
+    // across all the VCs being merged have the same boxed representation. Some VCs have a serialized value of "0"
+    // for FLOAT attributes, with no embedded decimal point, but we still need to box those into Doubles, or the
+    // subsequent sorting required to obtain the median will fail due to the list having a mix of Comparable<Integer>
+    // and Comparable<Double>.
+    private Comparable<?> parseNumericInfoAttributeValue(final VCFHeader vcfHeader, final String key, final String stringValue) {
+        final VCFInfoHeaderLine infoLine = vcfHeader.getInfoHeaderLine(key);
+        if (infoLine == null) {
+            oneShotHeaderLineLogger.warn(String.format("At least one attribute was found (%s) for which there is no corresponding header line", key));
+            if (stringValue.contains(".")) {
+                return Double.parseDouble(stringValue);
+            } else {
+                return Integer.parseInt(stringValue);
+            }
+        }
+        switch (infoLine.getType()) {
+            case Integer:
+                return Integer.parseInt(stringValue);
+            case Float:
+                return Double.parseDouble(stringValue);
+            default:
+                throw new NumberFormatException(
+                        String.format(
+                                "The VCF header specifies type %s for INFO attribute key %s, but a numeric value is required",
+                                infoLine.getType().name(),
+                                key)
+                );
+        }
+    }

Review thread on the new header-driven parsing:

Contributor: Of all the terrible things that happen to annotations in htsjdk, at least it guarantees some decimal precision (VCFEncoder.formatVCFDouble()). I'm guessing this was an issue related to GenomicsDB output (which uses htslib)? I mentioned this to Karthik a while ago (#4047 (comment)). It should have been addressed by #4261, but I haven't tested that explicitly. Not that I object to the belt-and-suspenders approach.

@cmnbroad (Collaborator, author): @ldgauthier Good to know - thanks. At least two users have reported this; one of them seemed to implicate bcftools, but that may be a red herring. I do think htsjdk can produce such output, though, since it appears to rely on the in-memory representation of the value provided by the caller, rather than the header, to determine the format. I'd love to tighten that up in the next round of htsjdk updates.
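
A small demonstration (not part of the PR; the class name is hypothetical) of the failure mode the new comment describes: values parsed by their surface form get a mix of Integer and Double boxes, and the sort behind the median computation then throws, while parsing against the header's declared FLOAT type keeps the boxes homogeneous:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class MixedBoxingDemo {
        @SuppressWarnings({"rawtypes", "unchecked"})
        public static void main(String[] args) {
            // Parsing by surface form boxes "0" as an Integer but "1.46" as a Double.
            final List<Comparable> bySurfaceForm = new ArrayList<>();
            bySurfaceForm.add(0);     // from "0"
            bySurfaceForm.add(1.46);  // from "1.46"
            bySurfaceForm.add(2.46);  // from "2.46"
            try {
                Collections.sort(bySurfaceForm); // the median computation sorts first
            } catch (ClassCastException e) {
                System.out.println("mixed boxes are not mutually comparable: " + e);
            }

            // Parsing against a declared FLOAT type boxes all three values as Doubles.
            final List<Double> byHeaderType = new ArrayList<>(Arrays.asList(0.0, 1.46, 2.46));
            Collections.sort(byHeaderType); // sorts cleanly
            System.out.println("median = " + byHeaderType.get(byHeaderType.size() / 2)); // 1.46
        }
    }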

@@ -61,6 +61,9 @@ public Object[][] gvcfsToCombine() {
                 {new File[]{getTestFile("spanningDel.many.haploid.g.vcf")}, getTestFile("testMultipleSpanningDeletionsForOneSampleHaploid.vcf"), NO_EXTRA_ARGS, b37_reference_20_21},
                 // Simple Test, spanning deletions for tetraploid data
                 {new File[]{getTestFile("spanningDel.many.tetraploid.g.vcf")}, getTestFile("testMultipleSpanningDeletionsForOneSampleTetraploid.vcf"), NO_EXTRA_ARGS, b37_reference_20_21},
+                // Test that spanning deletion alleles in mixed sites don't get extended
+                {new File[]{getTestFile("spanningDeletionBaseExtensionTest1.vcf"), getTestFile("spanningDeletionBaseExtensionTest2.vcf")},
+                        getTestFile("spanningDeletionBaseExtensionTestExpected.g.vcf"), NO_EXTRA_ARGS, b37_reference_20_21},
                 // Testing BasePairResolutionInputs
                 {new File[]{getTestFile("gvcf.basepairResolution.vcf")}, getTestFile("testBasepairResolutionInput.vcf"), Arrays.asList("-A", "ClippingRankSumTest"), b37_reference_20_21},
                 // Interval Test

@@ -2,23 +2,29 @@

 import htsjdk.samtools.util.Locatable;
 import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.vcf.VCFHeader;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.GATKBaseTest;
 import org.broadinstitute.hellbender.utils.test.VariantContextTestUtils;
 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;

+import static org.broadinstitute.hellbender.utils.variant.GATKVCFConstants.MAP_QUAL_RANK_SUM_KEY;

/**
* Tests {@link ReferenceConfidenceVariantContextMerger}.
*
@@ -40,7 +46,7 @@ private static VariantAnnotatorEngine getAnnotationEngine() {
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final String testID, final List<VariantContext> toMerge, final Locatable loc,
final boolean returnSiteEvenIfMonomorphic, final boolean uniquifySamples, final VariantContext expectedResult) {
-        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine());
+        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine(), new VCFHeader());
final VariantContext result = merger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true, uniquifySamples);
if ( result == null ) {
Assert.assertTrue(expectedResult == null);
@@ -84,7 +90,7 @@ public void testGenerateADWithNewAlleles() {

@Test(expectedExceptions = UserException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
-        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine());
+        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine(), new VCFHeader());

final List<Allele> alleles1 = new ArrayList<>(1);
alleles1.add(Allele.create("A", true));
@@ -98,7 +104,7 @@ public void testGetIndexesOfRelevantAllelesWithNoALT() {
@Test(dataProvider = "getIndexesOfRelevantAllelesData")
public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List<Allele> allAlleles) {
final List<Allele> myAlleles = new ArrayList<>(3);
-        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine());
+        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine(), new VCFHeader());

// always add the reference and <ALT> alleles
myAlleles.add(allAlleles.get(0));
@@ -129,7 +135,7 @@ else if ( i == allelesIndex )
// referenceConfidenceVariantContextMerger.
@Test (dataProvider = "getIndexesOfRelevantAllelesDataSpanningDels")
public void testGetIndexesOfRelevantAllelesMultiSpanningDel(final List<Allele> allelesToFind, final List<Allele> allAlleles, final Genotype g, final int expectedIndex) {
-        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine());
+        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine(), new VCFHeader());

final int[] indexes = merger.getIndexesOfRelevantAlleles(allAlleles, allelesToFind,-1, g);

@@ -375,4 +381,45 @@ public Object[][] getSpanningDeletionCases(){
public void testVCWithNewAllelesIsSpanningDeletion(ReferenceConfidenceVariantContextMerger.VCWithNewAlleles vcWithNewAlleles, boolean expected){
Assert.assertEquals(vcWithNewAlleles.isSpanningDeletion(), expected);
}

+    @Test
+    public void testMedianCalculationOnMixedSerializedTypes() {
+        // Merging attributes by median calculation requires sorting the values, which in turn requires a list
+        // of values with homogeneous boxed representations. Make sure that FLOAT attributes with a serialized
+        // representation that looks like an integer (with no decimal point, i.e. "0") get boxed into the same
+        // type as other floating point values for that attribute to ensure successful sorting.
+        final double medianRankSum = 1.46;
+        final VCFHeader vcfHeader = new VCFHeader();
+        vcfHeader.addMetaDataLine(GATKVCFHeaderLines.getInfoLine(MAP_QUAL_RANK_SUM_KEY));
+
+        final VariantContextBuilder vcBuilder = new VariantContextBuilder("vc1", "20", 10, 10, Arrays.asList(Aref));
+
+        // create 3 VCs with one each of a small value, the median value, and a large value for MQ_RankSum
+        final List<VariantContext> toMergeVCs = new ArrayList<>(3);
+
+        // use a literal string for this one to ensure that we have at least one test value that
+        // has no embedded decimal point, to emulate conditions found in the wild
+        Map<String, Object> attributes1 = new HashMap<>();
+        attributes1.put(MAP_QUAL_RANK_SUM_KEY, "0");
+        toMergeVCs.add(vcBuilder.attributes(attributes1).make());
+
+        Map<String, Object> attributes2 = new HashMap<>();
+        attributes2.put(MAP_QUAL_RANK_SUM_KEY, Double.toString(medianRankSum));
+        toMergeVCs.add(vcBuilder.attributes(attributes2).make());
+
+        Map<String, Object> attributes3 = new HashMap<>();
+        attributes3.put(MAP_QUAL_RANK_SUM_KEY, "2.46");
+        toMergeVCs.add(vcBuilder.attributes(attributes3).make());
+
+        // merge and make sure we get the median value
+        final ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(getAnnotationEngine(), vcfHeader);
+        final VariantContext mergedVC = merger.merge(
+                toMergeVCs,
+                new SimpleInterval("20", 10, 10),
+                (byte) 'A',
+                true,
+                false);
+
+        Assert.assertEquals(mergedVC.getAttributeAsDouble(MAP_QUAL_RANK_SUM_KEY, -1.0), medianRankSum);
+    }
}

@@ -2,6 +2,7 @@

 import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.vcf.VCFHeader;
 import org.broadinstitute.hellbender.GATKBaseTest;
 import org.broadinstitute.hellbender.engine.FeatureDataSource;
 import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
@@ -86,7 +87,7 @@ public Object[][] interestingSitesCombineResults() {
@Test(dataProvider = "interestingSitesCombineResults")
public void testCombineAnnotationGATK3Concordance(List<VariantContext> VCs, VariantContext result, VariantContext genotyped) throws Exception {
VariantAnnotatorEngine annotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(Collections.emptyList(), getAnnotationsToUse(), Collections.emptyList(), null, Collections.emptyList());
-        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(annotatorEngine);
+        ReferenceConfidenceVariantContextMerger merger = new ReferenceConfidenceVariantContextMerger(annotatorEngine, new VCFHeader());
VariantContext merged = merger.merge(VCs, new SimpleInterval(result.getContig(), result.getStart(), result.getStart()), result.getReference().getBases()[0], false, false);
Assert.assertTrue(VariantContextTestUtils.alleleSpecificAnnotationEquals(merged, result, getRawKey()));
}