-
Notifications
You must be signed in to change notification settings - Fork 592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ReferenceConfidenceVariantContextMerger fixes for spanning deletions, attribute merging. #4680
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,8 @@ | |
import htsjdk.samtools.util.Locatable; | ||
import htsjdk.variant.variantcontext.*; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import htsjdk.variant.vcf.VCFInfoHeaderLine; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine; | ||
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotationData; | ||
|
@@ -28,12 +30,17 @@ | |
public final class ReferenceConfidenceVariantContextMerger { | ||
|
||
private final GenotypeLikelihoodCalculators calculators; | ||
private final VCFHeader vcfInputHeader; | ||
protected final VariantAnnotatorEngine annotatorEngine; | ||
protected final OneShotLogger warning = new OneShotLogger(this.getClass()); | ||
protected final OneShotLogger oneShotAnnotationLogger = new OneShotLogger(this.getClass()); | ||
protected final OneShotLogger oneShotHeaderLineLogger = new OneShotLogger(this.getClass()); | ||
|
||
public ReferenceConfidenceVariantContextMerger(VariantAnnotatorEngine engine, final VCFHeader inputHeader) { | ||
Utils.nonNull(inputHeader, "A VCF header must be provided"); | ||
|
||
public ReferenceConfidenceVariantContextMerger(VariantAnnotatorEngine engine){ | ||
calculators = new GenotypeLikelihoodCalculators(); | ||
annotatorEngine = engine; | ||
vcfInputHeader = inputHeader; | ||
} | ||
|
||
/** | ||
|
@@ -174,6 +181,9 @@ static List<Allele> remapAlleles(final VariantContext vc, final Allele refAllele | |
for (final Allele a : vc.getAlternateAlleles()) { | ||
if (a.isSymbolic()) { | ||
result.add(a); | ||
} else if ( a == Allele.SPAN_DEL ) { | ||
// add SPAN_DEL directly so we don't try to extend the bases | ||
result.add(a); | ||
} else if (a.isCalled()) { | ||
result.add(extendAllele(a, extraBaseCount, refBases)); | ||
} else { // NO_CALL and strange miscellanea | ||
|
@@ -379,19 +389,41 @@ private void addReferenceConfidenceAttributes(final VCWithNewAlleles vcPair, fin | |
annotationMap.put(key, values); | ||
} | ||
try { | ||
values.add(parseNumber(value.toString())); | ||
values.add(parseNumericInfoAttributeValue(vcfInputHeader, key, value.toString())); | ||
} catch (final NumberFormatException e) { | ||
warning.warn(String.format("Detected invalid annotations: When trying to merge variant contexts at location %s:%d the annotation %s was not a numerical value and was ignored",vcPair.getVc().getContig(),vcPair.getVc().getStart(),p.toString())); | ||
oneShotAnnotationLogger.warn(String.format("Detected invalid annotations: When trying to merge variant contexts at location %s:%d the annotation %s was not a numerical value and was ignored",vcPair.getVc().getContig(),vcPair.getVc().getStart(),p.toString())); | ||
} | ||
} | ||
} | ||
} | ||
|
||
private Comparable<?> parseNumber(String stringValue) { | ||
if (stringValue.contains(".")) { | ||
return Double.parseDouble(stringValue); | ||
} else { | ||
return Integer.parseInt(stringValue); | ||
// Use the VCF header's declared type for the given attribute to ensure that all the values for that attribute | ||
// across all the VCs being merged have the same boxed representation. Some VCs have a serialized value of "0" | ||
// for FLOAT attributes, with no embedded decimal point, but we still need to box those into Doubles, or the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of all the terrible things that happen to annotations in htsjdk, at least it guarantees some decimal precision (VCFEncoder.formatVCFDouble()). I'm guessing this was an issue related to GenomicsDB output (which uses htslib)? I mentioned this to Karthik a while ago (#4047 (comment)) It should have been addressed by #4261, but I haven't tested that explicitly. Not that I object to the belt and suspenders approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ldgauthier Good to know - thanks. At least two users have reported this, one of them seemed to implicate bcftools, but that may be a red herring. I do think htsjdk can produce such output though, since it appears to rely on the in-memory representation of the value provided by the caller, rather than the header, to determine the format. I’d love to tighten that up in the next round of htsjdk updates. |
||
// subsequent sorting required to obtain the median will fail due to the list having a mix of Comparable<Integer> | ||
// and Comparable<Double>. | ||
private Comparable<?> parseNumericInfoAttributeValue(final VCFHeader vcfHeader, final String key, final String stringValue) { | ||
final VCFInfoHeaderLine infoLine = vcfHeader.getInfoHeaderLine(key); | ||
if (infoLine == null) { | ||
oneShotHeaderLineLogger.warn(String.format("At least one attribute was found (%s) for which there is no corresponding header line", key)); | ||
if (stringValue.contains(".")) { | ||
return Double.parseDouble(stringValue); | ||
} else { | ||
return Integer.parseInt(stringValue); | ||
} | ||
} | ||
switch (infoLine.getType()) { | ||
case Integer: | ||
return Integer.parseInt(stringValue); | ||
case Float: | ||
return Double.parseDouble(stringValue); | ||
default: | ||
throw new NumberFormatException( | ||
String.format( | ||
"The VCF header specifies type %s type for INFO attribute key %s, but a numeric value is required", | ||
infoLine.getType().name(), | ||
key) | ||
); | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is really gross, but it looks like the best we can do. It really seems like a bug to me that Allele.SPAN_DEL is not symbolic, but I guess that's an HTSJDK issue, isn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, it really does seem like it should be symbolic. And there is a proposal/discussion about that here. Not sure how disruptive that change would be, but for now, this matches GATK3.