From d560cb53145a8fe6fe0cff8b0c897b5ddaa4a825 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Wed, 15 Aug 2018 13:29:27 -0400 Subject: [PATCH] Add option to recalculate the allele fraction based on AD instead of the Bayesian estimate --- .../walkers/annotator/AlleleFraction.java | 62 +++++++++++++++++++ .../annotator/DepthPerAlleleBySample.java | 2 +- .../walkers/mutect/M2ArgumentCollection.java | 5 ++ .../mutect/SomaticGenotypingEngine.java | 13 ++-- .../mutect/Mutect2IntegrationTest.java | 29 +++++++++ .../large/mutect/highDPMTsnippet.bai | 3 + .../large/mutect/highDPMTsnippet.bam | 3 + 7 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AlleleFraction.java create mode 100644 src/test/resources/large/mutect/highDPMTsnippet.bai create mode 100644 src/test/resources/large/mutect/highDPMTsnippet.bam diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AlleleFraction.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AlleleFraction.java new file mode 100644 index 00000000000..44e616e74b5 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AlleleFraction.java @@ -0,0 +1,62 @@ +package org.broadinstitute.hellbender.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFStandardHeaderLines; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.utils.MathUtils; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.hellbender.utils.help.HelpConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; + +import java.util.*; + +/** + * Created by gauthier on 8/15/18. + * Bug Laura to fill this in + */ +@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Variant allele fraction for a genotype") + +public final class AlleleFraction extends GenotypeAnnotation implements StandardMutectAnnotation { + @Override + public void annotate(final ReferenceContext ref, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final ReadLikelihoods likelihoods) { + Utils.nonNull(gb, "gb is null"); + Utils.nonNull(vc, "vc is null"); + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( g == null || !g.isCalled() || g.hasExtendedAttribute(getKeyNames().get(0))) { //don't overwrite AF based on Bayesian estimate if it already exists + return; + } + + for ( final Genotype genotype : genotypes ) { + if ( genotype.hasAD() ) { + final int[] AD = genotype.getAD(); + final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(Arrays.stream(AD).mapToDouble(x -> x*1.0).toArray()); + gb.attribute(getKeyNames().get(0), Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length)); //omit the first entry of the array corresponding to the reference + } + // if there is no AD value calculate it now using likelihoods + else if (likelihoods != null) { + DepthPerAlleleBySample adCalc = new DepthPerAlleleBySample(); + final int[] AD = adCalc.annotateWithLikelihoods(vc, g, new LinkedHashSet<>(vc.getAlleles()), likelihoods); + final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(Arrays.stream(AD).mapToDouble(x -> x*1.0).toArray()); + gb.attribute(getKeyNames().get(0), Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length)); //omit the first entry of the array corresponding to the reference + } + } + } + + @Override + public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.ALLELE_FRACTION_KEY);} + + @Override + public List getDescriptions() { + return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0))); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/DepthPerAlleleBySample.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/DepthPerAlleleBySample.java index dc4cccc6a1b..836ccef271f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/DepthPerAlleleBySample.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/DepthPerAlleleBySample.java @@ -95,7 +95,7 @@ private int[] annotateWithPileup(final VariantContext vc, List pi return counts; } - private int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set alleles, ReadLikelihoods likelihoods) { + protected int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set alleles, ReadLikelihoods likelihoods) { final Map alleleCounts = new LinkedHashMap<>(); for ( final Allele allele : vc.getAlleles() ) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index eff28f11659..c7f8fa2ca53 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -36,6 +36,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts"; public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors"; + public static final String GET_AF_FROM_AD_LONG_NAME = "get-af-from-ad"; public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8; public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6; @@ -164,4 +165,8 @@ public double getDefaultAlleleFrequency() { @Argument(fullName= IGNORE_ITR_ARTIFACTS_LONG_NAME, doc="Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats.", optional = true) public boolean dontClipITRArtifacts = false; + @Advanced + @Argument(fullName = GET_AF_FROM_AD_LONG_NAME, doc="Use allelic depth to calculate tumor allele fraction; recommended for mitochondrial applications", optional = true) + public boolean calculateAFfromAD = false; + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java index 134a1c59184..ee15b5091c2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java @@ -235,10 +235,13 @@ private void addGenotypes(final LikelihoodMatrix tumorLog10Matrix, final Optional> normalLog10Matrix, final VariantContextBuilder callVcb) { final double[] tumorAlleleCounts = getEffectiveCounts(tumorLog10Matrix); - final Genotype tumorGenotype = new GenotypeBuilder(tumorSample, tumorLog10Matrix.alleles()) - .AD(Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray()) - .attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, getAltAlleleFractions(tumorAlleleCounts)) - .make(); + final int[] adArray = Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray(); + final int dp = (int) MathUtils.sum(adArray); + final GenotypeBuilder gb = new GenotypeBuilder(tumorSample, tumorLog10Matrix.alleles()); + if (!MTAC.calculateAFfromAD) { + gb.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, getAltAlleleFractions(Arrays.stream(tumorAlleleCounts).map(x -> (double) FastMath.round(x)/dp).toArray())); + } + final Genotype tumorGenotype = gb.make(); final List genotypes = new ArrayList<>(Arrays.asList(tumorGenotype)); // TODO: We shouldn't always assume that the genotype in the normal is hom ref @@ -269,7 +272,7 @@ public static double[] getEffectiveCounts(final LikelihoodMatrix log10Li private static double[] getAltAlleleFractions(final double[] alleleCounts) { final double[] allAlleleFractions = MathUtils.normalizeFromRealSpace(alleleCounts); - return Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length); + return Arrays.copyOfRange(allAlleleFractions, 1, allAlleleFractions.length); //omit the first entry of the array corresponding to the reference } /** * Calculate the log10 likelihoods of the ref/alt het genotype for each alt allele, then subtracts diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java index 08576deed9d..0a86260e822 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java @@ -22,6 +22,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import scala.tools.nsc.transform.patmat.ScalaLogic; import java.io.File; import java.nio.file.Files; @@ -49,6 +50,8 @@ public class Mutect2IntegrationTest extends CommandLineProgramTest { private static final File NA12878_MITO_BAM = new File(toolsTestDir, "mutect/mito/NA12878.bam"); private static final File MITO_REF = new File(toolsTestDir, "mutect/mito/Homo_sapiens_assembly38.mt_only.fasta"); + private static final File DEEP_MITO_BAM = new File(largeFileTestDir, "mutect/highDPMTsnippet.bam"); + private static final String DEEP_MITO_SAMPLE_NAME = "mixture"; /** * Several DREAM challenge bams with synthetic truth data. In order to keep file sizes manageable, bams are restricted @@ -499,7 +502,33 @@ public void testMitochondria() throws Exception { Assert.assertTrue(expectedKeys.stream().allMatch(variantKeys::contains)); } + @Test + @SuppressWarnings("deprecated") + public void testAFfromADoverHighDP() throws Exception { + Utils.resetRandomGenerator(); + final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); + + final List args = Arrays.asList("-I", DEEP_MITO_BAM.getAbsolutePath(), + "-" + M2ArgumentCollection.TUMOR_SAMPLE_SHORT_NAME, DEEP_MITO_SAMPLE_NAME, + "-R", MITO_REF.getAbsolutePath(), + "-L", "chrM:1-1018", + "-ip", "300", + "-min-pruning", "4", + "--" + M2ArgumentCollection.GET_AF_FROM_AD_LONG_NAME, + "-O", unfilteredVcf.getAbsolutePath()); + runCommandLine(args); + + final List variants = VariantContextTestUtils.streamVcf(unfilteredVcf).collect(Collectors.toList()); + for (final VariantContext vc : variants) { + Assert.assertTrue(vc.isBiallelic()); //I do some lazy parsing below that won't hold for multiple alternate alleles + Genotype g = vc.getGenotype(DEEP_MITO_SAMPLE_NAME); + Assert.assertTrue(g.hasAD()); + final int[] ADs = g.getAD(); + Assert.assertTrue(g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY)); + Assert.assertEquals(Double.parseDouble(String.valueOf(vc.getGenotype(DEEP_MITO_SAMPLE_NAME).getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY,"0"))), (double)ADs[1]/(ADs[0]+ADs[1]), 1e-6); + } + } @DataProvider(name="bamoutVariations") public Object[][] bamoutVariations() { diff --git a/src/test/resources/large/mutect/highDPMTsnippet.bai b/src/test/resources/large/mutect/highDPMTsnippet.bai new file mode 100644 index 00000000000..083b9dd8192 --- /dev/null +++ b/src/test/resources/large/mutect/highDPMTsnippet.bai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728bbc3f4ee56a05b60faabbcecdd520e833370a303a7d687eaf4411d15bb932 +size 96 diff --git a/src/test/resources/large/mutect/highDPMTsnippet.bam b/src/test/resources/large/mutect/highDPMTsnippet.bam new file mode 100644 index 00000000000..71f299655fa --- /dev/null +++ b/src/test/resources/large/mutect/highDPMTsnippet.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0546ea256ac2cc2fc511e6d0dd288346c9c844dc5e0bc9f17f7f46b5f8e1e0 +size 1478779