From fdff6dbad70de2aa1674063c99ecc0e83d3a6609 Mon Sep 17 00:00:00 2001
From: David Benjamin
Date: Mon, 24 Jan 2022 15:25:05 -0500
Subject: [PATCH] Huge simplification of genotyping likelihoods calculations --
no change in output
---
.../models/AlleleFractionLikelihoods.java | 10 +-
.../copynumber/models/CopyRatioSamplers.java | 6 +-
...aiveHeterozygousPileupGenotypingUtils.java | 2 +-
...ferenceConfidenceVariantContextMerger.java | 19 +-
.../contamination/ContaminationModel.java | 2 +-
.../genotyper/AlleleSubsettingUtils.java | 83 +--
.../genotyper/DRAGENGenotypesModel.java | 38 +-
.../genotyper/GenotypeAlleleCounts.java | 179 ++---
.../genotyper/GenotypeIndexCalculator.java | 206 ++++++
.../GenotypeLikelihoodCalculator.java | 625 ++++--------------
.../GenotypeLikelihoodCalculatorDRAGEN.java | 146 ++--
.../GenotypeLikelihoodCalculators.java | 418 ------------
.../walkers/genotyper/GenotypesCache.java | 89 +++
.../walkers/genotyper/GenotypingEngine.java | 5 +-
.../IndependentSampleGenotypesModel.java | 42 +-
.../afcalc/AlleleFrequencyCalculator.java | 86 ++-
.../GnarlyGenotyperEngine.java | 26 +-
.../HaplotypeCallerGenotypingEngine.java | 2 +-
.../ReferenceConfidenceModel.java | 12 +-
.../graphs/KBestHaplotype.java | 2 +-
.../tools/walkers/mutect/Mutect2Engine.java | 3 +-
.../walkers/variantutils/ReblockGVCF.java | 19 +-
.../hellbender/utils/GenotypeUtils.java | 51 +-
.../hellbender/utils/IndexRange.java | 14 +
.../hellbender/utils/Log10Cache.java | 13 -
.../hellbender/utils/Log10FactorialCache.java | 20 -
.../hellbender/utils/MannWhitneyU.java | 18 +-
.../hellbender/utils/MathUtils.java | 93 +--
.../hellbender/utils/NaturalLogUtils.java | 5 +-
.../genotyper/GenotypePriorCalculator.java | 35 +-
.../utils/recalibration/RecalDatum.java | 42 +-
.../variant/GATKVariantContextUtils.java | 19 +-
.../ModelSegmentsIntegrationTest.java | 7 +-
.../GenotypeAlleleCountsUnitTest.java | 66 +-
.../GenotypeIndexCalculatorUnitTest.java | 148 +++++
.../GenotypeLikelihoodCalculatorUnitTest.java | 132 +---
...GenotypeLikelihoodCalculatorsUnitTest.java | 103 ---
.../genotyper/GenotypesCacheUnitTest.java | 55 ++
...dependentSampleGenotypesModelUnitTest.java | 2 +-
.../AlleleFrequencyCalculatorUnitTest.java | 13 +-
.../hellbender/utils/IndexRangeUnitTest.java | 10 +
.../hellbender/utils/MathUtilsUnitTest.java | 98 +--
.../genotyper/ReadLikelihoodsUnitTester.java | 2 +-
...ocumentationGenerationIntegrationTest.java | 3 +-
.../recalibration/RecalDatumUnitTest.java | 30 +-
.../multiple-sample-ac-nac-tumor-1.af.igv.seg | 3 +-
.../multiple-sample-ac-nac-tumor-1.cr.igv.seg | 3 +-
.../multiple-sample-ac-nac-tumor-1.cr.seg | 3 +-
...-sample-ac-nac-tumor-1.modelBegin.af.param | 6 +-
...tiple-sample-ac-nac-tumor-1.modelBegin.seg | 382 +++++------
...-sample-ac-nac-tumor-1.modelFinal.af.param | 6 +-
...tiple-sample-ac-nac-tumor-1.modelFinal.seg | 3 +-
.../multiple-sample-ac-tumor-1.af.igv.seg | 3 +-
.../multiple-sample-ac-tumor-1.cr.igv.seg | 3 +-
.../multiple-sample-ac-tumor-1.cr.seg | 3 +-
...iple-sample-ac-tumor-1.modelBegin.af.param | 6 +-
.../multiple-sample-ac-tumor-1.modelBegin.seg | 576 ++++++++--------
...iple-sample-ac-tumor-1.modelFinal.af.param | 6 +-
.../multiple-sample-ac-tumor-1.modelFinal.seg | 3 +-
...ltiple-sample-cr-ac-nac-tumor-1.af.igv.seg | 14 +-
...mple-cr-ac-nac-tumor-1.modelBegin.af.param | 6 +-
...le-sample-cr-ac-nac-tumor-1.modelBegin.seg | 156 ++---
...mple-cr-ac-nac-tumor-1.modelFinal.af.param | 4 +-
...le-sample-cr-ac-nac-tumor-1.modelFinal.seg | 24 +-
.../multiple-sample-cr-ac-tumor-1.af.igv.seg | 6 +-
...e-sample-cr-ac-tumor-1.modelBegin.af.param | 6 +-
...ltiple-sample-cr-ac-tumor-1.modelBegin.seg | 222 +++----
...e-sample-cr-ac-tumor-1.modelFinal.af.param | 6 +-
...ltiple-sample-cr-ac-tumor-1.modelFinal.seg | 10 +-
.../single-sample-ac-nac.af.igv.seg | 20 +-
.../single-sample-ac-nac.cr.igv.seg | 10 +-
.../single-sample-ac-nac.cr.seg | 10 +-
.../single-sample-ac-nac.modelBegin.af.param | 6 +-
.../single-sample-ac-nac.modelBegin.seg | 92 +--
.../single-sample-ac-nac.modelFinal.af.param | 6 +-
.../single-sample-ac-nac.modelFinal.seg | 20 +-
.../single-sample-ac.modelBegin.af.param | 6 +-
.../single-sample-ac.modelBegin.seg | 136 ++--
.../single-sample-ac.modelFinal.af.param | 6 +-
.../single-sample-cr-ac-nac.af.igv.seg | 16 +-
...ingle-sample-cr-ac-nac.modelBegin.af.param | 6 +-
.../single-sample-cr-ac-nac.modelBegin.seg | 24 +-
...ingle-sample-cr-ac-nac.modelFinal.af.param | 6 +-
.../single-sample-cr-ac-nac.modelFinal.seg | 16 +-
.../single-sample-cr-ac.af.igv.seg | 10 +-
.../single-sample-cr-ac.modelBegin.af.param | 4 +-
.../single-sample-cr-ac.modelBegin.seg | 12 +-
.../single-sample-cr-ac.modelFinal.af.param | 6 +-
.../single-sample-cr-ac.modelFinal.seg | 12 +-
89 files changed, 1942 insertions(+), 2941 deletions(-)
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeIndexCalculator.java
delete mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java
create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypesCache.java
delete mode 100644 src/main/java/org/broadinstitute/hellbender/utils/Log10Cache.java
delete mode 100644 src/main/java/org/broadinstitute/hellbender/utils/Log10FactorialCache.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeIndexCalculatorUnitTest.java
delete mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculatorsUnitTest.java
create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypesCacheUnitTest.java
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java
index 11b2b8c1931..43cddbe0be4 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.copynumber.models;
import org.apache.commons.math3.special.Gamma;
+import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.hellbender.utils.NaturalLogUtils;
@@ -8,8 +9,6 @@
import java.util.stream.IntStream;
import static org.apache.commons.math3.util.FastMath.sqrt;
-import static org.broadinstitute.hellbender.utils.MathUtils.log10Factorial;
-import static org.broadinstitute.hellbender.utils.MathUtils.log10ToLog;
/**
* Contains likelihood methods for the allele-fraction model.
@@ -87,10 +86,7 @@ static double hetLogLikelihood(final AlleleFractionGlobalParameters parameters,
- n * log(majorFraction + minorFraction * lambda0RefMinor);
final double refMinorLogLikelihood = logNotPi + logcRefMinor + Gamma.logGamma(rhoRefMinor) - rhoRefMinor * log(tauRefMinor);
- // changing the factorial implementation below may introduce non-negligible numerical differences;
- // note https://github.com/broadinstitute/gatk/pull/7652
- final double outlierLogLikelihood = logPi + log10ToLog(log10Factorial(a) + log10Factorial(r) - log10Factorial(a + r + 1));
-
+ final double outlierLogLikelihood = logPi - Math.log(a + r + 1) - CombinatoricsUtils.binomialCoefficientLog(a+r,a);
return NaturalLogUtils.logSumExp(altMinorLogLikelihood, refMinorLogLikelihood, outlierLogLikelihood);
}
@@ -165,6 +161,6 @@ private static double biasPosteriorEffectiveBeta(final double lambda0, final dou
}
private static double log(final double x) {
- return FastMath.log(Math.max(EPSILON, x));
+ return Math.log(Math.max(EPSILON, x));
}
}
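// Illustrative check, not part of the patch: the new outlier term above relies on the identity
// log(a! * r! / (a+r+1)!) = -log(a+r+1) - log C(a+r, a), because (a+r+1)! = (a+r+1) * (a+r)! and
// (a+r)! / (a! * r!) = C(a+r, a). A minimal self-contained verification sketch (hypothetical class name):
import org.apache.commons.math3.util.CombinatoricsUtils;

public class OutlierTermIdentityCheck {
    public static void main(String[] args) {
        final int a = 7, r = 11;
        final double viaFactorials = CombinatoricsUtils.factorialLog(a) + CombinatoricsUtils.factorialLog(r)
                - CombinatoricsUtils.factorialLog(a + r + 1);
        final double viaBinomial = -Math.log(a + r + 1) - CombinatoricsUtils.binomialCoefficientLog(a + r, a);
        // both values agree up to floating-point round-off
        System.out.println(viaFactorials + " vs " + viaBinomial);
    }
}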
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java
index 3b5ea04b819..dc94a1d5d5c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java
@@ -162,11 +162,11 @@ public CopyRatioState.OutlierIndicators sample(final RandomGenerator rng,
final CopyRatioSegmentedData data) {
logger.debug("Sampling outlier indicators...");
final double outlierUnnormalizedLogProbability =
- FastMath.log(state.outlierProbability()) + outlierUniformLogLikelihood;
+ Math.log(state.outlierProbability()) + outlierUniformLogLikelihood;
// final double notOutlierUnnormalizedLogProbabilityPrefactor =
-// FastMath.log(1. - state.outlierProbability()) - 0.5 * FastMath.log(2 * Math.PI * state.variance());
+// Math.log(1. - state.outlierProbability()) - 0.5 * Math.log(2 * Math.PI * state.variance());
final double notOutlierUnnormalizedLogProbabilityPrefactor =
- FastMath.log((1. - state.outlierProbability()) / FastMath.sqrt(2 * Math.PI * state.variance()));
+ Math.log((1. - state.outlierProbability()) / FastMath.sqrt(2 * Math.PI * state.variance()));
final List indicators = new ArrayList<>(data.getNumPoints());
for (int segmentIndex = 0; segmentIndex < data.getNumSegments(); segmentIndex++) {
final List indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segmentIndex);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/genotyping/NaiveHeterozygousPileupGenotypingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/genotyping/NaiveHeterozygousPileupGenotypingUtils.java
index 97b46ea33ba..be1369fabe0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/genotyping/NaiveHeterozygousPileupGenotypingUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/genotyping/NaiveHeterozygousPileupGenotypingUtils.java
@@ -254,6 +254,6 @@ private static double calculateHomozygousLogRatio(final AllelicCount allelicCoun
final double betaOneMinusError = Beta.regularizedBeta(1 - genotypingBaseErrorRate, r + 1, n - r + 1);
final double betaHom = betaError + betaAll - betaOneMinusError;
final double betaHet = betaOneMinusError - betaError;
- return FastMath.log(betaHom) - FastMath.log(betaHet);
+ return Math.log(betaHom) - Math.log(betaHet);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
index f6d01b321cb..f57d50a7cad 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
@@ -11,9 +11,7 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotationData;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.AlleleSubsettingUtils;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.GenotypeUtils;
import org.broadinstitute.hellbender.utils.Utils;
@@ -34,7 +32,6 @@
@SuppressWarnings({"rawtypes","unchecked"}) //TODO fix uses of untyped Comparable.
public final class ReferenceConfidenceVariantContextMerger {
- private static final GenotypeLikelihoodCalculators calculators = new GenotypeLikelihoodCalculators();
private static VCFHeader vcfInputHeader = null;
protected final VariantAnnotatorEngine annotatorEngine;
private final boolean doSomaticMerge;
@@ -571,7 +568,6 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
- final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
for ( final Genotype g : vc.getGenotypes() ) {
final String name;
@@ -584,20 +580,17 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
if (!doSomaticMerge) {
if (g.hasPL() || g.hasAD()) {
- int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
+ int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
if (g.hasPL()) {
- // lazy initialization of the genotype index map by ploidy.
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
- ? calculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles, calculators) //probably horribly slow
+ ? GenotypeIndexCalculator.newToOldGenotypeMap(ploidy, perSampleIndexesOfRelevantAlleles) //probably horribly slow
: genotypeIndexMapsByPloidy[ploidy];
- final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
- genotypeBuilder.PL(PLs);
+ genotypeBuilder.PL(generatePL(g, genotypeIndexMapByPloidy));
}
if (g.hasAD()) {
- final int[] AD = AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles);
- genotypeBuilder.AD(AD);
+ genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}
- // clean up low confidence hom refs for better annotations later
+ //clean up low confidence hom refs for better annotations later
} else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
index 2fa40d8545a..44b07f6fcfe 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
@@ -277,7 +277,7 @@ private static double probability(final PileupSummary site, final double contami
}
private static double segmentLogLikelihood(final List segment, final double contamination, final double errorRate, final double minorAlleleFraction) {
- return segment.stream().mapToDouble(site -> FastMath.log(MathUtils.sum(genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction)))).sum();
+ return segment.stream().mapToDouble(site -> Math.log(MathUtils.sum(genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction)))).sum();
}
private static double modelLogLikelihood(final List> segments, final double contamination, final double errorRate, final List mafs) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
index 9422472b9fe..60f1ef7b205 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
@@ -4,15 +4,17 @@
import com.google.common.primitives.Doubles;
import com.google.common.primitives.Ints;
import htsjdk.variant.variantcontext.*;
-import htsjdk.variant.vcf.*;
+import htsjdk.variant.vcf.VCFConstants;
+import htsjdk.variant.vcf.VCFFormatHeaderLine;
+import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
-import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
+import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.Permutation;
+import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
@@ -37,8 +39,6 @@ private AlleleSubsettingUtils() {} // prevent instantiation
private static final OneShotLogger attributesRemovedOneShotLogger = new OneShotLogger(AlleleSubsettingUtils.class);
- private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
-
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List originalAlleles,
final List allelesToKeep,
@@ -47,6 +47,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
//TODO: if other usages of this method should update or remove A,R, or G length annotations then header parsing is necessary and the method below should be used
return subsetAlleles(originalGs, defaultPloidy, originalAlleles, allelesToKeep, gpc, assignmentMethod, Collections.emptyList());
}
+
/**
* Create the new GenotypesContext with the subsetted PLs and ADs
*
@@ -399,12 +400,10 @@ static double[] calculateLikelihoodSums(final VariantContext vc, final int defau
final double GLDiffBetweenRefAndBestVariantGenotype = Math.abs(glsVector[indexOfMostLikelyVariantGenotype] - glsVector[PL_INDEX_OF_HOM_REF]);
final int ploidy = genotype.getPloidy() > 0 ? genotype.getPloidy() : defaultPloidy;
- final int[] alleleCounts = new GenotypeLikelihoodCalculators()
- .getInstance(ploidy, vc.getNAlleles()).genotypeAlleleCountsAt(indexOfMostLikelyVariantGenotype)
- .alleleCountsByIndex(vc.getNAlleles() - 1);
+ final GenotypeAlleleCounts mostLikelyGenotypeAlleleCounts = GenotypesCache.get(ploidy, indexOfMostLikelyVariantGenotype);
- for (int allele = 1; allele < alleleCounts.length; allele++) {
- if (alleleCounts[allele] > 0) {
+ for (int allele = 1; allele < vc.getNAlleles(); allele++) {
+ if (mostLikelyGenotypeAlleleCounts.containsAllele(allele)) {
likelihoodSums[allele] += GLDiffBetweenRefAndBestVariantGenotype;
}
}
@@ -428,10 +427,7 @@ public static int[] subsettedPLIndices(final int ploidy, final List orig
final int[] result = new int[GenotypeLikelihoods.numLikelihoods(newAlleles.size(), ploidy)];
final Permutation allelePermutation = new IndexedAlleleList<>(originalAlleles).permutation(new IndexedAlleleList<>(newAlleles));
- final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, originalAlleles.size());
- for (int oldPLIndex = 0; oldPLIndex < glCalc.genotypeCount(); oldPLIndex++) {
- final GenotypeAlleleCounts oldAlleleCounts = glCalc.genotypeAlleleCountsAt(oldPLIndex);
-
+ for (final GenotypeAlleleCounts oldAlleleCounts : GenotypeAlleleCounts.iterable(ploidy, originalAlleles.size())) {
final boolean containsOnlyNewAlleles = IntStream.range(0, oldAlleleCounts.distinctAlleleCount())
.map(oldAlleleCounts::alleleIndexAt).allMatch(allelePermutation::isKept);
@@ -441,8 +437,8 @@ public static int[] subsettedPLIndices(final int ploidy, final List orig
final int[] newAlleleCounts = IntStream.range(0, newAlleles.size()).flatMap(newAlleleIndex ->
IntStream.of(newAlleleIndex, oldAlleleCounts.alleleCountFor(allelePermutation.fromIndex(newAlleleIndex)))).toArray();
- final int newPLIndex = glCalc.alleleCountsToIndex(newAlleleCounts);
- result[newPLIndex] = oldPLIndex;
+ final int newPLIndex = GenotypeIndexCalculator.alleleCountsToIndex(newAlleleCounts);
+ result[newPLIndex] = oldAlleleCounts.index();
}
}
return result;
@@ -492,39 +488,6 @@ public static int[] getIndexesOfRelevantAllelesForGVCF(final List remapp
return indexMapping;
}
- public static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position, final Genotype g) {
- Utils.nonEmpty(remappedAlleles);
- Utils.nonEmpty(targetAlleles);
-
- final int[] indexMapping = new int[targetAlleles.size()];
-
- // the reference likelihoods should always map to each other (even if the alleles don't)
- indexMapping[0] = 0;
-
- for ( int i = 1; i < targetAlleles.size(); i++ ) {
- // if there's more than 1 spanning deletion (*) allele then we need to use the best one
- if (targetAlleles.get(i) == Allele.SPAN_DEL && g.hasPL()) {
- final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL);
- if (occurrences > 1) {
- final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy());
- if (indexOfBestDel == -1) {
- throw new IllegalArgumentException("At position " + position + " targetAlleles contains a spanning deletion, but remappedAlleles does not.");
- }
- indexMapping[i] = indexOfBestDel;
- continue;
- }
- }
-
- final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
- if (indexOfRemappedAllele == -1) {
- throw new IllegalArgumentException("At position " + position + " targetAlleles contains a " + targetAlleles.get(i) + " allele, but remappedAlleles does not.");
- }
- indexMapping[i] = indexOfRemappedAllele;
- }
-
- return indexMapping;
- }
-
/**
* Returns the index of the best spanning deletion allele based on AD counts
*
@@ -539,7 +502,8 @@ private static int indexOfBestDel(final List alleles, final int[] PLs, f
for ( int i = 0; i < alleles.size(); i++ ) {
if ( alleles.get(i) == Allele.SPAN_DEL ) {
- final int homAltIndex = findHomIndex(GL_CALCS.getInstance(ploidy, alleles.size()), i, ploidy);
+ //In the canonical order, the homozygous genotype of the ith allele is immediately followed by the first genotype containing the (i+1)th allele.
+ final int homAltIndex = (int) GenotypeIndexCalculator.indexOfFirstGenotypeWithAllele(ploidy, i +1) - 1;
final int PL = PLs[homAltIndex];
if ( PL < bestPL ) {
bestIndex = i;
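// Worked check, illustrative and not part of the patch: for a diploid with alleles A, B, C the canonical
// order is AA, AB, BB, AC, BC, CC, so for i = 1 (allele B) the expression above gives
// indexOfFirstGenotypeWithAllele(2, 2) - 1 = C(3, 1) - 1 = 2, which is indeed the index of BB.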
@@ -551,25 +515,6 @@ private static int indexOfBestDel(final List alleles, final int[] PLs, f
return bestIndex;
}
- /** //TODO simplify these methods
- * Returns the index of the PL that represents the homozygous genotype of the given i'th allele
- *
- * @param i the index of the allele with the list of alleles
- * @param ploidy the ploidy of the sample
- * @return the hom index
- */
- private static int findHomIndex(final GenotypeLikelihoodCalculator calculator, final int i, final int ploidy) {
- // some quick optimizations for the common case
- if ( ploidy == 2 )
- return GenotypeLikelihoods.calculatePLindex(i, i);
- if ( ploidy == 1 )
- return i;
-
- final int[] alleleIndexes = new int[ploidy];
- Arrays.fill(alleleIndexes, i);
- return calculator.allelesToIndex(alleleIndexes);
- }
-
/**
* Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current
* alleles from the original AD.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/DRAGENGenotypesModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/DRAGENGenotypesModel.java
index c64f926aa5c..718f0b2662f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/DRAGENGenotypesModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/DRAGENGenotypesModel.java
@@ -37,10 +37,6 @@ public class DRAGENGenotypesModel implements GenotypingModel {
public static final double FLAT_SNP_HET_PRIOR = 34.77;
public static final double BQD_HOMOPOLYMER_PHRED_ADJUSTMENT_FACTOR = 5.0;
- private final int cacheAlleleCountCapacity;
- private final int cachePloidyCapacity;
- private GenotypeLikelihoodCalculatorDRAGEN[][] likelihoodCalculators;
- private final GenotypeLikelihoodCalculators calculators;
private final boolean computeBQD;
private final boolean computeFRD;
private final int allelePadding;
@@ -58,10 +54,6 @@ public DRAGENGenotypesModel(final boolean useBQDModel, final boolean useFRDModel
public DRAGENGenotypesModel(final int calculatorCachePloidyCapacity, final int calculatorCacheAlleleCapacity,
final boolean useBQDModel, final boolean useFRDModel, final int allelePadding,
final int maxEffectiveDepthAdjustment, final DragstrParams dragstrParams) {
- cachePloidyCapacity = calculatorCachePloidyCapacity;
- cacheAlleleCountCapacity = calculatorCacheAlleleCapacity;
- likelihoodCalculators = new GenotypeLikelihoodCalculatorDRAGEN[calculatorCachePloidyCapacity][calculatorCacheAlleleCapacity];
- calculators = new GenotypeLikelihoodCalculators();
this.computeBQD = useBQDModel;
this.computeFRD = useFRDModel;
this.allelePadding = allelePadding;
@@ -105,7 +97,6 @@ public GenotypingLikelihoods calculateLikelihoods(final Al
final int alleleCount = genotypingAlleles.numberOfAlleles();
final int variantOffset = data.readLikelihoods().getVariantCallingSubsetApplied().getStart() + allelePadding;
- GenotypeLikelihoodCalculatorDRAGEN likelihoodsCalculator = getLikelihoodsCalculator(ploidyModel.samplePloidy(0), alleleCount); //TODO this needs to change
for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) {
///////////////////////////////////////////////////////////////////////////
@@ -139,14 +130,9 @@ public GenotypingLikelihoods calculateLikelihoods(final Al
// Compute default likelihoods as normal (before we go ahead and alter the likelihoods for the call)
final int samplePloidy = ploidyModel.samplePloidy(sampleIndex);
- // get a new likelihoodsCalculator if this sample's ploidy differs from the previous sample's
- if (samplePloidy != likelihoodsCalculator.ploidy()) {
- likelihoodsCalculator = getLikelihoodsCalculator(samplePloidy, alleleCount);
- }
-
// this is the data array for the read likelihoods without any trouble
final LikelihoodMatrix sampleLikelihoods = alleleLikelihoodMatrixMapper.mapAlleles(data.readLikelihoods().sampleMatrix(sampleIndex));
- final double[] ploidyModelGenotypeLikelihoods = likelihoodsCalculator.rawGenotypeLikelihoods(sampleLikelihoods);
+ final double[] ploidyModelGenotypeLikelihoods = GenotypeLikelihoodCalculator.computeLog10GenotypeLikelihoods(samplePloidy, sampleLikelihoods);
if (HaplotypeCallerGenotypingDebugger.isEnabled()) {
HaplotypeCallerGenotypingDebugger.println("\n Standard Genotyping Likelihoods Results:");
@@ -155,14 +141,14 @@ public GenotypingLikelihoods calculateLikelihoods(final Al
if (computeBQD) {
applyLikelihoodsAdjusmentToBaseline(ploidyModelGenotypeLikelihoods, "BQD",
- likelihoodsCalculator.calculateBQDLikelihoods(sampleLikelihoods, strandForward, strandReverse,
- paddedReference, offsetForRefIntoEvent, calculators));
+ GenotypeLikelihoodCalculatorDRAGEN.calculateBQDLikelihoods(samplePloidy, sampleLikelihoods, strandForward, strandReverse,
+ paddedReference, offsetForRefIntoEvent));
}
if (computeFRD) {
applyLikelihoodsAdjusmentToBaseline(ploidyModelGenotypeLikelihoods, "FRD",
- likelihoodsCalculator.calculateFRDLikelihoods(sampleLikelihoods, ploidyModelGenotypeLikelihoods,
+ GenotypeLikelihoodCalculatorDRAGEN.calculateFRDLikelihoods(samplePloidy, sampleLikelihoods, ploidyModelGenotypeLikelihoods,
Stream.of(strandForward, strandReverse).flatMap(Collection::stream).collect(Collectors.toList()), // We filter out the HMM filtered reads as they do not apply to FRD
- FLAT_SNP_HET_PRIOR, api, maxEffectiveDepthAdjustment, calculators));
+ FLAT_SNP_HET_PRIOR, api, maxEffectiveDepthAdjustment));
}
// this is what the work actually is, after we have computed a few things
@@ -187,20 +173,6 @@ private void applyLikelihoodsAdjusmentToBaseline(final double[] initialLikelihoo
}
- private GenotypeLikelihoodCalculatorDRAGEN getLikelihoodsCalculator(final int samplePloidy, final int alleleCount) {
- if (samplePloidy >= cachePloidyCapacity || alleleCount >= cacheAlleleCountCapacity) {
- return calculators.getInstanceDRAGEN(samplePloidy, alleleCount);
- }
- final GenotypeLikelihoodCalculatorDRAGEN cachedResult = likelihoodCalculators[samplePloidy][alleleCount];
- if (cachedResult != null) {
- return cachedResult;
- } else {
- final GenotypeLikelihoodCalculatorDRAGEN newOne = calculators.getInstanceDRAGEN(samplePloidy, alleleCount);
- likelihoodCalculators[samplePloidy][alleleCount] = newOne;
- return newOne;
- }
- }
-
/**
* This helper class is used to store the necessary data in order to sort a read based on its BQD "feather end" as
* well as information relevant to re-associate the read with its position in the AlleleLikelihoods object arrays.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java
index aa485933da0..61a26fee7fc 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAlleleCounts.java
@@ -2,6 +2,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
+import org.apache.commons.math3.util.CombinatoricsUtils;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
@@ -10,6 +11,7 @@
import java.util.Arrays;
import java.util.Collections;
+import java.util.Iterator;
import java.util.List;
import java.util.function.IntConsumer;
import java.util.stream.Collectors;
@@ -18,6 +20,9 @@
/**
* Collection of allele counts for a genotype. It encompasses what alleles are present in the genotype and in what number.
*
+ * Also, it stores its index within the canonical ordering of genotypes and can efficiently generate the next genotype in that order, which is used
+ * to iterate over all genotypes of a given ploidy and allele count.
+ *
 * Alleles are represented herein by their indices running from 0 to N-1 where N is the number of alleles.
 *
 * Genotypes are represented as a single array of alternating alleles and counts, where only alleles with non-zero counts are included:
@@ -52,18 +57,18 @@
 *     2     0/1/1
 *     3     1/1/1
 *     4     0/0/2
- *     6     0/1/2
- *     7     1/1/2
- *     8     0/2/2
- *     9     1/2/2
- *     10    2/2/2
- *     11    0/0/3
- *     12    0/1/3
- *     13    1/1/3
- *     14    0/2/3
- *     15    1/2/3
- *     16    2/2/3
- *     17    0/3/3
+ *     5     0/1/2
+ *     6     1/1/2
+ *     7     0/2/2
+ *     8     1/2/2
+ *     9     2/2/2
+ *     10    0/0/3
+ *     11    0/1/3
+ *     12    1/1/3
+ *     13    0/2/3
+ *     14    1/2/3
+ *     15    2/2/3
+ *     16    0/3/3
 *     ...   ...
 *
 *
@@ -79,7 +84,7 @@ public final class GenotypeAlleleCounts implements Comparable<GenotypeAlleleCounts> {
+ public static Iterator<GenotypeAlleleCounts> iterator(final int ploidy, final int alleleCount) {
+ return new Iterator<GenotypeAlleleCounts>() {
+ private int index = 0;
+ private int numGenotypes = GenotypeIndexCalculator.genotypeCount(ploidy, alleleCount);
+ private GenotypeAlleleCounts alleleCounts = first(ploidy);
+
+ @Override
+ public boolean hasNext() {
+ return index < numGenotypes;
+ }
+
+ @Override
+ public GenotypeAlleleCounts next() {
+ if (index++ > 0) {
+ alleleCounts.increase();
+ }
+ return alleleCounts;
+ }
+ };
+ }
+
+ /**
+ * Iterate over all GenotypeAlleleCounts for a given ploidy and allele count in the canonical order.
+ *
+ * This is the preferred way to access all GenotypeAlleleCounts in sequence, such as when computing genotype likelihoods.
+ * Thanks to the efficiency of the increase() method this iteration is extremely fast.
+ */
+ public static Iterable<GenotypeAlleleCounts> iterable(final int ploidy, final int alleleCount) {
+ return new Iterable<GenotypeAlleleCounts>() {
+ private final int p = ploidy;
+ private final int a = alleleCount;
+
+ @Override
+ public Iterator<GenotypeAlleleCounts> iterator() {
+ return GenotypeAlleleCounts.iterator(p,a);
+ }
+ };
+ }
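// Illustrative usage sketch, not part of the patch (printGenotypeIndices is a hypothetical helper):
// iterate all genotypes of a given ploidy and allele count in canonical order via the new iterable().
// Note the iterator advances and returns the same mutable object, so make a copy if a genotype must be retained.
private static void printGenotypeIndices(final int ploidy, final int alleleCount) {
    for (final GenotypeAlleleCounts gac : GenotypeAlleleCounts.iterable(ploidy, alleleCount)) {
        System.out.println(gac.index() + ": " + gac);
    }
}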
+
/**
* Increases the allele counts a number of times.
*
*
* This method must not be invoked on cached genotype-allele-counts that are meant to remain constant,
- * such as the ones contained in {@link GenotypeLikelihoodCalculators#genotypeTableByPloidy}.
+ * such as the ones contained in {@link GenotypesCache#genotypeTableByPloidy}.
*
*
* @param times the number of times to increase.
@@ -162,17 +206,17 @@ protected void increase(final int times) {
}
/**
- * Updates the genotype counts to match the next genotype according to the canonical ordering of PLs.
+ * Returns the next genotype allele counts object in the canonical ordering of genotypes.
*
*
* This method must not be invoked on cached genotype-allele-counts that are meant to remain constant,
- * such as the ones contained in {@link GenotypeLikelihoodCalculators#genotypeTableByPloidy}
+ * such as the ones contained in {@link GenotypesCache#genotypeTableByPloidy}
*
*/
- protected void increase() {
+ protected GenotypeAlleleCounts increase() {
// if the ploidy is zero there is only one possible genotype.
if (distinctAlleleCount == 0) {
- return;
+ return this;
}
// Worth make this case faster.
@@ -239,6 +283,7 @@ protected void increase() {
}
index++;
log10CombinationCount = -1;
+ return this;
}
/**
@@ -303,17 +348,15 @@ public int distinctAlleleCount() {
}
/**
- * Gets the log10 combination count, computing it if uninitialized. Note that the invoked MathUtils method uses fast cached
- * log10 values of integers for any reasonable ploidy.
+ * Gets the log10 combination count, computing it if uninitialized.
*
- * This method should be invoked on instances of {@link GenotypeAlleleCounts} cached in {@link GenotypeLikelihoodCalculators::genotypeTableByPloidy}.
+ * This method should be invoked on instances of {@link GenotypeAlleleCounts} cached in {@link GenotypesCache}.
* Such usage allows the result of this computation to be cached once for an entire run of HaplotypeCaller.
- * @return
*/
public double log10CombinationCount() {
if (log10CombinationCount == UNCOMPUTED_LOG_10_COMBINATION_COUNT) {
- log10CombinationCount = MathUtils.log10Factorial(ploidy)
- - new IndexRange(0, distinctAlleleCount).sum(n -> MathUtils.log10Factorial(sortedAlleleCounts[2*n+1]));
+ log10CombinationCount = MathUtils.logToLog10(CombinatoricsUtils.factorialLog(ploidy)
+ - new IndexRange(0, distinctAlleleCount).sum(n -> CombinatoricsUtils.factorialLog(sortedAlleleCounts[2*n+1])));
}
return log10CombinationCount;
}
@@ -512,90 +555,6 @@ public int alleleCountFor(final int index) {
return rank < 0 ? 0 : alleleCountAt(rank);
}
- /**
- * Returns the allele counts for each allele index to maximum.
- * @param maximumAlleleIndex the maximum allele index required.
- * @throws IllegalArgumentException if {@code maximumAlleleIndex} is less than 0.
- * @return never {@code null}, an array of exactly {@code maximumAlleleIndex + 1} positions with the counts
- * of each allele where the position in the array is equal to its index.
- */
- public int[] alleleCountsByIndex(final int maximumAlleleIndex) {
- Utils.validateArg(maximumAlleleIndex >= 0, "the requested allele count cannot be less than 0");
- final int[] result = new int[maximumAlleleIndex + 1];
- copyAlleleCountsByIndex(result, 0, 0, maximumAlleleIndex);
- return result;
- }
-
-
- private void copyAlleleCountsByIndex(final int[] dest, final int offset, final int minimumAlleleIndex, final int maximumAlleleIndex) {
-
- // First we determine what section of the sortedAlleleCounts array contains the counts of interest,
- // By the present allele rank range of interest.
- final int minimumAlleleRank = alleleRankFor(minimumAlleleIndex);
- final int maximumAlleleRank = alleleRankFor(maximumAlleleIndex);
-
- // If the min or max allele index are absent (returned rank < 0) we note where the would be inserted; that
- // way we avoid going through the rest of positions in the sortedAlleleCounts array.
- // The range of interest is then [startRank,endRank].
- final int startRank = minimumAlleleRank < 0 ? - minimumAlleleRank - 1 : minimumAlleleRank;
- final int endRank = maximumAlleleRank < 0 ? - maximumAlleleRank - 2 : maximumAlleleRank;
-
- // Iteration variables:
- int nextIndex = minimumAlleleIndex; // next index that we want to output the count for.
- int nextRank = startRank; // next rank to query in sortedAlleleCounts.
- int nextSortedAlleleCountsOffset = nextRank << 1; // offset in sortedAlleleCounts where the info is present for the next rank.
- int nextDestOffset = offset; // next offset in destination array where to set the count for the nextIndex.
-
- while (nextRank++ <= endRank) {
- final int alleleIndex = sortedAlleleCounts[nextSortedAlleleCountsOffset++];
- // fill non-present allele counts with 0s.
- while (alleleIndex > nextIndex) {
- dest[nextDestOffset++] = 0;
- nextIndex++;
- }
- // It is guaranteed that at this point alleleIndex == nextIndex
- // thanks to the condition of the enclosing while: there must be at least one index of interest that
- // is present in the remaining (nextRank,endRank] interval as otherwise endRank would be less than nextRank.
- dest[nextDestOffset++] = sortedAlleleCounts[nextSortedAlleleCountsOffset++];
- nextIndex++;
- }
- // Finally we take care of trailing requested allele indices.
- while (nextIndex++ <= maximumAlleleIndex) {
- dest[nextDestOffset++] = 0;
- }
- }
-
- /**
- * Copies the sorted allele counts into an array.
- *
- *
- * Sorted allele counts are disposed as an even-sized array where even positions indicate the allele index and
- * the following odd positions the number of copies of that allele in this genotype allele count:
- *
- * With {@code offset} you can indicate an alternative first position in the destination array.
- *
- *
- * @param dest where to copy the counts.
- * @param offset starting position.
- *
- * @throws IllegalArgumentException if {@code dest} is {@code null}, {@code offset} is less than 0
- * or {@code dest} is not large enough considering the number of alleles present in this genotype
- * allele counts and the {@code offset} provided. A total of
- * {@link #distinctAlleleCount()} * 2 positions
- * are required for the job.
- */
- public void copyAlleleCounts(final int[] dest, final int offset) {
- Utils.nonNull(dest, "the destination cannot be null");
- Utils.validateArg(offset >= 0, "the offset cannot be negative");
- final int sortedAlleleCountsLength = distinctAlleleCount << 1;
- Utils.validateArg(offset + sortedAlleleCountsLength <= dest.length, "the input array does not have enough capacity");
- System.arraycopy(sortedAlleleCounts, 0, dest, offset, sortedAlleleCountsLength);
- }
/**
* Instantiates the first genotype possible provided a total ploidy.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeIndexCalculator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeIndexCalculator.java
new file mode 100644
index 00000000000..07845ef52ab
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeIndexCalculator.java
@@ -0,0 +1,206 @@
+package org.broadinstitute.hellbender.tools.walkers.genotyper;
+
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.math3.util.CombinatoricsUtils;
+import org.apache.commons.math3.util.FastMath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.IndexRange;
+import org.broadinstitute.hellbender.utils.MathUtils;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.Arrays;
+
+/**
+ * Utilities class for calculations involving the canonical enumeration of (unphased) genotypes.
+ *
+ * For diploid genotypes with alleles A, B, C. . . this ordering is AA, AB, BB, AC, BC, CC. . .
+ *
+ * For triploid genotypes it is AAA, AAB, ABB, BBB, AAC, ABC, BBC, ACC, BCC, CCC. . .
+ *
+ * Note that we may define the ordering recursively. Letting g = {g_1,g_2,g_3, . . ., g_N} and h = {h_1,h_2,h_3, . . ., h_N}
+ * be genotypes comprising alleles g_1,g_2,g_3, . . . , g_N and h_1,h_2,h_3, . . ., h_N, respectively:
+ * (i) the order of haploid genotypes is simply the allele ordering
+ * (ii) if g_N < h_N then g < h
+ * (iii) if g_N = h_N then the order is that of the first N-1 alleles
+ *
+ * Note also that whenever possible it is best to traverse all genotypes in the canonical order without the random index calculations
+ * provided here. However, when subsetting, reordering, merging, and adding alleles it is necessary to translate indices from
+ * one basis of alleles to another. In such cases efficient index calculations are important.
+ */
+public class GenotypeIndexCalculator {
+
+ private GenotypeIndexCalculator() {}
+
+ /**
+ * How many genotypes with given ploidy appear in the standard order before a given allele is reached.
+ *
+ * For example, considering alleles A, B, C, D, etc ... (indexed 0, 1, 2, ... respectively):
+ * f(3,A) = f(3,0) = 0 as the first genotype AAA contains A.
+ * f(3,B) = f(3,1) = 1 as the second genotype AAB contains B.
+ * f(3,C) = f(3,2) = 4 as the first genotype that contains C, AAC follows: AAA AAB ABB BBB
+ * f(4,D) = f(4,3) = 15 as AAAD follows AAAA AAAB AABB ABBB BBBB AAAC AABC ABBC BBBC AACC ABCC BBCC ACCC BCCC CCCC
+ *
+ * There is a simple closed-form expression for this. Any genotype with ploidy p and a alleles can be encoded
+ * by p 'x's and a - 1 '/'s, where each x represents one allele count and each slash divides between consecutive alleles.
+ * For example, with p = 3 and a = 3 we have xxx// representing AAA, //xxx representing CCC, x/x/x representing ABC,
+ * and xx//x representing AAC. It is easy to see that any such string corresponds to a genotype, and the number of such
+ * strings is given by the number of places to put the a-1 slashes within the p+a-1 total characters, which is
+ * simply the binomial coefficient (p+a-1)C(a-1). Considering that allele indices are zero-based, we also have
+ * f(p,a) = (p+a-1)C(a-1).
+ *
+ * See discussion at https://genome.sph.umich.edu/wiki/Relationship_between_Ploidy,_Alleles_and_Genotypes
+ */
+ public static long indexOfFirstGenotypeWithAllele(final int ploidy, final int allele) {
+ return allele == 0 ? 0 : CombinatoricsUtils.binomialCoefficient(ploidy + allele - 1, allele - 1);
+ }
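// Worked check of the javadoc examples above (illustrative, not part of the patch), using the stars-and-bars formula:
//   f(3, 2) = C(3 + 2 - 1, 2 - 1) = C(4, 1) = 4    -> AAC is preceded by AAA, AAB, ABB, BBB
//   f(4, 3) = C(4 + 3 - 1, 3 - 1) = C(6, 2) = 15   -> AAAD is preceded by the 15 ploidy-4 genotypes over A, B, C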
+
+ /**
+ * Returns the number of possible genotypes given the ploidy and number of different alleles.
+ * @param ploidy the requested ploidy.
+ * @param alleleCount the requested number of alleles.
+ *
+ * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative or
+ * the number of genotypes is too large (more than {@link Integer#MAX_VALUE}).
+ *
+ * @return the number of genotypes given ploidy and allele count (0 or greater).
+ */
+ public static int genotypeCount(final int ploidy, final int alleleCount) {
+ final long result = indexOfFirstGenotypeWithAllele(ploidy, alleleCount);
+ Utils.validateArg(result != MathUtils.LONG_OVERFLOW && result < Integer.MAX_VALUE, () ->
+ String.format("the number of genotypes is too large for ploidy %d and %d alleles: approx. %.0f", ploidy, alleleCount,
+ CombinatoricsUtils.binomialCoefficientDouble(ploidy + alleleCount - 1, alleleCount - 1)));
+ return (int) result;
+ }
+
+ /**
+ * Given a list of alleles, returns the likelihood array index.
+ *
+ * @param alleles the indices of the alleles in the genotype, there should be as many repetition of an
+ * index as copies of that allele in the genotype. Allele indices do not need to be sorted in
+ * any particular way. For example, {A,A,B}, {A,B,A}, {B,A,A} are all valid inputs.
+ *
+ * @return never {@code null}.
+ */
+ public static int allelesToIndex(final int... alleles) {
+ final int ploidy = alleles.length;
+ return ploidy == 0 ? 0 : calculateIndex(Arrays.copyOf(alleles, ploidy));
+ }
+
+ /**
+ * Returns the genotype index given the allele counts in format (allele1, count1, allele2, count2. . . )
+ *
+ * @param alleleCountArray the query allele counts.
+ *
+ * @throws IllegalArgumentException if {@code alleleCountArray} is null, has odd length, contains negative counts,
+ * or has a total allele count different from the ploidy.
+ */
+ public static int alleleCountsToIndex(final int ... alleleCountArray) {
+ Utils.nonNull(alleleCountArray, "the allele counts cannot be null");
+ Utils.validateArg((alleleCountArray.length & 1) == 0, "the allele counts array cannot have odd length");
+ int ploidy = 0;
+ for (int i = 0; i < alleleCountArray.length; i += 2) {
+ ploidy += alleleCountArray[i+1];
+ }
+ final int[] alleleContainer = new int[ploidy];
+
+
+ int n = 0;
+ for (int i = 0; i < alleleCountArray.length; i += 2) {
+ final int allele = alleleCountArray[i];
+ final int count = alleleCountArray[i+1];
+ Utils.validateArg(count >= 0, "no allele count can be less than 0");
+ for (int j = 0; j < count; j++, n++) {
+ alleleContainer[n] = allele;
+ }
+ }
+ return calculateIndex(alleleContainer);
+ }
+
+ /**
+ * Calculate the "old" genotype index for the ploidy and allele count of this instance given a GenotypeAlleleCounts
+ * object in some new basis of alleles and a int -> int map (in the form of an array) to translate from new allele
+ * indices to the "old" allele indices of this instance.
+ */
+ public static int alleleCountsToIndex(final GenotypeAlleleCounts newGAC, final int[] newToOldAlleleMap) {
+ final int[] alleleContainer = new int[newGAC.ploidy()];
+ final MutableInt n = new MutableInt(0);
+ newGAC.forEachAlleleIndexAndCount((newAllele, count) -> {
+ final int oldAllele = newToOldAlleleMap[newAllele];
+ new IndexRange(0, count).forEach(k -> alleleContainer[n.getAndIncrement()] = oldAllele);
+ });
+
+ return calculateIndex(alleleContainer);
+ }
+
+ /**
+ * Example: suppose our genotype is ABC. Then the index is the sum of (1) the number of ploidy 3 genotypes before
+ * reaching C in the third position, (2) the number of ploidy 2 genotypes before reaching B in the 2nd position, and
+ * (3) the number of ploidy 1 genotypes before reaching A in the 1st position.
+ */
+ private static int calculateIndex(final int[] alleles) {
+ final int ploidy = alleles.length;
+
+ // traverse alleles from highest to lowest index
+ Arrays.sort(alleles);
+ return new IndexRange(0, ploidy).sumInt(n -> {
+ final int allele = alleles[ploidy - n - 1];
+ return (int) indexOfFirstGenotypeWithAllele(ploidy - n, allele);
+ });
+ }
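// Worked example (illustrative, not part of the patch): the triploid genotype ABC, i.e. sorted alleles {0, 1, 2}.
//   allele 2 in the last slot contributes indexOfFirstGenotypeWithAllele(3, 2) = C(4, 1) = 4
//   allele 1 in the middle slot contributes indexOfFirstGenotypeWithAllele(2, 1) = C(2, 0) = 1
//   allele 0 in the first slot contributes indexOfFirstGenotypeWithAllele(1, 0) = 0
// for a total index of 5, matching entry 5 (0/1/2) in the canonical-order table in GenotypeAlleleCounts.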
+
+ /**
+ * Compute the maximally acceptable allele count (ref allele included) given the maximally acceptable genotype count.
+ * @param ploidy sample ploidy
+ * @param maxGenotypeCount maximum number of genotype count used to calculate upper bound on number of alleles given ploidy
+ * @throws IllegalArgumentException if {@code ploidy} or {@code alleleCount} is negative.
+ * @return the maximally acceptable allele count given ploidy and maximum number of genotypes acceptable
+ */
+ public static int computeMaxAcceptableAlleleCount(final int ploidy, final int maxGenotypeCount){
+ Utils.validateArg(ploidy >= 0, () -> "negative ploidy " + ploidy);
+
+ if (ploidy == 1) {
+ return maxGenotypeCount;
+ }
+ final double logMaxGenotypeCount = Math.log(maxGenotypeCount);
+
+ // Math explanation: genotype count is determined by ${P+A-1 \choose A-1}$, this leads to constraint
+ // $\log(\frac{(P+A-1)!}{(A-1)!}) \le \log(P!G)$,
+ // where $P$ is ploidy, $A$ is allele count, and $G$ is maxGenotypeCount
+ // The upper and lower bounds of the left hand side of the constraint are $P \log(A-1+P)$ and $P \log(A)$
+ // which require $A$ to be searched in interval $[exp{\log(P!G)/P} - (P-1), exp{\log(P!G)/P}]$
+ // Denote $\exp(\log(P!G)/P)$ as $x$ in the code.
+
+ final double x = FastMath.exp((CombinatoricsUtils.factorialLog(ploidy) + logMaxGenotypeCount)/ploidy );
+ final int lower = (int)Math.floor(x) - ploidy - 1;
+ final int upper = (int)Math.ceil(x);
+ for(int a=upper; a>=lower; --a){// check one by one
+
+ final double logGTCnt = CombinatoricsUtils.binomialCoefficientLog(ploidy+a-1, a-1);
+ if(logMaxGenotypeCount >= logGTCnt) {
+ return a;
+ }
+ }
+ throw new GATKException.ShouldNeverReachHereException("This method must have implemented its search wrong.");
+ }
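// Worked example (illustrative, not part of the patch): for ploidy 2 the genotype count with A alleles is A*(A+1)/2,
// so computeMaxAcceptableAlleleCount(2, 1024) returns 44 because 44*45/2 = 990 <= 1024 while 45*46/2 = 1035 > 1024.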
+
+ /**
+ * Composes a genotype index map given an allele index recoding such that result[i] is the index of the old
+ * genotype corresponding to the ith new genotype.
+ *
+ * @param newToOldAlleleMap allele recoding such that newToOldAlleleMap[i] is the index of the old allele
+ * corresponding to the ith new allele
+ *
+ * @return never {@code null}.
+ */
+ public static int[] newToOldGenotypeMap(final int ploidy, final int[] newToOldAlleleMap) {
+ Utils.nonNull(newToOldAlleleMap);
+ final int newAlleleCount = newToOldAlleleMap.length;
+
+ final int[] result = new int[genotypeCount(ploidy, newAlleleCount)];
+ for (final GenotypeAlleleCounts newGAC : GenotypeAlleleCounts.iterable(ploidy, newAlleleCount)) {
+ result[newGAC.index()] = alleleCountsToIndex(newGAC, newToOldAlleleMap);
+ }
+
+ return result;
+ }
+}
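// Worked example (illustrative, not part of the patch): subsetting a diploid site from alleles {A, B, C}
// (old indices 0, 1, 2) down to {A, C}. With newToOldAlleleMap = {0, 2}, newToOldGenotypeMap(2, newToOldAlleleMap)
// yields {0, 3, 5}: the new genotypes AA, AC, CC map to old canonical indices 0 (AA), 3 (AC), and 5 (CC)
// in the old diploid order AA, AB, BB, AC, BC, CC.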
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java
index 66c1fd241a1..d1a99280019 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeLikelihoodCalculator.java
@@ -2,550 +2,151 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.util.FastMath;
+import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
-import java.util.Comparator;
-import java.util.PriorityQueue;
-
+import java.util.Arrays;
+
+/**
+ * This class has a single responsibility: calculating genotype likelihoods given allele likelihoods through the formula:
+ *
+ * Prob(reads | genotype) = product_{all reads} [[sum_{alleles in genotype} Prob(read | allele)]/ploidy]
+ *
+ * Note that this applies to non-somatic variant calling, where ploidy is a known integer and genotypes are given by the
+ * number of copies of each allele.
+ *
+ * COMPUTATIONAL NOTE
+ * In the multiallelic calculation we accumulate the likelihood contribution of each read one allele at a time. That is,
+ * for a genotype containing alleles A, B, C, we first fill an array with the likelihood contributions from allele A, then
+ * we make a second pass and add the contributions from allele B, then allele C. Traversing all the reads in each
+ * allele row of the likelihoods array in this manner is cache-friendly and makes an enormous difference in runtime.
+ *
+ * The difference in performance comes from the fact that we index likelihoods first by allele, then by read. Because of this,
+ * likelihoods of consecutive reads with the same allele are adjacent in memory while likelihoods of consecutive alleles with the same read
+ * are not. In the former case looking up new likelihoods almost always results in a cache hit since many reads of the same allele
+ * are loaded on the same cache page.
+ *
+ * If the cache-friendliness of this class is broken, it will show up as a severe regression in the runtime of its unit tests
+ * for larger ploidies and allele counts.
+ */
public class GenotypeLikelihoodCalculator {
- /**
- * Offset table for this calculator.
- *
- *
- * This is a shallow copy of {@link GenotypeLikelihoodCalculators#alleleFirstGenotypeOffsetByPloidy} when the calculator was created
- * thus it follows the same format as that array. Please refer to its documentation.
- *
- *
- *
You can assume that this offset table contain at least (probably more) the numbers corresponding to the allele count and ploidy for this calculator.
- * However since it might have more than that and so you must use {@link #alleleCount} and {@link #ploidy} when
- * iterating through this array rather that its length or the length of its components.
.
- */
- private final int[][] alleleFirstGenotypeOffsetByPloidy;
- /**
- * Genotype table for this calculator.
- *
- *
It is ensure that it contains all the genotypes for this calculator ploidy and allele count, maybe more. For
- * that reason you must use {@link #genotypeCount} when iterating through this array and not relay on its length.
- */
- private final GenotypeAlleleCounts[] genotypeAlleleCounts;
- /**
- * Number of genotypes given this calculator {@link #ploidy} and {@link #alleleCount}.
- */
- final int genotypeCount;
- /**
- * Number of genotyping alleles for this calculator.
- */
- final int alleleCount;
- /**
- * Ploidy for this calculator.
- */
- final int ploidy;
- /**
- * Max-heap for integers used for this calculator internally.
- */
- private final PriorityQueue alleleHeap;
- /**
- * Buffer used as a temporary container for likelihood components for genotypes stratified by reads.
- *
- *
- * It is indexed by genotype index and then by read index. The read capacity is increased as needed by calling
- * {@link #ensureReadCapacity(int) ensureReadCapacity}.
- *
- */
- final double[][] readLikelihoodsByGenotypeIndex;
- /**
- * Buffer field use as a temporal container for sorted allele counts when calculating the likelihood of a
- * read in a genotype.
- *
- * This array follows the same format as {@link GenotypeAlleleCounts#sortedAlleleCounts}. Each component in the
- * genotype takes up two positions in the array where the first indicate the allele index and the second its frequency in the
- * genotype. Only non-zero frequency alleles are represented, taking up the first positions of the array.
- *
- *
- *
- * This array is sized so that it can accommodate the maximum possible number of distinct alleles in any
- * genotype supported by the calculator, value stored in {@link #maximumDistinctAllelesInGenotype}.
- *
- */
- private final int[] genotypeAllelesAndCounts;
- /**
- * Maximum number of components (or distinct alleles) for any genotype with this calculator ploidy and allele count.
- */
- private int maximumDistinctAllelesInGenotype;
- /**
- * Cache of the last genotype-allele-count requested using {@link #genotypeAlleleCountsAt(int)}, when it
- * goes beyond the maximum genotype-allele-count static capacity. Check on that method documentation for details.
- */
- private GenotypeAlleleCounts lastOverheadCounts;
- /**
- * Buffer used as a temporary container for likelihood components for genotypes stratified by alleles, allele frequency and reads.
- *
- *
To improve performance we use a 1-dimensional array to implement a 3-dimensional one as some of those dimension
- * have typically very low depths (allele and allele frequency)
- *
- *
- * The value contained in position [a][f][r] == log10Lk(read[r] | allele[a]) + log10(f) . Exception is
- * for f == 0 whose value is undefined (in practice 0.0) and never used.
- *
- *
- *
- * It is indexed by read, then by allele and then by the number of copies of the allele. For the latter
- * there are as many entries as the ploidy of the calculator + 1 (to accommodate zero copies although is
- * never used in practice).
- *
- */
- double[] readAlleleLikelihoodByAlleleCount = null;
- /**
- * Indicates how many reads the calculator supports.
- *
- *
This figure is increased dynamically as per the
- * calculation request calling {@link #ensureReadCapacity(int) ensureReadCapacity}.
- */
- private int readCapacity = -1;
- /**
- * Buffer field use as a temporal container for component likelihoods when calculating the likelihood of a
- * read in a genotype. It is stratified by read and the allele component of the genotype likelihood... that is
- * the part of the likelihood sum that correspond to a particular allele in the genotype.
- *
- *
- * It is implemented in a 1-dimensional array since typically one of the dimensions is rather small. Its size
- * is equal to {@link #readCapacity} times {@link #maximumDistinctAllelesInGenotype}.
- *
- *
- *
- * More concretely [r][i] == log10Lk(read[r] | allele[i]) + log(freq[i]) where allele[i] is the ith allele
- * in the genotype of interest and freq[i] is the number of times it occurs in that genotype.
- *
- */
- private double[] readGenotypeLikelihoodComponents;
- public GenotypeLikelihoodCalculator(final int ploidy, final int alleleCount, final int[][] alleleFirstGenotypeOffsetByPloidy,
- final GenotypeAlleleCounts[][] genotypeTableByPloidy) {
- maximumDistinctAllelesInGenotype = Math.min(ploidy, alleleCount);
- this.alleleFirstGenotypeOffsetByPloidy = alleleFirstGenotypeOffsetByPloidy;
- genotypeAlleleCounts = genotypeTableByPloidy[ploidy];
- genotypeCount = this.alleleFirstGenotypeOffsetByPloidy[ploidy][alleleCount];
- this.alleleCount = alleleCount;
- this.ploidy = ploidy;
- alleleHeap = new PriorityQueue<>(ploidy, Comparator.naturalOrder().reversed());
- readLikelihoodsByGenotypeIndex = new double[genotypeCount][];
- genotypeAllelesAndCounts = new int[maximumDistinctAllelesInGenotype * 2];
- }
+ protected GenotypeLikelihoodCalculator() { }
/**
- * Makes sure that temporary arrays and matrices are prepared for a number of reads to process.
- * @param requestedCapacity number of reads that need to be processed.
- */
- public void ensureReadCapacity(final int requestedCapacity) {
- Utils.validateArg(requestedCapacity >= 0, "capacity may not be negative");
- if (readCapacity == -1) { // first time call.
- final int minimumCapacity = Math.max(requestedCapacity, 10); // Never go too small, 10 is the minimum.
- readAlleleLikelihoodByAlleleCount = new double[minimumCapacity * alleleCount * (ploidy+1)];
- for (int i = 0; i < genotypeCount; i++) {
- readLikelihoodsByGenotypeIndex[i] = new double[minimumCapacity];
- }
- readGenotypeLikelihoodComponents = new double[ploidy * minimumCapacity];
- readCapacity = minimumCapacity;
- } else if (readCapacity < requestedCapacity) {
- final int doubleCapacity = (requestedCapacity << 1);
- readAlleleLikelihoodByAlleleCount = new double[doubleCapacity * alleleCount * (ploidy+1)];
- for (int i = 0; i < genotypeCount; i++) {
- readLikelihoodsByGenotypeIndex[i] = new double[doubleCapacity];
- }
- readGenotypeLikelihoodComponents = new double[maximumDistinctAllelesInGenotype * doubleCapacity];
- readCapacity = doubleCapacity;
- }
- }
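The growth policy just removed (start at max(requested, 10) on first use, then allocate twice the requested capacity once outgrown) is a standard amortized-growth buffer. A self-contained sketch of the same policy, stripped of the calculator's fields (names are illustrative):

    // Illustrative sketch of the removed capacity-growth policy; not GATK code.
    final class GrowableReadBuffer {
        private static final int MINIMUM_CAPACITY = 10;
        private double[] values = new double[0];
        private int capacity = -1;

        void ensureCapacity(final int requested) {
            if (requested < 0) {
                throw new IllegalArgumentException("capacity may not be negative");
            }
            if (capacity == -1) {                  // first call: never go below the minimum
                capacity = Math.max(requested, MINIMUM_CAPACITY);
                values = new double[capacity];
            } else if (capacity < requested) {     // outgrown: allocate twice the request
                capacity = requested << 1;
                values = new double[capacity];
            }
        }
    }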
-
- /**
- * Given a list of alleles, returns the likelihood array index.
- * @param alleleIndices the indices of the alleles in the genotype; there should be as many repetitions of an
- * index as copies of that allele in the genotype. Allele indices do not need to be sorted in
- * any particular way.
- *
- * @return never {@code null}.
- */
- public int allelesToIndex(final int... alleleIndices) {
- // Special case ploidy == 0.
- if (ploidy == 0) {
- return 0;
- }
-
- alleleHeap.clear();
- for (int i = 0; i < alleleIndices.length; i++) {
- alleleHeap.add(alleleIndices[i]);
- }
- return alleleHeapToIndex();
- }
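For readers who want the mapping spelled out: under the canonical VCF/GATK genotype ordering (for ploidy 2 and three alleles: 0/0, 0/1, 1/1, 0/2, 1/2, 2/2) the index of an unordered allele list follows the usual combinatorial formula, which the removed heap-based code computes implicitly. A self-contained sketch, with values that assume that canonical ordering:

    import java.util.Arrays;

    // Standalone sketch of the canonical genotype index for an unordered allele list; not GATK code.
    final class GenotypeIndexSketch {

        // C(n, k) via the multiplicative formula; exact for the small values used here
        static long binomial(final long n, final long k) {
            long result = 1;
            for (long i = 1; i <= k; i++) {
                result = result * (n - k + i) / i;
            }
            return result;
        }

        // index = sum over sorted alleles a_k (0-based position k) of C(a_k + k, k + 1)
        static int allelesToIndex(final int... alleleIndices) {
            final int[] sorted = alleleIndices.clone();
            Arrays.sort(sorted);
            int index = 0;
            for (int k = 0; k < sorted.length; k++) {
                index += (int) binomial(sorted[k] + k, k + 1);
            }
            return index;
        }

        public static void main(final String[] args) {
            System.out.println(allelesToIndex(0, 1));   // 1: diploid 0/1 het
            System.out.println(allelesToIndex(2, 2));   // 5: diploid 2/2 hom
            System.out.println(allelesToIndex(1, 0));   // 1: order of the input does not matter
        }
    }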
-
- /**
- * Returns the number of possible genotypes given ploidy and the maximum allele index.
- * @return never {@code null}.
- */
- public int genotypeCount() {
- return genotypeCount;
- }
-
- /**
- * Returns the genotype associated to a particular likelihood index.
- *
- * If {@code index} is larger than {@link GenotypeLikelihoodCalculators#MAXIMUM_STRONG_REF_GENOTYPE_PER_PLOIDY},
- * this method will reconstruct that genotype-allele-count iteratively from the largest strongly referenced count available
- * or the last requested index genotype.
- *
+ * Calculate the log10 genotype likelihoods given the ploidy and the log10 allele likelihood matrix.
*
- *
- * Therefore if you are iterating through all genotype-allele-counts you should do so sequentially and incrementally, to
- * avoid a large efficiency drop.
- *
- * @param index query likelihood-index.
- * @return never {@code null}.
- */
- public GenotypeAlleleCounts genotypeAlleleCountsAt(final int index) {
- Utils.validateArg(index >= 0 && index < genotypeCount, () -> "invalid likelihood index: " + index + " >= " + genotypeCount
- + " (genotype count for nalleles = " + alleleCount + " and ploidy " + ploidy);
- if (index < GenotypeLikelihoodCalculators.MAXIMUM_STRONG_REF_GENOTYPE_PER_PLOIDY) {
- return genotypeAlleleCounts[index];
- } else if (lastOverheadCounts == null || lastOverheadCounts.index() > index) {
- final GenotypeAlleleCounts result = genotypeAlleleCounts[GenotypeLikelihoodCalculators.MAXIMUM_STRONG_REF_GENOTYPE_PER_PLOIDY - 1].copy();
- result.increase(index - GenotypeLikelihoodCalculators.MAXIMUM_STRONG_REF_GENOTYPE_PER_PLOIDY + 1);
- lastOverheadCounts = result;
- return result.copy();
- } else {
- lastOverheadCounts.increase(index - lastOverheadCounts.index());
- return lastOverheadCounts.copy();
- }
- }
-
- /**
- * Calculate the likelihoods given the list of alleles and the likelihood map.
+ * @param log10AlleleLikelihoods the log10 likelihood matrix, all alleles vs. all reads.
*
- * @param likelihoods the likelihood matrix all alleles vs all reads.
- *
- * @throws IllegalArgumentException if {@code alleleList} is {@code null} or {@code likelihoods} is {@code null}
+ * @throws IllegalArgumentException if {@code alleleList} is {@code null} or {@code log10AlleleLikelihoods} is {@code null}
* or the alleleList size does not match the allele-count of this calculator, or there are missing allele vs
- * read combinations in {@code likelihoods}.
- *
- * @return never {@code null}.
- */
- public GenotypeLikelihoods genotypeLikelihoods(final LikelihoodMatrix likelihoods) {
- final double[] readLikelihoodsByGenotypeIndex = getReadRawReadLikelihoodsByGenotypeIndex(likelihoods);
- return GenotypeLikelihoods.fromLog10Likelihoods(readLikelihoodsByGenotypeIndex);
- }
-
- /**
- * A helper method that actually does the matrix operations but returns the raw values.
- *
- * @return the raw array (in log10 likelihoods space) of the GL for each genotype
- */
- double[] getReadRawReadLikelihoodsByGenotypeIndex(final LikelihoodMatrix likelihoods) {
- Utils.nonNull(likelihoods);
- Utils.validateArg(likelihoods.numberOfAlleles() == alleleCount, "mismatch between allele list and alleleCount");
- final int readCount = likelihoods.evidenceCount();
- ensureReadCapacity(readCount);
-
- /// [x][y][z] = z * LnLk(Read_x | Allele_y)
- final double[] readLikelihoodComponentsByAlleleCount
- = readLikelihoodComponentsByAlleleCount(likelihoods);
- final double[][] genotypeLikelihoodByRead = genotypeLikelihoodByRead(readLikelihoodComponentsByAlleleCount,readCount);
- return genotypeLikelihoods(genotypeLikelihoodByRead, readCount);
- }
-
- /**
- * Calculates the final genotype likelihood array out of the likelihoods for each genotype per read.
- *
- * @param readLikelihoodsByGenotypeIndex [g][r] likelihoods for each genotype g and r.
- * @param readCount number of reads in the input likelihood arrays in {@code genotypeLikelihoodByRead}.
- * @return never {@code null}, one position per genotype where the i entry is the likelihood of the ith
- * genotype (0-based).
- */
- double[] genotypeLikelihoods(final double[][] readLikelihoodsByGenotypeIndex, final int readCount) {
- final double[] result = new double[genotypeCount];
- final double denominator = readCount * MathUtils.log10(ploidy);
- // instead of dividing each read likelihood by ploidy (so subtracting log10(ploidy))
- // we multiply them all and then divide by ploidy^readCount (so subtract readCount * log10(ploidy))
- for (int g = 0; g < genotypeCount; g++) {
- result[g] = MathUtils.sum(readLikelihoodsByGenotypeIndex[g], 0, readCount) - denominator;
- }
- return result;
- }
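The comment above is just the log-space form of pulling a common factor out of a product. Written out as an identity, with per-read genotype log10 likelihoods l_r, ploidy P and read count n:

    sum_{r=1..n} (l_r - log10(P))  =  (sum_{r=1..n} l_r) - n * log10(P)

so subtracting the single term readCount * log10(ploidy) at the end is equivalent to normalizing each read's contribution by the ploidy.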
-
- /**
- * Calculates the likelihood component of each read on each genotype.
- *
- * NOTE: this is not actually the read likelihood component for each genotype, it is the sum of the log read likelihood components
- * for each genotype without having been normalized by the denominator of the ploidy; that happens in the final step
+ * read combinations in {@code log10AlleleLikelihoods}.
*
- * @param readLikelihoodComponentsByAlleleCount [a][f][r] likelihood stratified by allele a, frequency in genotype f and
- * read r.
- * @param readCount number of reads in {@code readLikelihoodComponentsByAlleleCount}.
* @return never {@code null}.
*/
- protected double[][] genotypeLikelihoodByRead(final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
-
- // Here we don't use the convenience of {@link #genotypeAlleleCountsAt(int)} within the loop to spare instantiations of
- // GenotypeAlleleCounts class when we are dealing with many genotypes.
- GenotypeAlleleCounts alleleCounts = genotypeAlleleCounts[0];
-
- for (int genotypeIndex = 0; genotypeIndex < genotypeCount; genotypeIndex++) {
- final double[] readLikelihoods = this.readLikelihoodsByGenotypeIndex[genotypeIndex];
- final int componentCount = alleleCounts.distinctAlleleCount();
- switch (componentCount) {
- case 1: //
- singleComponentGenotypeLikelihoodByRead(alleleCounts, readLikelihoods, readLikelihoodComponentsByAlleleCount, readCount);
- break;
- case 2:
- twoComponentGenotypeLikelihoodByRead(alleleCounts,readLikelihoods,readLikelihoodComponentsByAlleleCount, readCount);
- break;
- default:
- manyComponentGenotypeLikelihoodByRead(alleleCounts,readLikelihoods,readLikelihoodComponentsByAlleleCount, readCount);
- }
- if (genotypeIndex < genotypeCount - 1) {
- alleleCounts = nextGenotypeAlleleCounts(alleleCounts);
+ public static GenotypeLikelihoods log10GenotypeLikelihoods(final int ploidy, final LikelihoodMatrix log10AlleleLikelihoods) {
+ final double[] log10GenotypeLikelihoods = computeLog10GenotypeLikelihoods(ploidy, log10AlleleLikelihoods);
+ return GenotypeLikelihoods.fromLog10Likelihoods(log10GenotypeLikelihoods);
+ }
+
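For orientation, the quantity computed here (and by the per-component code being removed) can be written for a genotype g with per-allele copy counts c_a summing to the ploidy P as:

    log10 P(reads | g) = sum_{reads r} log10( sum_{alleles a in g} c_a * P(r | a) / P )

The homozygous, two-allele, and multiallelic branches below are just increasingly general ways of evaluating this sum.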
+ /**
+ * Compute the genotype log10 likelihoods as an array in the canonical genotype order. That is, result[i] = log10 Pr(reads | ith genotype)
+ *
+ * @param log10AlleleLikelihoods log 10 likelihood matrix indexed by allele, then read
+ * @return the log 10 likelihood of each genotype as an array
+ */
+ protected static double[] computeLog10GenotypeLikelihoods(final int ploidy, final LikelihoodMatrix log10AlleleLikelihoods) {
+ Utils.nonNull(log10AlleleLikelihoods);
+ final int alleleCount = log10AlleleLikelihoods.numberOfAlleles();
+ final int readCount = log10AlleleLikelihoods.evidenceCount();
+
+ final double[][] log10LikelihoodsByAlleleAndRead = log10AlleleLikelihoods.asRealMatrix().getData();
+
+ final boolean triallelicGenotypesPossible = alleleCount > 2 && ploidy > 2;
+ final double[] perReadBuffer = triallelicGenotypesPossible ? new double[readCount] : null;
+
+ // non-log space log10AlleleLikelihoods for multiallelic computation requires rescaling for stability when we
+ // exponentiate away the log, and we store the scaling factor to bring back later
+ final Pair<double[][], Double> rescaledNonLogLikelihoodsAndCorrection = !triallelicGenotypesPossible ? null :
+ rescaledNonLogLikelihoods(log10AlleleLikelihoods);
+
+ final double[] result = new double[GenotypeIndexCalculator.genotypeCount(ploidy, alleleCount)];
+
+ for (final GenotypeAlleleCounts gac : GenotypeAlleleCounts.iterable(ploidy, alleleCount)) {
+ final int componentCount = gac.distinctAlleleCount();
+ final int genotypeIndex = gac.index();
+ if (componentCount == 1) {
+ // homozygous case: log P(reads|AAAAA. . .) = sum_{reads} log P(read|A)
+ final int allele = gac.alleleIndexAt(0);
+ result[genotypeIndex] = MathUtils.sum(log10LikelihoodsByAlleleAndRead[allele]);
+ } else if (componentCount == 2) {
+ // biallelic het case: log P(reads | nA copies of A, nB copies of B) = sum_{reads} (log[(nA * P(read | A) + nB * P(read | B))] -log(ploidy))
+ final double[] log10ReadLks1 = log10LikelihoodsByAlleleAndRead[gac.alleleIndexAt(0)];
+ final int count1 = gac.alleleCountAt(0);
+ final double log10Count1 = Math.log10(count1);
+ final double[] log10ReadLks2 = log10LikelihoodsByAlleleAndRead[gac.alleleIndexAt(1)];
+ final double log10Count2 = Math.log10(ploidy - count1);
+
+ // note: if you are reading the multiallelic case below and have gotten paranoid about cache efficiency,
+ // here the log10 likelihood matrix rows for *both* alleles are in the cache at once
+ result[genotypeIndex] = new IndexRange(0, readCount).sum(r -> MathUtils.approximateLog10SumLog10(log10ReadLks1[r] + log10Count1, log10ReadLks2[r] + log10Count2))
+ - readCount * Math.log10(ploidy);
+ } else {
+ // the multiallelic case is conceptually the same as the biallelic case but done in non-log space
+ // We implement in a cache-friendly way by summing nA * P(read|A) over all alleles for each read, but iterating over reads as the inner loop
+ Arrays.fill(perReadBuffer,0, readCount, 0);
+ final double[][] rescaledNonLogLikelihoods = rescaledNonLogLikelihoodsAndCorrection.getLeft();
+ final double log10Rescaling = rescaledNonLogLikelihoodsAndCorrection.getRight();
+ gac.forEachAlleleIndexAndCount((a, f) -> new IndexRange(0, readCount).forEach(r -> perReadBuffer[r] += f * rescaledNonLogLikelihoods[a][r]));
+ result[genotypeIndex] = new IndexRange(0, readCount).sum(r -> Math.log10(perReadBuffer[r])) - readCount * Math.log10(ploidy) + log10Rescaling;
}
}
- return readLikelihoodsByGenotypeIndex;
- }
-
- private GenotypeAlleleCounts nextGenotypeAlleleCounts(final GenotypeAlleleCounts alleleCounts) {
- final int index = alleleCounts.index();
- final GenotypeAlleleCounts result;
- final int cmp = index - GenotypeLikelihoodCalculators.MAXIMUM_STRONG_REF_GENOTYPE_PER_PLOIDY + 1;
- if (cmp < 0) {
- result = genotypeAlleleCounts[index + 1];
- } else if (cmp == 0) {
- result = genotypeAlleleCounts[index].copy();
- result.increase();
- } else {
- alleleCounts.increase();
- result = alleleCounts;
- }
return result;
}
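As a quick sanity check of the two-allele branch above, here is a self-contained numeric example (invented likelihood values) that computes log10 P(reads | 0/1) for a diploid sample with two reads, once directly in non-log space and once with the log10-sum-of-logs form used by the calculator; the two agree up to floating-point noise:

    // Illustrative check of the diploid 0/1 genotype likelihood formula; not GATK code.
    public final class DiploidHetExample {
        public static void main(final String[] args) {
            final int ploidy = 2;
            // invented per-read log10 likelihoods: log10 P(read r | allele a)
            final double[] log10LkRef = { -0.1, -4.0 };   // allele 0
            final double[] log10LkAlt = { -3.5, -0.2 };   // allele 1

            double direct = 0;      // sum over reads of log10[(P(r|0) + P(r|1)) / ploidy]
            double logSpace = 0;    // the same sum, evaluated in log10 space as in the calculator
            for (int r = 0; r < log10LkRef.length; r++) {
                direct += Math.log10((Math.pow(10, log10LkRef[r]) + Math.pow(10, log10LkAlt[r])) / ploidy);
                logSpace += log10SumLog10(log10LkRef[r] + Math.log10(1), log10LkAlt[r] + Math.log10(1));
            }
            logSpace -= log10LkRef.length * Math.log10(ploidy);

            System.out.printf("direct = %.6f, log-space = %.6f%n", direct, logSpace);
        }

        // exact log10(10^a + 10^b); the calculator uses a faster approximation of the same quantity
        private static double log10SumLog10(final double a, final double b) {
            final double max = Math.max(a, b);
            final double min = Math.min(a, b);
            return max + Math.log10(1 + Math.pow(10, min - max));
        }
    }

Here Math.log10(1) stands in for log10 of each allele's copy count (both 1 in a 0/1 het), mirroring the log10Count1/log10Count2 terms above.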
- /**
- * General genotype likelihood component by read calculator. It does not make any assumption in the exact
- * number of alleles present in the genotype.
- */
- private void manyComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
- final double[] likelihoodByRead,
- final double[]readLikelihoodComponentsByAlleleCount,
- final int readCount) {
-
- // First we collect the allele likelihood component for all reads and place it
- // in readGenotypeLikelihoodComponents for the final calculation per read.
- genotypeAlleleCounts.copyAlleleCounts(genotypeAllelesAndCounts,0);
- final int componentCount = genotypeAlleleCounts.distinctAlleleCount();
- final int alleleDataSize = (ploidy + 1) * readCount;
- for (int c = 0,cc = 0; c < componentCount; c++) {
- final int alleleIndex = genotypeAllelesAndCounts[cc++];
- final int alleleCount = genotypeAllelesAndCounts[cc++];
- // alleleDataOffset will point to the index of the first read likelihood for that allele and allele count.
- int alleleDataOffset = alleleDataSize * alleleIndex + alleleCount * readCount;
- for (int r = 0, readDataOffset = c; r < readCount; r++, readDataOffset += maximumDistinctAllelesInGenotype) {
- readGenotypeLikelihoodComponents[readDataOffset] = readLikelihoodComponentsByAlleleCount[alleleDataOffset++];
- }
- }
-
- // Calculate the likelihood per read.
- for (int r = 0, readDataOffset = 0; r < readCount; r++, readDataOffset += maximumDistinctAllelesInGenotype) {
- likelihoodByRead[r] = MathUtils.approximateLog10SumLog10(readGenotypeLikelihoodComponents, readDataOffset, readDataOffset + componentCount);
- }
- }
/**
- * Calculates the likelihood component by read for a given genotype allele count assuming that there are
- * exactly two alleles present in the genotype (with arbitrary non-zero counts each).
- */
- void twoComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
- final double[] likelihoodByRead,
- final double[] readLikelihoodComponentsByAlleleCount,
- final int readCount) {
- final int allele0 = genotypeAlleleCounts.alleleIndexAt(0);
- final int freq0 = genotypeAlleleCounts.alleleCountAt(0);
- final int allele1 = genotypeAlleleCounts.alleleIndexAt(1);
- final int freq1 = ploidy - freq0; // no need to get it from genotypeAlleleCounts.
- int allele0LnLkOffset = readCount * ((ploidy + 1) * allele0 + freq0);
- int allele1LnLkOffset = readCount * ((ploidy + 1) * allele1 + freq1);
- for (int r = 0; r < readCount; r++) {
- final double lnLk0 = readLikelihoodComponentsByAlleleCount[allele0LnLkOffset++];
- final double lnLk1 = readLikelihoodComponentsByAlleleCount[allele1LnLkOffset++];
- likelihoodByRead[r] = MathUtils.approximateLog10SumLog10(lnLk0, lnLk1);
- }
- }
-
- /**
- * Calculates the likelihood component by read for a given genotype allele count assuming that there are
- * exactly one allele present in the genotype.
- */
- void singleComponentGenotypeLikelihoodByRead(final GenotypeAlleleCounts genotypeAlleleCounts,
- final double[] likelihoodByRead, final double[] readLikelihoodComponentsByAlleleCount, final int readCount) {
- final int allele = genotypeAlleleCounts.alleleIndexAt(0);
- // the count of the only component must be = ploidy.
- int offset = (allele * (ploidy + 1) + ploidy) * readCount;
- for (int r = 0; r < readCount; r++) {
- likelihoodByRead[r] =
- readLikelihoodComponentsByAlleleCount[offset++];
- }
- }
-
- /**
- * Returns a 3-dimensional matrix (flattened into a single array) with the likelihood components.
- *
- * @return never {@code null}.
+ * Given an input log10 likelihoods matrix, subtract off the maximum of each read column so that each column's maximum is zero for numerical
+ * stability. (This is akin to dividing each read column by its maximum in non-log space.) Then exponentiate to enter non-log space, mutating
+ * the log10Likelihoods matrix in-place. Finally, record the sum of all log-10 subtractions, which is the total amount in log10 space
+ * that must later be added to the overall likelihood, which is a sum over all reads (product in non-log space).
+ * @param log10Likelihoods an input log-10 likelihoods matrix
*/
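The rescaling described above is the familiar max-subtraction trick for leaving log space safely. A self-contained sketch of the idea on a single read's likelihoods (invented values; the real method applies it per read across the whole matrix and returns the total correction alongside the rescaled data):

    // Illustrative max-subtraction rescaling before exponentiating; not GATK code.
    public final class RescalingExample {
        public static void main(final String[] args) {
            final double[] log10LksForOneRead = { -330.2, -331.0, -335.7 };  // one read, three alleles

            double max = Double.NEGATIVE_INFINITY;
            for (final double x : log10LksForOneRead) {
                max = Math.max(max, x);
            }

            // exponentiating the raw values would underflow to 0.0; after subtracting the maximum
            // the largest entry becomes 10^0 = 1 and the ratios between alleles are preserved
            final double[] rescaled = new double[log10LksForOneRead.length];
            for (int a = 0; a < rescaled.length; a++) {
                rescaled[a] = Math.pow(10, log10LksForOneRead[a] - max);
            }

            // the subtraction is undone later by adding max (summed over reads) back in log10 space
            System.out.printf("log10 correction to add back later: %.1f%n", max);
            System.out.println(java.util.Arrays.toString(rescaled));
        }
    }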
- private double[] readLikelihoodComponentsByAlleleCount(final LikelihoodMatrix likelihoods) {
- final int readCount = likelihoods.evidenceCount();
- final int alleleDataSize = readCount * (ploidy + 1);
+ private static Pair<double[][], Double> rescaledNonLogLikelihoods(final LikelihoodMatrix log10Likelihoods) {
+ final int alleleCount = log10Likelihoods.numberOfAlleles();
+ final double[][] log10LikelihoodsByAlleleAndRead = log10Likelihoods.asRealMatrix().getData();
- // frequency1Offset = readCount to skip the useless frequency == 0. So now we are at the start frequency == 1
- // frequency1Offset += alleleDataSize to skip to the next allele index data location (+ readCount) at each iteration.
- for (int a = 0, frequency1Offset = readCount; a < alleleCount; a++, frequency1Offset += alleleDataSize) {
- likelihoods.copyAlleleLikelihoods(a, readAlleleLikelihoodByAlleleCount, frequency1Offset);
+ final int readCount = log10Likelihoods.evidenceCount();
+ final double[] perReadMaxima = new double[readCount];
+ Arrays.fill(perReadMaxima, 0, readCount, Double.NEGATIVE_INFINITY);
- // p = 2 because the frequency == 1 we already have it.
- for (int frequency = 2, destinationOffset = frequency1Offset + readCount; frequency <= ploidy; frequency++) {
- final double log10frequency = MathUtils.log10(frequency);
- for (int r = 0, sourceOffset = frequency1Offset; r < readCount; r++) {
- readAlleleLikelihoodByAlleleCount[destinationOffset++] =
- readAlleleLikelihoodByAlleleCount[sourceOffset++] + log10frequency;
- }
+ // find the maximum log-likelihood over all alleles for each read
+ // note how we traverse by read for cache-friendliness
+ for (int a = 0; a < alleleCount; a++) {
+ for (int r = 0; r < readCount; r++) {
+ perReadMaxima[r] = FastMath.max(perReadMaxima[r], log10LikelihoodsByAlleleAndRead[a][r]);
}
}
- return readAlleleLikelihoodByAlleleCount;
- }
-
- /**
- * Returns the ploidy for this genotype likelihood calculator.
- * @return 0 or greater.
- */
- public int ploidy() {
- return ploidy;
- }
- /**
- * Returns the total number of alleles for this genotype calculator.
- * @return the number of alleles considered by this calculator.
- */
- public int alleleCount() {
- return alleleCount;
- }
-
- /**
- * Returns the likelihood index given the allele counts.
- *
- * @param alleleCountArray the query allele counts. This must follow the format returned by
- * {@link GenotypeAlleleCounts#copyAlleleCounts} with 0 offset.
- *
- * @throws IllegalArgumentException if {@code alleleCountArray} is not a valid {@code allele count array}:
- *
- * is {@code null},
- * or its length is not even,
- * or it contains any negatives,
- * or the count sum does not match the calculator ploidy,
- * or any of the alleles therein is negative or greater than the maximum allele index.
- *
- * @return 0 or greater but less than {@link #genotypeCount}.
- */
- public int alleleCountsToIndex(final int ... alleleCountArray) {
- Utils.nonNull(alleleCountArray, "the allele counts cannot be null");
- Utils.validateArg((alleleCountArray.length & 1) == 0, "the allele counts array cannot have odd length");
- alleleHeap.clear();
- for (int i = 0; i < alleleCountArray.length; i += 2) {
- final int index = alleleCountArray[i];
- final int count = alleleCountArray[i+1];
- Utils.validateArg(count >= 0, "no allele count can be less than 0");
- for (int j = 0; j < count; j++) {
- alleleHeap.add(index);
+ // subtract these maxima
+ for (int a = 0; a < alleleCount; a++) {
+ for (int r = 0; r < readCount; r++) {
+ log10LikelihoodsByAlleleAndRead[a][r] -= perReadMaxima[r];
}
}
- return alleleHeapToIndex();
- }
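For a concrete sense of the removed mapping: in a diploid calculator the pair array {0, 1, 1, 1} (one copy of allele 0, one of allele 1) maps to index 1 and {1, 2} maps to index 2, while in a triploid calculator {0, 2, 2, 1} maps to index 4, matching the canonical ordering 0/0/0, 0/0/1, 0/1/1, 1/1/1, 0/0/2, ... (these values assume the standard VCF/GATK genotype ordering noted earlier).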
- /**
- * Transforms the content of the heap into an index.
- *
- * The heap contents are flushed as a result, so it is left ready for another use.
- *