Huge simplification of genotyping likelihoods calculations -- no change in output
davidbenjamin committed May 18, 2023
1 parent b013901 commit fdff6db
Showing 89 changed files with 1,942 additions and 2,941 deletions.
@@ -1,15 +1,14 @@
 package org.broadinstitute.hellbender.tools.copynumber.models;
 
 import org.apache.commons.math3.special.Gamma;
+import org.apache.commons.math3.util.CombinatoricsUtils;
 import org.apache.commons.math3.util.FastMath;
 import org.broadinstitute.hellbender.utils.NaturalLogUtils;
 
 import java.util.List;
 import java.util.stream.IntStream;
 
 import static org.apache.commons.math3.util.FastMath.sqrt;
-import static org.broadinstitute.hellbender.utils.MathUtils.log10Factorial;
-import static org.broadinstitute.hellbender.utils.MathUtils.log10ToLog;
 
 /**
  * Contains likelihood methods for the allele-fraction model.
@@ -87,10 +86,7 @@ static double hetLogLikelihood(final AlleleFractionGlobalParameters parameters,
                 - n * log(majorFraction + minorFraction * lambda0RefMinor);
         final double refMinorLogLikelihood = logNotPi + logcRefMinor + Gamma.logGamma(rhoRefMinor) - rhoRefMinor * log(tauRefMinor);
 
-        // changing the factorial implementation below may introduce non-negligible numerical differences;
-        // note https://github.com/broadinstitute/gatk/pull/7652
-        final double outlierLogLikelihood = logPi + log10ToLog(log10Factorial(a) + log10Factorial(r) - log10Factorial(a + r + 1));
-
+        final double outlierLogLikelihood = logPi - Math.log(a + r + 1) - CombinatoricsUtils.binomialCoefficientLog(a+r,a);
         return NaturalLogUtils.logSumExp(altMinorLogLikelihood, refMinorLogLikelihood, outlierLogLikelihood);
     }
 
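Note on the simplified outlier term: it relies on the identity a! r! / (a+r+1)! = 1 / ((a+r+1) * C(a+r, a)), since (a+r+1)! = (a+r+1) * (a+r)! and C(a+r, a) = (a+r)! / (a! r!). Taking logs gives exactly the new one-liner, with no base-10 round trip. A minimal standalone check (not part of the commit; class name is made up):

import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.CombinatoricsUtils;

public class OutlierTermCheck {
    // log(n!) computed via the log-gamma function: log Gamma(n + 1)
    private static double logFactorial(final int n) {
        return Gamma.logGamma(n + 1);
    }

    public static void main(String[] args) {
        final int a = 17, r = 42;
        // old form: log(a!) + log(r!) - log((a + r + 1)!), previously computed via log10
        final double oldForm = logFactorial(a) + logFactorial(r) - logFactorial(a + r + 1);
        // new form: -log(a + r + 1) - log C(a + r, a)
        final double newForm = -Math.log(a + r + 1)
                - CombinatoricsUtils.binomialCoefficientLog(a + r, a);
        System.out.printf("old = %.12f, new = %.12f%n", oldForm, newForm);  // agree to rounding error
    }
}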
@@ -165,6 +161,6 @@ private static double biasPosteriorEffectiveBeta(final double lambda0, final dou
     }
 
     private static double log(final double x) {
-        return FastMath.log(Math.max(EPSILON, x));
+        return Math.log(Math.max(EPSILON, x));
     }
 }
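This file, like several below, swaps FastMath.log for Math.log. Both compute the natural logarithm; commons-math documents FastMath as a faster, similarly accurate alternative to Math, and Math.log is an intrinsic on HotSpot, so any disagreement is confined to the last bits of the mantissa -- consistent with the commit title's "no change in output". A quick way to see this (standalone sketch, not project code):

import org.apache.commons.math3.util.FastMath;

public class LogComparison {
    public static void main(String[] args) {
        for (final double x : new double[]{1e-9, 0.5, 1.0, Math.E, 1e9}) {
            final double jdk = Math.log(x);
            final double cm  = FastMath.log(x);
            // any disagreement shows up only in the final ulp or two
            System.out.printf("x=%g  Math.log=%.17g  FastMath.log=%.17g  diff=%g%n",
                    x, jdk, cm, jdk - cm);
        }
    }
}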
@@ -162,11 +162,11 @@ public CopyRatioState.OutlierIndicators sample(final RandomGenerator rng,
                                                    final CopyRatioSegmentedData data) {
         logger.debug("Sampling outlier indicators...");
         final double outlierUnnormalizedLogProbability =
-                FastMath.log(state.outlierProbability()) + outlierUniformLogLikelihood;
+                Math.log(state.outlierProbability()) + outlierUniformLogLikelihood;
 //        final double notOutlierUnnormalizedLogProbabilityPrefactor =
-//                FastMath.log(1. - state.outlierProbability()) - 0.5 * FastMath.log(2 * Math.PI * state.variance());
+//                Math.log(1. - state.outlierProbability()) - 0.5 * Math.log(2 * Math.PI * state.variance());
         final double notOutlierUnnormalizedLogProbabilityPrefactor =
-                FastMath.log((1. - state.outlierProbability()) / FastMath.sqrt(2 * Math.PI * state.variance()));
+                Math.log((1. - state.outlierProbability()) / FastMath.sqrt(2 * Math.PI * state.variance()));
         final List<Boolean> indicators = new ArrayList<>(data.getNumPoints());
         for (int segmentIndex = 0; segmentIndex < data.getNumSegments(); segmentIndex++) {
             final List<CopyRatioSegmentedData.IndexedCopyRatio> indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segmentIndex);
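The live line and the commented-out alternative above it are the same quantity written two ways: log((1 - p) / sqrt(2*pi*var)) = log(1 - p) - 0.5 * log(2*pi*var), i.e. the non-outlier prior times the Gaussian normalization, in log space. A standalone sketch with made-up example values:

public class PrefactorCheck {
    public static void main(String[] args) {
        final double p = 0.05;        // outlier probability (example value)
        final double variance = 0.3;  // example variance
        final double twoLogs = Math.log(1. - p) - 0.5 * Math.log(2 * Math.PI * variance);
        final double oneLog  = Math.log((1. - p) / Math.sqrt(2 * Math.PI * variance));
        System.out.println(twoLogs + " vs " + oneLog);  // equal up to floating-point rounding
    }
}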
@@ -254,6 +254,6 @@ private static double calculateHomozygousLogRatio(final AllelicCount allelicCoun
         final double betaOneMinusError = Beta.regularizedBeta(1 - genotypingBaseErrorRate, r + 1, n - r + 1);
         final double betaHom = betaError + betaAll - betaOneMinusError;
         final double betaHet = betaOneMinusError - betaError;
-        return FastMath.log(betaHom) - FastMath.log(betaHet);
+        return Math.log(betaHom) - Math.log(betaHet);
     }
 }
@@ -11,9 +11,7 @@
 import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotationData;
 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.AlleleSubsettingUtils;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
 import org.broadinstitute.hellbender.utils.GenotypeUtils;
 import org.broadinstitute.hellbender.utils.Utils;
@@ -34,7 +32,6 @@
 @SuppressWarnings({"rawtypes","unchecked"}) //TODO fix uses of untyped Comparable.
 public final class ReferenceConfidenceVariantContextMerger {
 
-    private static final GenotypeLikelihoodCalculators calculators = new GenotypeLikelihoodCalculators();
     private static VCFHeader vcfInputHeader = null;
     protected final VariantAnnotatorEngine annotatorEngine;
     private final boolean doSomaticMerge;
@@ -571,7 +568,6 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
         // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
         // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
         final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
-        final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
 
         for ( final Genotype g : vc.getGenotypes() ) {
             final String name;
@@ -584,20 +580,17 @@
             final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
             if (!doSomaticMerge) {
                 if (g.hasPL() || g.hasAD()) {
-                    int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
+                    int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
                     if (g.hasPL()) {
                         // lazy initialization of the genotype index map by ploidy.
                         final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
-                                ? calculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles, calculators) //probably horribly slow
+                                ? GenotypeIndexCalculator.newToOldGenotypeMap(ploidy, perSampleIndexesOfRelevantAlleles) //probably horribly slow
                                 : genotypeIndexMapsByPloidy[ploidy];
-                        final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
-                        genotypeBuilder.PL(PLs);
+                        genotypeBuilder.PL(generatePL(g, genotypeIndexMapByPloidy));
                     }
                     if (g.hasAD()) {
-                        final int[] AD = AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles);
-                        genotypeBuilder.AD(AD);
+                        genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
                     }
-                    // clean up low confidence hom refs for better annotations later
+                    //clean up low confidence hom refs for better annotations later
                 } else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
                     genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
                 }
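The newToOldGenotypeMap call replaces a detour through a GenotypeLikelihoodCalculator instance. The map it returns is over canonically ordered genotypes: for diploids the genotype j/k (j <= k) has PL index k(k+1)/2 + j, and each genotype over the merged target alleles is translated to the corresponding genotype over the sample's original alleles. A diploid-only sketch of what such a map contains (hypothetical stand-in; the real GenotypeIndexCalculator handles arbitrary ploidy):

import java.util.Arrays;

public class DiploidGenotypeIndexDemo {
    // canonical VCF index of the unordered diploid genotype {j, k} with j <= k
    static int index(final int j, final int k) {
        return k * (k + 1) / 2 + j;
    }

    // For each diploid genotype over the new alleles, the index of the corresponding
    // genotype over the old alleles, where newToOldAllele[n] is the old index of new allele n.
    static int[] newToOldGenotypeMap(final int[] newToOldAllele) {
        final int nNew = newToOldAllele.length;
        final int[] map = new int[nNew * (nNew + 1) / 2];
        for (int k = 0; k < nNew; k++) {
            for (int j = 0; j <= k; j++) {
                final int oldJ = newToOldAllele[j];
                final int oldK = newToOldAllele[k];
                map[index(j, k)] = index(Math.min(oldJ, oldK), Math.max(oldJ, oldK));
            }
        }
        return map;
    }

    public static void main(String[] args) {
        // target alleles 0,1,2 map to sample alleles 0,2,1 respectively:
        // new genotypes 0/0,0/1,1/1,0/2,1/2,2/2 come from old indices 0,3,5,1,4,2
        System.out.println(Arrays.toString(newToOldGenotypeMap(new int[]{0, 2, 1})));  // [0, 3, 5, 1, 4, 2]
    }
}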
@@ -277,7 +277,7 @@ private static double probability(final PileupSummary site, final double contami
     }
 
     private static double segmentLogLikelihood(final List<PileupSummary> segment, final double contamination, final double errorRate, final double minorAlleleFraction) {
-        return segment.stream().mapToDouble(site -> FastMath.log(MathUtils.sum(genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction)))).sum();
+        return segment.stream().mapToDouble(site -> Math.log(MathUtils.sum(genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction)))).sum();
     }
 
     private static double modelLogLikelihood(final List<List<PileupSummary>> segments, final double contamination, final double errorRate, final List<Double> mafs) {
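Here the genotype likelihoods are summed in linear space before the single log is taken, which is fine as long as the summed per-site likelihoods stay within double range. Had the terms been kept in log space, the standard log-sum-exp trick (what NaturalLogUtils.logSumExp supplies in the first file above) would be the equivalent, underflow-safe form. A minimal sketch of that trick for reference:

public class LogSumExpSketch {
    // log(sum_i exp(logValues[i])), computed without overflow/underflow
    // by factoring out the largest term first
    static double logSumExp(final double[] logValues) {
        double max = Double.NEGATIVE_INFINITY;
        for (final double v : logValues) {
            max = Math.max(max, v);
        }
        double sum = 0;
        for (final double v : logValues) {
            sum += Math.exp(v - max);
        }
        return max + Math.log(sum);
    }

    public static void main(String[] args) {
        // three log-likelihoods far too small to exponentiate directly
        System.out.println(logSumExp(new double[]{-745.0, -746.0, -747.0}));  // ~ -744.59
    }
}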
@@ -4,15 +4,17 @@
 import com.google.common.primitives.Doubles;
 import com.google.common.primitives.Ints;
 import htsjdk.variant.variantcontext.*;
-import htsjdk.variant.vcf.*;
+import htsjdk.variant.vcf.VCFConstants;
+import htsjdk.variant.vcf.VCFFormatHeaderLine;
+import htsjdk.variant.vcf.VCFHeader;
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
-import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
 import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
+import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
 import org.broadinstitute.hellbender.utils.MathUtils;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.collections.Permutation;
+import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
 import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
 import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
@@ -37,8 +39,6 @@ private AlleleSubsettingUtils() {} // prevent instantiation
 
     private static final OneShotLogger attributesRemovedOneShotLogger = new OneShotLogger(AlleleSubsettingUtils.class);
 
-    private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
-
     public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
                                                  final List<Allele> originalAlleles,
                                                  final List<Allele> allelesToKeep,
@@ -47,6 +47,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
         //TODO: if other usages of this method should update or remove A,R, or G length annotations then header parsing is necessary and the method below should be used
         return subsetAlleles(originalGs, defaultPloidy, originalAlleles, allelesToKeep, gpc, assignmentMethod, Collections.emptyList());
     }
+
     /**
      * Create the new GenotypesContext with the subsetted PLs and ADs
      *
@@ -399,12 +400,10 @@ static double[] calculateLikelihoodSums(final VariantContext vc, final int defau
             final double GLDiffBetweenRefAndBestVariantGenotype = Math.abs(glsVector[indexOfMostLikelyVariantGenotype] - glsVector[PL_INDEX_OF_HOM_REF]);
             final int ploidy = genotype.getPloidy() > 0 ? genotype.getPloidy() : defaultPloidy;
 
-            final int[] alleleCounts = new GenotypeLikelihoodCalculators()
-                    .getInstance(ploidy, vc.getNAlleles()).genotypeAlleleCountsAt(indexOfMostLikelyVariantGenotype)
-                    .alleleCountsByIndex(vc.getNAlleles() - 1);
+            final GenotypeAlleleCounts mostLikelyGenotypeAlleleCounts = GenotypesCache.get(ploidy, indexOfMostLikelyVariantGenotype);
 
-            for (int allele = 1; allele < alleleCounts.length; allele++) {
-                if (alleleCounts[allele] > 0) {
+            for (int allele = 1; allele < vc.getNAlleles(); allele++) {
+                if (mostLikelyGenotypeAlleleCounts.containsAllele(allele)) {
                     likelihoodSums[allele] += GLDiffBetweenRefAndBestVariantGenotype;
                 }
             }
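The rewrite asks the cached GenotypeAlleleCounts directly whether the most likely genotype contains a given allele, instead of expanding it into a dense per-allele count array first. A standalone illustration of the two styles for one diploid genotype (example values only):

import java.util.Arrays;

public class ContainsAlleleDemo {
    public static void main(String[] args) {
        // suppose the most likely genotype at a triallelic site is 1/2
        final int[] genotypeAlleles = {1, 2};
        final int nAlleles = 3;

        // old style: materialize the full allele-count array, then test counts
        final int[] alleleCounts = new int[nAlleles];
        for (final int a : genotypeAlleles) {
            alleleCounts[a]++;
        }
        System.out.println(Arrays.toString(alleleCounts));  // [0, 1, 1]
        System.out.println(alleleCounts[1] > 0);            // true

        // new style: test membership directly, as GenotypeAlleleCounts.containsAllele
        // does, skipping the intermediate array entirely
        System.out.println(Arrays.stream(genotypeAlleles).anyMatch(a -> a == 1));  // true
    }
}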
@@ -428,10 +427,7 @@ public static int[] subsettedPLIndices(final int ploidy, final List<Allele> orig
         final int[] result = new int[GenotypeLikelihoods.numLikelihoods(newAlleles.size(), ploidy)];
         final Permutation<Allele> allelePermutation = new IndexedAlleleList<>(originalAlleles).permutation(new IndexedAlleleList<>(newAlleles));
 
-        final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, originalAlleles.size());
-        for (int oldPLIndex = 0; oldPLIndex < glCalc.genotypeCount(); oldPLIndex++) {
-            final GenotypeAlleleCounts oldAlleleCounts = glCalc.genotypeAlleleCountsAt(oldPLIndex);
-
+        for (final GenotypeAlleleCounts oldAlleleCounts : GenotypeAlleleCounts.iterable(ploidy, originalAlleles.size())) {
             final boolean containsOnlyNewAlleles = IntStream.range(0, oldAlleleCounts.distinctAlleleCount())
                     .map(oldAlleleCounts::alleleIndexAt).allMatch(allelePermutation::isKept);
 
@@ -441,8 +437,8 @@
             final int[] newAlleleCounts = IntStream.range(0, newAlleles.size()).flatMap(newAlleleIndex ->
                     IntStream.of(newAlleleIndex, oldAlleleCounts.alleleCountFor(allelePermutation.fromIndex(newAlleleIndex)))).toArray();
 
-            final int newPLIndex = glCalc.alleleCountsToIndex(newAlleleCounts);
-            result[newPLIndex] = oldPLIndex;
+            final int newPLIndex = GenotypeIndexCalculator.alleleCountsToIndex(newAlleleCounts);
+            result[newPLIndex] = oldAlleleCounts.index();
         }
     }
     return result;
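subsettedPLIndices walks every genotype over the original alleles in canonical order, keeps those composed solely of retained alleles, and records which old PL slot feeds each new PL slot. A hardcoded diploid walk-through of the same algorithm (standalone sketch; the GATK version works for any ploidy via GenotypeAlleleCounts.iterable):

import java.util.Arrays;

public class SubsetPLIndicesDemo {
    public static void main(String[] args) {
        final int nOldAlleles = 3;
        final boolean[] kept = {true, false, true};   // keep alleles 0 and 2
        final int[] oldToNew = {0, -1, 1};            // new index of each kept allele

        final int[] result = new int[3];  // 3 diploid genotypes remain over 2 kept alleles
        int oldIndex = 0;
        // old diploid genotypes j/k in canonical order: 0/0, 0/1, 1/1, 0/2, 1/2, 2/2
        for (int k = 0; k < nOldAlleles; k++) {
            for (int j = 0; j <= k; j++, oldIndex++) {
                if (kept[j] && kept[k]) {
                    final int newJ = oldToNew[j];
                    final int newK = oldToNew[k];
                    final int newIndex = newK * (newK + 1) / 2 + newJ;  // canonical diploid index
                    result[newIndex] = oldIndex;
                }
            }
        }
        // new PLs {0/0, 0/1, 1/1} are taken from old PL slots {0/0, 0/2, 2/2}
        System.out.println(Arrays.toString(result));  // [0, 3, 5]
    }
}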
@@ -492,39 +488,6 @@ public static int[] getIndexesOfRelevantAllelesForGVCF(final List<Allele> remapp
         return indexMapping;
     }
 
-    public static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles, final int position, final Genotype g) {
-        Utils.nonEmpty(remappedAlleles);
-        Utils.nonEmpty(targetAlleles);
-
-        final int[] indexMapping = new int[targetAlleles.size()];
-
-        // the reference likelihoods should always map to each other (even if the alleles don't)
-        indexMapping[0] = 0;
-
-        for ( int i = 1; i < targetAlleles.size(); i++ ) {
-            // if there's more than 1 spanning deletion (*) allele then we need to use the best one
-            if (targetAlleles.get(i) == Allele.SPAN_DEL && g.hasPL()) {
-                final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL);
-                if (occurrences > 1) {
-                    final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy());
-                    if (indexOfBestDel == -1) {
-                        throw new IllegalArgumentException("At position " + position + " targetAlleles contains a spanning deletion, but remappedAlleles does not.");
-                    }
-                    indexMapping[i] = indexOfBestDel;
-                    continue;
-                }
-            }
-
-            final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
-            if (indexOfRemappedAllele == -1) {
-                throw new IllegalArgumentException("At position " + position + " targetAlleles contains a " + targetAlleles.get(i) + " allele, but remappedAlleles does not.");
-            }
-            indexMapping[i] = indexOfRemappedAllele;
-        }
-
-        return indexMapping;
-    }
-
     /**
      * Returns the index of the best spanning deletion allele based on AD counts
      *
@@ -539,7 +502,8 @@ private static int indexOfBestDel(final List<Allele> alleles, final int[] PLs, f
 
         for ( int i = 0; i < alleles.size(); i++ ) {
             if ( alleles.get(i) == Allele.SPAN_DEL ) {
-                final int homAltIndex = findHomIndex(GL_CALCS.getInstance(ploidy, alleles.size()), i, ploidy);
+                //In the canonical order, the homozygous genotype of the ith allele is immediately followed by the first genotype containing the (i+1)th allele.
+                final int homAltIndex = (int) GenotypeIndexCalculator.indexOfFirstGenotypeWithAllele(ploidy, i + 1) - 1;
                 final int PL = PLs[homAltIndex];
                 if ( PL < bestPL ) {
                     bestIndex = i;
@@ -551,25 +515,6 @@
         return bestIndex;
     }
 
-    /** //TODO simplify these methods
-     * Returns the index of the PL that represents the homozygous genotype of the given i'th allele
-     *
-     * @param i the index of the allele with the list of alleles
-     * @param ploidy the ploidy of the sample
-     * @return the hom index
-     */
-    private static int findHomIndex(final GenotypeLikelihoodCalculator calculator, final int i, final int ploidy) {
-        // some quick optimizations for the common case
-        if ( ploidy == 2 )
-            return GenotypeLikelihoods.calculatePLindex(i, i);
-        if ( ploidy == 1 )
-            return i;
-
-        final int[] alleleIndexes = new int[ploidy];
-        Arrays.fill(alleleIndexes, i);
-        return calculator.allelesToIndex(alleleIndexes);
-    }
-
     /**
      * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current
      * alleles from the original AD.
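The one-line replacement for findHomIndex works because genotypes of ploidy p drawing only on the first k alleles occupy the first C(p + k - 1, p) slots of the canonical ordering; the homozygote of allele i is the last genotype using only alleles 0..i, so it sits one slot before the first genotype containing allele i + 1. A standalone check of that invariant, with a local stand-in for the GATK method:

import org.apache.commons.math3.util.CombinatoricsUtils;

public class HomAltIndexCheck {
    // the first genotype containing allele k comes right after the
    // C(ploidy + k - 1, ploidy) genotypes built only from alleles 0..k-1
    static long indexOfFirstGenotypeWithAllele(final int ploidy, final int k) {
        return CombinatoricsUtils.binomialCoefficient(ploidy + k - 1, ploidy);
    }

    public static void main(String[] args) {
        // diploid PL order is 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, ... so hom of allele i
        // lands at i(i+3)/2, matching GenotypeLikelihoods.calculatePLindex(i, i)
        for (int i = 0; i < 4; i++) {
            final long homAltIndex = indexOfFirstGenotypeWithAllele(2, i + 1) - 1;
            System.out.printf("allele %d: hom PL index %d (expected %d)%n",
                    i, homAltIndex, i * (i + 3) / 2);
        }
        // haploid sanity check: the "hom" index of allele i is just i
        System.out.println(indexOfFirstGenotypeWithAllele(1, 3) - 1);  // 2
    }
}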