Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed a bug in pileup mode relating to number of haplotypes #8489

Merged
merged 10 commits into from
Aug 28, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -470,12 +470,21 @@ static Set<Haplotype> filterPileupHaplotypes(final Set<Haplotype> onlyNewHaploty
.filter(kmer -> kmerReadCounts.getOrDefault(kmer, new MutableInt(0)).intValue() > 0)
.count()));

// if there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best
final long minimumPassingScore = scores.values().stream().sorted(Comparator.reverseOrder()).skip(numPileupHaplotypes - 1).findFirst().get();
return onlyNewHaplotypes.stream().filter(h -> scores.get(h) >= minimumPassingScore).collect(Collectors.toSet());
// Get the minimum passing score from all haplotypes:
final long minimumPassingScore = scores.values().stream()
.sorted(Comparator.reverseOrder())
.skip(numPileupHaplotypes - 1)
.findFirst().get();

// If there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best, with
// final ordering determined by string representation (for determinism).
return onlyNewHaplotypes.stream()
.filter(h -> scores.get(h) >= minimumPassingScore)
.sorted(Comparator.comparing(Haplotype::getBaseString))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually i think you need to change this... you need Comparitor.comparing(score).thenComparing (need to check the syntax on that) because now this is shuffling the score > minPassingScore haplotypes which is probably fine most of the time but could end up not selecting the top scorign hap.

.limit(numPileupHaplotypes)
.collect(Collectors.toCollection(LinkedHashSet::new));
}


private static Set<Kmer> kmerizeSequence(byte[] sequence, int kmerSize){
final Set<Kmer> allKmers = new LinkedHashSet<>();
final int stopPosition = sequence.length - kmerSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -675,23 +675,27 @@
return; // nothing to do
}

if (debug) {
logger.info("Number of haplotypes pre-pileup injection: " + this.getHaplotypeCount());

Check warning on line 679 in src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java#L679

Added line #L679 was not covered by tests
}

final Haplotype refHaplotype = getReferenceHaplotype();
final Map<Integer, List<Event>> assembledEventByStart = getVariationEvents(argumentCollection.maxMnpDistance).stream()
.collect(Collectors.groupingBy(Event::getStart));
final Collection<Event> assembledIndels = getVariationEvents(argumentCollection.maxMnpDistance).stream().
filter(Event::isIndel).collect(Collectors.toList());
filter(Event::isIndel).toList();

Set<Haplotype> baseHaplotypes = new TreeSet<>();
baseHaplotypes.addAll(getHaplotypeList().stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(Haplotype::getScore).reversed())
.limit(AssemblyBasedCallerUtils.NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList()));
.toList());

//TODO its unclear whether the correct answer here is to use the hardclipped pileup reads (which we used in generating the pileup alleles for specificty reasons)
//TODO or if it would be more accurate to use the softclipped bases here in filtering down the haplotypes. I suspect the latter but I will evaluate later.
Map<Kmer, MutableInt> kmerReadCounts = AssemblyBasedCallerUtils.getKmerReadCounts(region.getHardClippedPileupReads(), argumentCollection.pileupDetectionArgs.filteringKmerSize);

for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).collect(Collectors.toList())) {
for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).toList()) {

if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: " + event);

Expand All @@ -705,7 +709,7 @@
continue;
}

final Set<Haplotype> newPileupHaplotypes = new HashSet<>();
final Set<Haplotype> newPileupHaplotypes = new LinkedHashSet<>();
for (final Haplotype baseHaplotype : baseHaplotypes) {
final Haplotype insertedHaplotype = makeHaplotypeWithInsertedEvent(baseHaplotype, refHaplotype, event, aligner, argumentCollection.getHaplotypeToReferenceSWParameters());
if (insertedHaplotype != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1309,28 +1309,37 @@
put(new Kmer("GAA"), 1);
}};

final List<Haplotype> bigHaplotypeList = Arrays.asList(hapA,hapB,hapB,hapB,hapB,hapB,hapC,hapD,hapF,hapF,hapF,hapF,hapF,hapF);

Check warning on line 1312 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1312

Added line #L1312 was not covered by tests

// NOTE: we limit the number of haplotypes that are returned by the `numPileupHaplotypes` parameter.

Object[][] tests = new Object[][] {
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,5,3,Arrays.asList(hapA,hapB,hapC,hapD)}, //returns all when no filtering required

// These haplotypes are all equivalent, these test stability of the filtering
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,4,3,Arrays.asList(hapA,hapB,hapC,hapD)},

Check warning on line 1323 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1320-L1323

Added lines #L1320 - L1323 were not covered by tests

// Repetitive kmers in hapF don't get double counted
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3,Arrays.asList(hapF, hapD)}, //currently repeated kmers only count as singular evidence
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF, hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3, List.of(hapD) }, //currently repeated kmers only count as singular evidence

Check warning on line 1327 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1326-L1327

Added lines #L1326 - L1327 were not covered by tests

// These tests demonstrate that the weights in the map don't matter
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapB,hapC,hapD)}, // Despite hapD having good support it is not weighted higher
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3, List.of(hapD) },
jonn-smith marked this conversation as resolved.
Show resolved Hide resolved
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapC,hapD)}, // Despite hapD having good support it is not weighted higher

Check warning on line 1332 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1330-L1332

Added lines #L1330 - L1332 were not covered by tests

// Test of the output when only one hap has support
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3,Arrays.asList(hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA, hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA)},

Check warning on line 1336 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1335-L1336

Added lines #L1335 - L1336 were not covered by tests
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,3,3,Arrays.asList(hapD,hapA,hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB,hapF)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB)},

Check warning on line 1338 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1338

Added line #L1338 was not covered by tests

// Test of when there are a LOT of haplotypes and we want to limit the number returned:
new Object[]{bigHaplotypeList, hapDKmers, 1, 3, List.of(hapD)},
new Object[]{bigHaplotypeList, hapDKmers, 2, 3, List.of(hapA, hapD)},

Check warning on line 1342 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1341-L1342

Added lines #L1341 - L1342 were not covered by tests
};

return tests;
Expand All @@ -1345,7 +1354,8 @@
final List<Haplotype> expected) {
final Map<Kmer, MutableInt> counts = kmerReadCounts.entrySet().stream()
.collect(Collectors.toMap(entry -> entry.getKey(), entry -> new MutableInt(entry.getValue())));
Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

final Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

Check warning on line 1358 in src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java

View check run for this annotation

Codecov / codecov/patch

src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java#L1358

Added line #L1358 was not covered by tests

Assert.assertEquals(actual, new HashSet<>(expected));
}
Expand Down
Loading