Skip to content

Commit

Permalink
Bug fix in flow allele filtering (#8775)
Browse files Browse the repository at this point in the history
* Fixed a bug that prevented filtering by SOR in many cases
  • Loading branch information
ilyasoifer authored May 2, 2024
1 parent ec39c37 commit 5c32785
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ funcotator_tmp

#Test generated dot files
test*.dot
.vscode/
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import com.google.common.annotations.VisibleForTesting;

/**
* Filtering haplotypes that contribute weak alleles to the genotyping.
*
Expand Down Expand Up @@ -278,7 +280,8 @@ private AlleleLikelihoods<GATKRead, Haplotype> subsetHaplotypesByAlleles(final A
* @param sorThreshold only variants with SOR above threshold will be considered
* @return list of alleles that can be removed
*/
private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
@VisibleForTesting
List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final List<Double> collectedSORs,
final List<Event> alleles,
final double qualThreshold,
final double sorThreshold) {
Expand All @@ -303,9 +306,11 @@ private List<Event> identifyBadAlleles(final List<Integer> collectedRPLs, final
//we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD
//we will first filter the one with the lowest QUAL.
logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount));
for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) {
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
for (int i = sorIndices.length-1 ; (i >= 0) ; i--) {
if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){
if (!result.contains(alleles.get(sorIndices[i]))) {
result.add(alleles.get(sorIndices[i]));
}
}
}
logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import com.google.common.collect.ImmutableList;
import htsjdk.samtools.TextCigarCodec;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AlleleFiltering;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AlleleFilteringHC;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.genotyper.*;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.haplotype.EventMap;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
Expand Down Expand Up @@ -298,5 +301,96 @@ public void testNotFilterLoneWeakAllele(){

}

@Test //check that we filter strong allele with high SOR
public void testFilterDistantHindelSor() {

// create haplotypes
List<Haplotype> haplotypeList = new ArrayList<>();
final byte[] fullReferenceWithPadding = "CAGGCATG".getBytes();
Haplotype haplotype = new Haplotype(fullReferenceWithPadding, true, 0, TextCigarCodec.decode("8M"));
haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 108));
haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0));
haplotypeList.add(haplotype);

haplotype = new Haplotype("CAGGCATTG".getBytes(), false, 0, TextCigarCodec.decode("7M1I1M"));
haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 109));

haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0));
haplotypeList.add(haplotype);
haplotype = new Haplotype("CAGGCATTTG".getBytes(), false, 0, TextCigarCodec.decode("7M2I1M"));
haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 110));

haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0));
haplotypeList.add(haplotype);

AlleleList<Haplotype> haplotypes = new IndexedAlleleList<>(haplotypeList);
SampleList samples = new IndexedSampleList(Arrays.asList("sm1"));

List<GATKRead> readList = new ArrayList<>(30);
Map<String, List<GATKRead>> ebs = new HashMap<>();
ebs.put("sm1", readList);

for (int i = 0; i < 40; i++) {
readList.add(ArtificialReadUtils.createArtificialRead("20M"));
}


double[][] values = {
{ 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3,
0, 3, 0, 3, 0,
3 },
{ 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0,
3, 0, 3, 0, 3,
0 },
{ 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0,0,10,0,0,0,10,0,0,0}
};
for (int i = 0; i < 40; i++) {
if (i % 4 == 0) {
readList.get(i).setIsReverseStrand(true);
}
}

AlleleLikelihoods<GATKRead, Haplotype> lks = new AlleleLikelihoods<>(samples, haplotypes, ebs);
LikelihoodMatrix<GATKRead, Haplotype> lkm = lks.sampleMatrix(0);
for (int i = 0; i < lks.numberOfAlleles(); i++) {
for (int j = 0; j < lkm.evidenceCount(); j++) {
lkm.set(i, j, values[i][j]);
}
}

HaplotypeCallerArgumentCollection hcArgs = new HaplotypeCallerArgumentCollection();
HaplotypeCallerGenotypingEngine genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samples,
!hcArgs.doNotRunPhysicalPhasing, false);

AlleleFiltering alleleFiltering = new AlleleFilteringHC(hcArgs, null, genotypingEngine);
AlleleLikelihoods<GATKRead, Haplotype> filtered_lks = alleleFiltering.filterAlleles(lks, 0, new HashSet<>());
Assert.assertEquals(filtered_lks.alleles(), haplotypeList.subList(0, 2));
}

@Test
public void testIdentifyBadAlleles(){
Event a = new Event("chr1", 10, Allele.create("A",true), Allele.create("T", false));
Event b = new Event("chr1", 10, Allele.create("T",true), Allele.create("G", false));
Event c = new Event("chr1", 10, Allele.create("C", true), Allele.create("G", false));

List<Event> events = List.of(a,b,c);
List<Integer> rpls = List.of(10,20,0);
List<Double> sors = List.of(0.0,1.0,3.5);
HaplotypeCallerGenotypingEngine ge = new HaplotypeCallerGenotypingEngine(new HaplotypeCallerArgumentCollection(),
SampleList.singletonSampleList("test"), false, false);
AlleleFiltering af = new AlleleFilteringHC(null, null,ge);
List<Event> badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3);
Assert.assertEquals(badAlleles, List.of(b, a, c));
rpls = List.of(-100, -200, 0);
sors = List.of(0.0,1.0,3.5);
badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3);
Assert.assertEquals(badAlleles, List.of(c));

rpls = List.of(-100, -200, -300);
sors = List.of(0.0,1.0,3.5);
badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3);
Assert.assertEquals(badAlleles, List.of(c));


}
}

0 comments on commit 5c32785

Please sign in to comment.