Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added toggle for selecting resource-matching strategies and miscellaneous minor fixes to new annotation-based filtering tools. #8049

Merged
merged 1 commit into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@
* ...
* -A annotation_N \
* --mode SNP \
* --resource snp-training,training=true snp-training.vcf \
* --resource snp-calibration,calibration=true snp-calibration.vcf \
* --resource:snp-training,training=true snp-training.vcf \
* --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
* --resource indel-training,training=true indel-training.vcf \
* --resource indel-calibration,calibration=true indel-calibration.vcf \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
* -O extract
* </pre>
* </p>
Expand All @@ -195,11 +195,11 @@
* ...
* -A annotation_N \
* --mode SNP \
* --resource snp-training,training=true snp-training.vcf \
* --resource snp-calibration,calibration=true snp-calibration.vcf \
* --resource:snp-training,training=true snp-training.vcf \
* --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
* --resource indel-training,training=true indel-training.vcf \
* --resource indel-calibration,calibration=true indel-calibration.vcf \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
* --maximum-number-of-unlableled-variants 1000000
* -O extract
* </pre>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;

import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
Expand All @@ -17,6 +18,7 @@
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
Expand Down Expand Up @@ -46,7 +48,7 @@
* walker, performing the operations:
*
* - nthPassApply(n = 0)
* - if variant/alleles pass filters and variant-type/overlapping-resource checks, then:
* - if variant/alleles pass filters and variant-type/resource-match checks, then:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that I just changed "overlapping" to "resource-matching" everywhere to distinguish what we're doing here from straightforward genomic overlapping.

* - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection
* - write variant/alleles with labels appended to a sites-only VCF file
* - afterNthPass(n = 0)
Expand Down Expand Up @@ -89,13 +91,18 @@ public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVarian
public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter";
public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters";
public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic";
public static final String RESOURCE_MATCHING_STRATEGY_LONG_NAME = "resource-matching-strategy";
public static final String OMIT_ALLELES_IN_HDF5_LONG_NAME = "omit-alleles-in-hdf5";
public static final String DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME = "do-not-gzip-vcf-output";

public static final String ANNOTATIONS_HDF5_SUFFIX = ".annot.hdf5";

public static final String RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING = "This site was labeled as %s according to resources";

enum ResourceMatchingStrategy {
START_POSITION, START_POSITION_AND_GIVEN_REPRESENTATION, START_POSITION_AND_MINIMAL_REPRESENTATION
}

@Argument(
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
Expand Down Expand Up @@ -144,10 +151,24 @@ public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVarian
@Argument(
fullName = DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME,
doc = "If true, do not trust that unfiltered records in the resources contain only polymorphic sites. " +
"This may increase runtime.",
"This may increase runtime if the resources are not sites-only VCFs.",
optional = true)
private boolean doNotTrustAllPolymorphic = false;


@Argument(
fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME,
doc = "The strategy to use for determining whether an input variant is present in a resource " +
"in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " +
"START_POSITION: Start positions of input and resource variants must match. " +
"START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(in their given representations) must also be non-empty. " +
"START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(after converting alleles to their minimal representations) must also be non-empty. " +
"This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " +
"in which the minimal representations of the input and resource alleles must match.",
optional = true)
private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION;
@Argument(
fullName = OMIT_ALLELES_IN_HDF5_LONG_NAME,
doc = "If true, omit alleles in output HDF5 files in order to decrease file sizes.",
Expand Down Expand Up @@ -283,7 +304,9 @@ VCFHeader constructVCFHeader(final List<String> sortedLabels) {
.collect(Collectors.toCollection(TreeSet::new));
hInfo.add(GATKVCFHeaderLines.getFilterLine(VCFConstants.PASSES_FILTERS_v4));
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
hInfo = VcfUtils.updateHeaderContigLines(hInfo, null, sequenceDictionary, true);
if (sequenceDictionary != null) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This avoids a complaint raised if the VCF is missing contig lines in the header.

hInfo = VcfUtils.updateHeaderContigLines(hInfo, referenceArguments.getReferencePath(), sequenceDictionary, true);
}
hInfo.addAll(getDefaultToolVCFHeaderLines());
return new VCFHeader(hInfo);
}
Expand All @@ -303,57 +326,72 @@ final List<Triple<List<Allele>, VariantType, TreeSet<String>>> extractVariantMet
}
if (!useASAnnotations) {
// in non-allele-specific mode, get a singleton list of the triple
// (list of alt alleles passing variant-type and overlapping-resource checks, variant type, set of labels)
// (list of alt alleles passing variant-type and resource-match checks, variant type, set of labels)
final VariantType variantType = VariantType.getVariantType(vc);
if (variantTypesToExtract.contains(variantType)) {
final TreeSet<String> overlappingResourceLabels = findOverlappingResourceLabels(vc, null, null, featureContext);
if (isExtractUnlabeled || !overlappingResourceLabels.isEmpty()) {
return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, overlappingResourceLabels));
final TreeSet<String> matchingResourceLabels = findMatchingResourceLabels(vc, null, featureContext);
if (isExtractUnlabeled || !matchingResourceLabels.isEmpty()) {
return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, matchingResourceLabels));
}
}
} else {
// in allele-specific mode, get a list containing the triples
// (singleton list of alt allele, variant type, set of labels)
// corresponding to alt alleles that pass variant-type and overlapping-resource checks
// corresponding to alt alleles that pass variant-type and resource-match checks
return vc.getAlternateAlleles().stream()
.filter(a -> !GATKVCFConstants.isSpanningDeletion(a))
.filter(a -> variantTypesToExtract.contains(VariantType.getAlleleSpecificVariantType(vc, a)))
.map(a -> Triple.of(Collections.singletonList(a), VariantType.getAlleleSpecificVariantType(vc, a),
findOverlappingResourceLabels(vc, vc.getReference(), a, featureContext)))
findMatchingResourceLabels(vc, a, featureContext)))
.filter(t -> isExtractUnlabeled || !t.getRight().isEmpty())
.collect(Collectors.toList());
}
// if variant-type and overlapping-resource checks failed, return an empty list
// if variant-type and resource-match checks failed, return an empty list
return Collections.emptyList();
}

private TreeSet<String> findOverlappingResourceLabels(final VariantContext vc,
final Allele refAllele,
final Allele altAllele,
final FeatureContext featureContext) {
final TreeSet<String> overlappingResourceLabels = new TreeSet<>();
private TreeSet<String> findMatchingResourceLabels(final VariantContext vc,
final Allele altAllele,
final FeatureContext featureContext) {
final TreeSet<String> matchingResourceLabels = new TreeSet<>();
for (final FeatureInput<VariantContext> resource : resources) {
final List<VariantContext> resourceVCs = featureContext.getValues(resource, featureContext.getInterval().getStart());
for (final VariantContext resourceVC : resourceVCs) {
if (useASAnnotations && !doAllelesMatch(refAllele, altAllele, resourceVC)) {
if (useASAnnotations && !doAllelesMatch(vc.getReference(), altAllele, resourceVC)) {
continue;
}
if (isValidVariant(vc, resourceVC, !doNotTrustAllPolymorphic)) {
if (isMatchingVariant(vc, resourceVC, !doNotTrustAllPolymorphic, resourceMatchingStrategy)) {
resource.getTagAttributes().entrySet().stream()
.filter(e -> e.getValue().equals("true"))
.map(Map.Entry::getKey)
.forEach(overlappingResourceLabels::add);
.forEach(matchingResourceLabels::add);
}
}
}
return overlappingResourceLabels;
return matchingResourceLabels;
}

private static boolean isValidVariant(final VariantContext vc,
final VariantContext resourceVC,
final boolean trustAllPolymorphic) {
return resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) &&
(trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples());
private static boolean isMatchingVariant(final VariantContext vc,
final VariantContext resourceVC,
final boolean trustAllPolymorphic,
final ResourceMatchingStrategy resourceMatchingStrategy) {
if (resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) &&
(trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples())) { // this is the check originally performed by VQSR
switch (resourceMatchingStrategy) {
case START_POSITION:
return true;
case START_POSITION_AND_GIVEN_REPRESENTATION:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if these strategies add any noticeable overhead if there are a lot of multiallelics, but I haven't noticed anything out of the ordinary on my runs so far.

// we further require that at least one alt allele is present in the resource alt alleles, but don't reconcile representations
return !Sets.intersection(Sets.newHashSet(vc.getAlternateAlleles()), Sets.newHashSet(resourceVC.getAlternateAlleles())).isEmpty();
case START_POSITION_AND_MINIMAL_REPRESENTATION:
// we further require that at least one alt allele is present in the resource alt alleles, and do reconcile representations
return vc.getAlternateAlleles().stream()
.anyMatch(altAllele -> GATKVariantContextUtils.isAlleleInList(vc.getReference(), altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles()));
default:
throw new GATKException.ShouldNeverReachHereException("Unknown ResourceMatchingStrategy.");
}
}
return false;
}

private static boolean doAllelesMatch(final Allele refAllele,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,12 @@
* -A annotation_N \
* --model-prefix model_dir \
* --mode SNP \
* --resource snp-training,training=true snp-training.vcf \
* --resource snp-calibration,calibration=true snp-calibration.vcf \
* --resource:snp-training,training=true snp-training.vcf \
* --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
* --resource indel-training,training=true indel-training.vcf \
* --resource indel-calibration,calibration=true indel-calibration.vcf \
* --resource extracted,extracted=true extract.vcf.gz \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
* --resource:extracted,extracted=true extract.vcf.gz \
* --snp-calibration-sensitivity-threshold 0.99 \
* --indel-calibration-sensitivity-threshold 0.99 \
* -O output
Expand All @@ -216,9 +216,9 @@
* -A snp_annotation_N \
* --model-prefix model_dir \
* --mode SNP \
* --resource snp-training,training=true snp-training.vcf \
* --resource snp-calibration,calibration=true snp-calibration.vcf \
* --resource extracted,extracted=true snp-extract.vcf.gz \
* --resource:snp-training,training=true snp-training.vcf \
* --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --resource:extracted,extracted=true snp-extract.vcf.gz \
* --snp-calibration-sensitivity-threshold 0.99 \
* -O intermediate-output
*
Expand All @@ -229,9 +229,9 @@
* -A indel_annotation_M \
* --model-prefix model_dir \
* --mode INDEL \
* --resource indel-training,training=true indel-training.vcf \
* --resource indel-calibration,calibration=true indel-calibration.vcf \
* --resource extracted,extracted=true indel-extract.vcf.gz \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
* --resource:extracted,extracted=true indel-extract.vcf.gz \
* --indel-calibration-sensitivity-threshold 0.99 \
* -O output
* </pre>
Expand Down