Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overhaul of Mutect2 filtering #5688

Merged
merged 15 commits into from
Mar 11, 2019
Merged
Binary file modified docs/mutect/mutect.pdf
Binary file not shown.
654 changes: 372 additions & 282 deletions docs/mutect/mutect.tex

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions scripts/mutect2_wdl/mutect2_nio.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,13 @@ workflow Mutect2 {
}
}

call MergeStats {
input:
stats = M2.stats,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

if (run_ob_filter && !defined(tumor_sequencing_artifact_metrics)) {
call CollectSequencingArtifactMetrics {
input:
Expand Down Expand Up @@ -372,6 +379,7 @@ workflow Mutect2 {
output_name = filtered_name,
compress = compress,
preemptible_attempts = preemptible_attempts,
mutect_stats = MergeStats.merged_stats,
max_retries = max_retries,
contamination_table = CalculateContamination.contamination_table,
maf_segments = CalculateContamination.maf_segments,
Expand Down Expand Up @@ -466,6 +474,8 @@ workflow Mutect2 {
output {
File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
File filtering_stats = Filter.filtering_stats
File mutect_stats = MergeStats.merged_stats
File? contamination_table = CalculateContamination.contamination_table

File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
Expand Down Expand Up @@ -629,6 +639,7 @@ task M2 {
-O "${output_vcf}" \
${true='--bam-output bamout.bam' false='' make_bamout} \
${"--orientation-bias-artifact-priors " + artifact_prior_table} \
--stats mutect2.stats \
${m2_extra_args}
>>>

Expand All @@ -648,6 +659,7 @@ task M2 {
File output_bamOut = "bamout.bam"
String tumor_sample = read_string("tumor_name.txt")
String normal_sample = read_string("normal_name.txt")
File stats = "mutect2.stats"
}
}

Expand Down Expand Up @@ -755,6 +767,51 @@ task MergeBamOuts {
}
}


# Merges the per-scatter Mutect2 stats files into a single stats file,
# which FilterMutectCalls consumes via its -stats argument.
task MergeStats {
# inputs
# Non-empty array of per-shard stats files emitted by the M2 task (--stats mutect2.stats).
Array[File]+ stats

# Optional local GATK jar; overrides the jar baked into the docker image.
File? gatk_override

# runtime
String gatk_docker
Int? mem
Int? preemptible_attempts
Int? max_retries
Int? disk_space
Int? cpu
Boolean use_ssd = false

# Mem is in units of GB but our command and memory runtime values are in MB
Int machine_mem = if defined(mem) then mem * 1000 else 2000
# Reserve 1 GB of headroom for the JVM's non-heap overhead.
Int command_mem = machine_mem - 1000

command {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}


# Each input file is passed as its own -stats argument via the sep interpolation.
gatk --java-options "-Xmx${command_mem}m" MergeMutectStats \
-stats ${sep=" -stats " stats} -O merged.stats

}

runtime {
docker: gatk_docker
bootDiskSizeGb: 12
memory: machine_mem + " MB"
disks: "local-disk " + select_first([disk_space, 10]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 10])
maxRetries: select_first([max_retries, 0])
cpu: select_first([cpu, 1])
}

output {
File merged_stats = "merged.stats"
}
}

task CollectSequencingArtifactMetrics {
# inputs
File ref_fasta
Expand Down Expand Up @@ -971,6 +1028,7 @@ task Filter {
Boolean compress
String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
File? mutect_stats
File? contamination_table
File? maf_segments
String? m2_extra_filtering_args
Expand Down Expand Up @@ -999,6 +1057,8 @@ task Filter {
-O ${output_vcf} \
${"--contamination-table " + contamination_table} \
${"--tumor-segmentation " + maf_segments} \
${"-stats " + mutect_stats} \
--filtering-stats filtering.stats \
${m2_extra_filtering_args}
}

Expand All @@ -1015,6 +1075,7 @@ task Filter {
output {
File filtered_vcf = "${output_vcf}"
File filtered_vcf_index = "${output_vcf_index}"
File filtering_stats = "filtering.stats"
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.engine.filters.CountingVariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.stream.StreamSupport;

/**
 * A VariantWalker that makes multiple passes through the driving variants.
 * Subclasses declare how many passes they need via {@link #numberOfPasses()},
 * may accumulate state in instance fields during early passes, and may process
 * and read back that state during later passes.
 **/
public abstract class MultiplePassVariantWalker extends VariantWalker {

    /** @return the total number of traversals to perform over the driving variants */
    protected abstract int numberOfPasses();

    /**
     * Overrides the default, single-pass traversal framework of {@link VariantWalkerBase}.
     * Note that the counting filters are shared across passes, so their summary counts
     * reflect all passes combined.
     */
    @Override
    public void traverse() {
        final CountingVariantFilter variantFilter = makeVariantFilter();
        final CountingReadFilter readFilter = makeReadFilter();

        for (int pass = 0; pass < numberOfPasses(); pass++) {
            // lambdas may only capture (effectively) final locals
            final int currentPass = pass;
            logger.info("Starting pass " + currentPass + " through the variants");
            traverseVariants(variantFilter, readFilter,
                    (variant, readsContext, refContext, featureContext) ->
                            nthPassApply(variant, readsContext, refContext, featureContext, currentPass));
            logger.info("Finished pass " + currentPass + " through the variants");

            // let the subclass digest whatever it accumulated during this pass
            afterNthPass(currentPass);
        }

        logger.info(variantFilter.getSummaryLine());
        logger.info(readFilter.getSummaryLine());
    }

    /**
     * Called once per variant on each pass. The subclass may store data in instance
     * fields and process it in {@link #afterNthPass} before the next pass begins.
     *
     * @param variant A variant record in a vcf
     * @param readsContext Reads overlapping the current variant. Will be empty if a read source (e.g. bam) isn't provided
     * @param referenceContext Reference bases spanning the current variant
     * @param featureContext A record overlapping the current variant from an auxiliary data source (e.g. gnomad)
     * @param n Which pass it is (zero-indexed)
     */
    protected abstract void nthPassApply(final VariantContext variant,
                                         final ReadsContext readsContext,
                                         final ReferenceContext referenceContext,
                                         final FeatureContext featureContext,
                                         final int n);

    /**
     * Called after each traversal completes, with the zero-indexed number of the pass
     * that just finished. Process the data collected during that pass here.
     */
    protected abstract void afterNthPass(final int n);

    private void traverseVariants(final VariantFilter variantFilter, final CountingReadFilter readFilter, final VariantConsumer consumer) {
        StreamSupport.stream(getSpliteratorForDrivingVariants(), false)
                .filter(variantFilter)
                .forEach(variant -> {
                    final SimpleInterval interval = new SimpleInterval(variant);
                    consumer.consume(variant,
                            new ReadsContext(reads, interval, readFilter),
                            new ReferenceContext(reference, interval),
                            new FeatureContext(features, interval));
                    progressMeter.update(interval);
                });
    }

    /** Internal callback type: one invocation per variant that survives filtering. */
    @FunctionalInterface
    private interface VariantConsumer {
        void consume(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext reference, final FeatureContext features);
    }

    /**
     * Make final to hide it from subclasses; per-variant work goes through
     * {@link #nthPassApply} instead.
     */
    @Override
    public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {}
}
Original file line number Diff line number Diff line change
@@ -1,94 +1,44 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.engine.filters.CountingVariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.exceptions.GATKException;

import java.util.stream.StreamSupport;

/**
* A VariantWalker that makes two passes through the variants.
* This allows the user to store internal states during the first pass, which the user can then
* process and access during the second pass
**/
public abstract class TwoPassVariantWalker extends VariantWalker {
/**
* Overrides the default, single-pass traversal framework of {@link VariantWalkerBase}
*/
public abstract class TwoPassVariantWalker extends MultiplePassVariantWalker {
@Override
public void traverse(){
final CountingVariantFilter countingVariantFilter = makeVariantFilter();
final CountingReadFilter readFilter = makeReadFilter();
final protected int numberOfPasses() { return 2; }

// First pass through the variants
logger.info("Starting first pass through the variants");
traverseVariants(countingVariantFilter, readFilter, this::firstPassApply);
logger.info("Finished first pass through the variants");

// Process the data accumulated during the first pass
afterFirstPass();
@Override
final protected void nthPassApply(final VariantContext variant,
final ReadsContext readsContext,
final ReferenceContext referenceContext,
final FeatureContext featureContext,
final int n) {
if (n == 0) {
firstPassApply(variant, readsContext, referenceContext, featureContext);
} else if (n == 1) {
secondPassApply(variant, readsContext, referenceContext, featureContext);
} else {
throw new GATKException.ShouldNeverReachHereException("This two-pass walker should never reach (zero-indexed) pass " + n);
}
}

// Second pass
logger.info("Starting second pass through the variants");
traverseVariants(countingVariantFilter, readFilter, this::secondPassApply);

logger.info(countingVariantFilter.getSummaryLine());
logger.info(readFilter.getSummaryLine());
@Override
final protected void afterNthPass(final int n) {
if (n == 0) {
afterFirstPass();
} else if (n > 1) {
throw new GATKException.ShouldNeverReachHereException("This two-pass walker should never reach (zero-indexed) pass " + n);
}
}

/**
*
* First pass through the variants. The user may store data in instance variables of the walker
* and process them in {@link #afterFirstPass} before making a second pass
*
* @param variant A variant record in a vcf
* @param readsContext Reads overlapping the current variant. Will be empty if a read source (e.g. bam) isn't provided
* @param referenceContext Reference bases spanning the current variant
* @param featureContext A record overlapping the current variant from an auxiliary data source (e.g. gnomad)
*/
protected abstract void firstPassApply(final VariantContext variant,
final ReadsContext readsContext,
final ReferenceContext referenceContext,
final FeatureContext featureContext );
protected abstract void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext);

/**
* Process the data collected during the first pass. This method is called between the two traversals
*/
protected abstract void afterFirstPass();

/**
*
* Having seen all of the variants in a vcf, make a second pass through the variants
*
* See argument docs of {@link #firstPassApply} and {@link VariantWalkerBase#apply}
**/
protected abstract void secondPassApply(final VariantContext variant,
final ReadsContext readsContext,
final ReferenceContext referenceContext,
final FeatureContext featureContext);
protected abstract void secondPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext);

private void traverseVariants(final VariantFilter variantFilter, final CountingReadFilter readFilter, final VariantConsumer variantConsumer){
StreamSupport.stream(getSpliteratorForDrivingVariants(), false)
.filter(variantFilter)
.forEach(variant -> {
final SimpleInterval variantInterval = new SimpleInterval(variant);
variantConsumer.consume(variant,
new ReadsContext(reads, variantInterval, readFilter),
new ReferenceContext(reference, variantInterval),
new FeatureContext(features, variantInterval));
progressMeter.update(variantInterval);
});
}

@FunctionalInterface
private interface VariantConsumer {
void consume(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext reference, final FeatureContext features);
}

/**
* Make final to hide it from subclasses
*/
@Override
public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ExampleProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.TwoPassVariantWalker;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.io.File;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import org.broadinstitute.hellbender.tools.exome.orientationbiasvariantfilter.OrientationBiasFilterer;
import org.broadinstitute.hellbender.tools.exome.orientationbiasvariantfilter.OrientationBiasUtils;
import org.broadinstitute.hellbender.tools.exome.orientationbiasvariantfilter.PreAdapterOrientationScorer;
import org.broadinstitute.hellbender.tools.walkers.mutect.FilterMutectCalls;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls;
import org.broadinstitute.hellbender.utils.artifacts.Transition;
import picard.analysis.artifacts.CollectSequencingArtifactMetrics;
import picard.analysis.artifacts.SequencingArtifactMetrics;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ public static Map<Path, Properties> getAndValidateDataSourcesFromPaths( final St

// Make sure we only have 1 of this data source:
if ( names.contains(configFileProperties.getProperty("name")) ) {
throw new UserException.BadInput("Error: contains more than one dataset of name: "
throw new UserException.BadInput("ERROR: contains more than one dataset of name: "
+ configFileProperties.getProperty("name") + " - one is: " + configFilePath.toUri().toString());
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
Expand All @@ -16,13 +19,13 @@
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedOnStart;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2FiltersArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.StandardAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

Expand Down
Loading