Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fasta gz support in gatk #5120

Merged
merged 3 commits into from
Sep 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -410,14 +410,14 @@ public List<ActivityProfileState> getSupportingStates() {
/**
* See #getActiveRegionReference but using the span including regions not the extended span
*/
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
public byte[] getFullReference( final ReferenceSequenceFile referenceReader ) {
return getFullReference(referenceReader, 0);
}

/**
* See #getActiveRegionReference but using the span including regions not the extended span
*/
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
public byte[] getFullReference( final ReferenceSequenceFile referenceReader, final int padding ) {
return getReference(referenceReader, padding, spanIncludingReads);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;

Expand Down Expand Up @@ -34,7 +32,7 @@ public final class ReferenceFileSource implements ReferenceDataSource {
*/
public ReferenceFileSource(final Path fastaPath) {
// Will throw a UserException if the .fai and/or .dict are missing
reference = CachingIndexedFastaSequenceFile.checkAndCreate(Utils.nonNull(fastaPath));
reference = new CachingIndexedFastaSequenceFile(Utils.nonNull(fastaPath));
}

/**
Expand All @@ -50,7 +48,7 @@ public ReferenceFileSource(final Path fastaPath) {
*/
public ReferenceFileSource(final Path fastaPath, final boolean preserveFileBases) {
// Will throw a UserException if the .fai and/or .dict are missing
reference = CachingIndexedFastaSequenceFile.checkAndCreate(Utils.nonNull(fastaPath), preserveFileBases);
reference = new CachingIndexedFastaSequenceFile(Utils.nonNull(fastaPath), preserveFileBases);
}

/**
Expand Down Expand Up @@ -96,11 +94,6 @@ public SAMSequenceDictionary getSequenceDictionary() {
*/
@Override
public void close() {
try {
reference.close();
}
catch ( IOException e ) {
throw new GATKException("Error closing reference file", e);
}
reference.close();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ public static class MissingReference extends UserException {
public static class MissingIndex extends UserException {
private static final long serialVersionUID = 0L;

public MissingIndex(String message){
super(message);
}

public MissingIndex(String file, String message) {
super(String.format("An index is required but was not found for file %s. %s", file, message));
}
Expand Down Expand Up @@ -256,6 +260,14 @@ public MissingReferenceFaiFile( final Path indexPath, final Path fastaPath ) {
}
}

public static class MissingReferenceGziFile extends UserException {
private static final long serialVersionUID = 0L;
public MissingReferenceGziFile( final Path gziPath, final Path fastaPath ) {
super(String.format("Fasta bgzip index file %s for reference %s does not exist. A gzi index can be created using bgzip.",
gziPath.toUri(), fastaPath.toUri()));
}
}

public static class MissingReferenceDictFile extends UserException {
private static final long serialVersionUID = 0L;
public MissingReferenceDictFile( final Path dictFile, final Path fastaFile ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -147,12 +149,8 @@ public static SimpleInterval getPaddedReferenceLoc(final AssemblyRegion region,
}

public static CachingIndexedFastaSequenceFile createReferenceReader(final String reference) {
try {
// fasta reference reader to supplement the edges of the reference sequence
return new CachingIndexedFastaSequenceFile(IOUtils.getPath(reference));
} catch( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(IOUtils.getPath(reference), e);
}
// fasta reference reader to supplement the edges of the reference sequence
return new CachingIndexedFastaSequenceFile(IOUtils.getPath(reference));
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
Expand All @@ -14,18 +12,14 @@
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.FileNotFoundException;
import java.util.*;
import java.util.ArrayList;
import java.util.List;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
import java.nio.file.Path;
import java.util.Collection;
import java.util.List;


/**
Expand Down Expand Up @@ -221,10 +215,10 @@ public void onTraversalStart() {
if (hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF && hcArgs.maxMnpDistance > 0) {
throw new CommandLineException.BadArgumentValue("Non-zero maxMnpDistance is incompatible with GVCF mode.");
}
final ReferenceSequenceFile referenceReader = getReferenceReader(referenceArguments);

final VariantAnnotatorEngine variantAnnotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(),
hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE);
hcEngine = new HaplotypeCallerEngine(hcArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), referenceReader, variantAnnotatorEngine);
hcEngine = new HaplotypeCallerEngine(hcArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), getReferenceReader(referenceArguments), variantAnnotatorEngine);

// The HC engine will make the right kind (VCF or GVCF) of writer for us
final SAMSequenceDictionary sequenceDictionary = getHeaderForReads().getSequenceDictionary();
Expand All @@ -233,14 +227,8 @@ public void onTraversalStart() {
}

private static CachingIndexedFastaSequenceFile getReferenceReader(ReferenceInputArgumentCollection referenceArguments) {
final CachingIndexedFastaSequenceFile referenceReader;
final Path reference = IOUtils.getPath(referenceArguments.getReferenceFileName());
try {
referenceReader = new CachingIndexedFastaSequenceFile(reference);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(reference, e);
}
return referenceReader;
return new CachingIndexedFastaSequenceFile(reference);
}

@Override
Expand All @@ -257,5 +245,6 @@ public void closeTool() {
if ( hcEngine != null ) {
hcEngine.shutdown();
}

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
Expand Down Expand Up @@ -45,6 +46,7 @@
import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -144,7 +146,7 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
* @param createBamOutIndex true to create an index file for the bamout
* @param createBamOutMD5 true to create an md5 file for the bamout
* @param readsHeader header for the reads
* @param referenceReader reader to provide reference data
* @param referenceReader reader to provide reference data, this reference reader will be closed when {@link #shutdown()} is called
* @param annotationEngine variantAnnotatorEngine with annotations to process already added
*/
public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, boolean createBamOutIndex, boolean createBamOutMD5, final SAMFileHeader readsHeader, ReferenceSequenceFile referenceReader, VariantAnnotatorEngine annotationEngine ) {
Expand Down Expand Up @@ -710,6 +712,13 @@ public void shutdown() {
if ( haplotypeBAMWriter.isPresent() ) {
haplotypeBAMWriter.get().close();
}
if ( referenceReader != null){
try {
referenceReader.close();
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}


}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
/**
* Common interface for assembly-haplotype vs reads likelihood engines.
*/
public interface ReadLikelihoodCalculationEngine {
public interface ReadLikelihoodCalculationEngine extends AutoCloseable {

enum Implementation {
/**
Expand Down Expand Up @@ -46,5 +46,6 @@ public ReadLikelihoods<Haplotype> computeReadLikelihoods(AssemblyResultSet assem
* This method must be called when the client is done with likelihood calculations.
* It closes any open resources.
*/
@Override
public void close();
}
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ public void shutdown() {
likelihoodCalculationEngine.close();
aligner.close();
haplotypeBAMWriter.ifPresent(writer -> writer.close());
referenceReader.close();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.Tuple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand Down Expand Up @@ -60,7 +61,7 @@ public class OverhangFixingManager {
private final GATKReadWriter writer;

// fasta reference reader to check overhanging edges in the exome reference sequence
private final IndexedFastaSequenceFile referenceReader;
private final ReferenceSequenceFile referenceReader;

// the genome unclippedLoc parser
private final GenomeLocParser genomeLocParser;
Expand Down Expand Up @@ -92,7 +93,7 @@ public class OverhangFixingManager {
public OverhangFixingManager(final SAMFileHeader header,
final GATKReadWriter writer,
final GenomeLocParser genomeLocParser,
final IndexedFastaSequenceFile referenceReader,
final ReferenceSequenceFile referenceReader,
final int maxRecordsInMemory,
final int maxMismatchesInOverhangs,
final int maxBasesInOverhangs,
Expand Down Expand Up @@ -467,7 +468,7 @@ public Splice(final String contig, final int start, final int end) {
loc = genomeLocParser.createGenomeLoc(contig, start, end);
}

public void initialize(final IndexedFastaSequenceFile referenceReader) {
public void initialize(final ReferenceSequenceFile referenceReader) {
reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
package org.broadinstitute.hellbender.tools.walkers.rnaseq;

import htsjdk.samtools.*;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.TwoPassReadWalker;
Expand All @@ -21,7 +22,12 @@
import org.broadinstitute.hellbender.utils.SATagBuilder;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.CigarUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.FileNotFoundException;
import java.io.IOException;
Expand Down Expand Up @@ -127,7 +133,7 @@ public boolean requiresReference() {

private SAMFileGATKReadWriter outputWriter;
private OverhangFixingManager overhangManager;
private IndexedFastaSequenceFile referenceReader;
private ReferenceSequenceFile referenceReader;
SAMFileHeader header;

@Override
Expand Down Expand Up @@ -163,15 +169,10 @@ public ReadTransformer makePostReadFilterTransformer(){
@Override
public void onTraversalStart() {
header = getHeaderForSAMWriter();
try {
referenceReader = new CachingIndexedFastaSequenceFile(referenceArguments.getReferencePath());
GenomeLocParser genomeLocParser = new GenomeLocParser(getBestAvailableSequenceDictionary());
outputWriter = createSAMWriter(IOUtils.getPath(OUTPUT), false);
overhangManager = new OverhangFixingManager(header, outputWriter, genomeLocParser, referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs, processSecondaryAlignments);

} catch (FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(referenceArguments.getReferencePath(), ex);
}
referenceReader = new CachingIndexedFastaSequenceFile(referenceArguments.getReferencePath());
GenomeLocParser genomeLocParser = new GenomeLocParser(getBestAvailableSequenceDictionary());
outputWriter = createSAMWriter(IOUtils.getPath(OUTPUT), false);
overhangManager = new OverhangFixingManager(header, outputWriter, genomeLocParser, referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs, processSecondaryAlignments);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.*;
import htsjdk.tribble.Feature;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -553,13 +549,15 @@ public static boolean isIntervalFile(final String str, final boolean checkExists
* @return A map of contig names with their sizes.
*/
public static Map<String, Integer> getContigSizes(final Path reference) {
final ReferenceSequenceFile referenceSequenceFile = createReference(reference);
final List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSequenceFile.getSequenceDictionary()).toList();
final Map<String, Integer> lengths = new LinkedHashMap<>();
for (final GenomeLoc loc: locs) {
lengths.put(loc.getContig(), loc.size());
try(final CachingIndexedFastaSequenceFile referenceSequenceFile = new CachingIndexedFastaSequenceFile(reference)) {
final List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(
referenceSequenceFile.getSequenceDictionary()).toList();
final Map<String, Integer> lengths = new LinkedHashMap<>();
for (final GenomeLoc loc : locs) {
lengths.put(loc.getContig(), loc.size());
}
return lengths;
}
return lengths;
}

/**
Expand Down Expand Up @@ -1000,10 +998,6 @@ public static List<List<SimpleInterval>> groupIntervalsByContig(final List<Simpl
return intervalGroups;
}

private static ReferenceSequenceFile createReference(final Path fastaPath) {
return CachingIndexedFastaSequenceFile.checkAndCreate(fastaPath);
}

private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(final List<GenomeLoc> sorted) {
final LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<>();
GenomeLoc last = null;
Expand Down
Loading