Skip to content

Commit

Permalink
NIO output support for ApplyBQSR (broadinstitute#4424)
Browse files Browse the repository at this point in the history
* ApplyBQSR can output BAMs to GCS

Add path support, and test.
  • Loading branch information
jean-philippe-martin authored and cwhelan committed May 25, 2018
1 parent 8cd5dc8 commit ca9b387
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 26 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.bqsr;

import java.nio.file.Path;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
Expand All @@ -14,6 +15,7 @@
import org.broadinstitute.hellbender.transformers.BQSRReadTransformer;
import org.broadinstitute.hellbender.transformers.ReadTransformer;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
Expand Down Expand Up @@ -75,7 +77,7 @@ public final class ApplyBQSR extends ReadWalker{
private static final Logger logger = LogManager.getLogger(ApplyBQSR.class);

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="Write output to this file")
public File OUTPUT;
public String OUTPUT;

/**
* This argument is required for recalibration of base qualities. The recalibration table is a file produced by
Expand Down Expand Up @@ -103,7 +105,7 @@ public ReadTransformer makePostReadFilterTransformer(){

@Override
public void onTraversalStart() {
outputWriter = createSAMWriter(OUTPUT, true);
outputWriter = createSAMWriter(IOUtils.getPath(OUTPUT), true);
Utils.warnOnNonIlluminaReadGroups(getHeaderForReads(), logger);
}

Expand Down
33 changes: 32 additions & 1 deletion src/main/java/org/broadinstitute/hellbender/utils/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import com.google.common.primitives.Ints;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.tribble.util.ParsingUtils;
import java.io.FileNotFoundException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -524,12 +527,40 @@ public static String calcMD5(final byte[] bytes) {
/**
* Calculates the MD5 for the specified file and returns it as a String
*
* Warning: this loads the whole file into memory, so it's not suitable
* for large files.
*
* @param file file whose MD5 to calculate
* @return file's MD5 in String form
* @throws IOException if the file could not be read
*/
public static String calculateFileMD5( final File file ) throws IOException{
return Utils.calcMD5(FileUtils.readFileToByteArray(file));
return calculatePathMD5(file.toPath());
}

/**
* Calculates the MD5 for the specified file and returns it as a String
*
* Warning: this loads the whole file into memory, so it's not suitable
* for large files.
*
* @param path file whose MD5 to calculate
* @return file's MD5 in String form
* @throws IOException if the file could not be read
*/
public static String calculatePathMD5(final Path path) throws IOException{
// This doesn't have as nice error messages as FileUtils, but it's close.
String fname = path.toUri().toString();
if (!Files.exists(path)) {
throw new FileNotFoundException("File '" + fname + "' does not exist");
}
if (Files.isDirectory(path)) {
throw new IOException("File '" + fname + "' exists but is a directory");
}
if (!Files.isRegularFile(path)) {
throw new IOException("File '" + fname + "' exists but is not a regular file");
}
return Utils.calcMD5(Files.readAllBytes(path));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.SamStreams;
import htsjdk.samtools.cram.build.CramIO;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.util.Arrays;
Expand Down Expand Up @@ -1132,19 +1130,31 @@ public static SAMFileWriter createCommonSAMWriterFromFactory(
* Validate that a file has CRAM contents by checking that it has a valid CRAM file header
* (no matter what the extension).
*
* @param putativeCRAMFile File to check.
* @param putativeCRAMPath File to check.
* @return true if the file has a valid CRAM file header, otherwise false
*/
public static boolean hasCRAMFileContents(final File putativeCRAMFile) {
try (final FileInputStream fileStream = new FileInputStream(putativeCRAMFile);
final BufferedInputStream bis = new BufferedInputStream(fileStream)) {
return SamStreams.isCRAMFile(bis);
public static boolean hasCRAMFileContents(final Path putativeCRAMPath) {
try (final InputStream fileStream = Files.newInputStream(putativeCRAMPath)) {
try (final BufferedInputStream bis = new BufferedInputStream(fileStream)) {
return SamStreams.isCRAMFile(bis);
}
}
catch (IOException e) {
throw new UserException.CouldNotReadInputFile(e.getMessage());
}
}

/**
* Validate that a file has CRAM contents by checking that it has a valid CRAM file header
* (no matter what the extension).
*
* @param putativeCRAMFile File to check.
* @return true if the file has a valid CRAM file header, otherwise false
*/
public static boolean hasCRAMFileContents(final File putativeCRAMFile) {
return hasCRAMFileContents(putativeCRAMFile.toPath());
}

public static boolean isNonPrimary(GATKRead read) {
return read.isSecondaryAlignment() || read.isSupplementaryAlignment() || read.isUnmapped();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
Expand All @@ -36,7 +37,9 @@ public final class SamAssertionUtils {
private static SamReader getReader(final File sam, final ValidationStringency validationStringency, final File reference) {
return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
}

private static SamReader getReader(final Path sam, final ValidationStringency validationStringency, final Path reference) {
return SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(reference).open(sam);
}
/**
* causes an exception if the given sam files aren't equal
* @param actualSam the actual file
Expand All @@ -45,8 +48,12 @@ private static SamReader getReader(final File sam, final ValidationStringency va
* @param reference is allowed to be null
*/
public static void assertSamsEqual(final File actualSam, final File expectedSam, final ValidationStringency validationStringency, final File reference) throws IOException {
assertSamsEqual(actualSam.toPath(), expectedSam.toPath(), validationStringency,
(null==reference?null:reference.toPath()));
}
public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final ValidationStringency validationStringency, final Path reference) throws IOException {
final String equalStringent = samsEqualStringent(actualSam, expectedSam, validationStringency, reference);
Assert.assertNull(equalStringent, "SAM file " + actualSam.getPath() + " differs from expected output:" + expectedSam.getPath() + " " + equalStringent);
Assert.assertNull(equalStringent, "SAM file " + actualSam.toUri().toString() + " differs from expected output:" + expectedSam.toUri().toString() + " " + equalStringent);
}

/**
Expand All @@ -68,6 +75,9 @@ public static void assertSamsEqual(final File actualSam, final File expectedSam,
public static void assertSamsEqual(final File actualSam, final File expectedSam, final File reference) throws IOException {
assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
}
public static void assertSamsEqual(final Path actualSam, final Path expectedSam, final Path reference) throws IOException {
assertSamsEqual(actualSam, expectedSam, ValidationStringency.DEFAULT_STRINGENCY, reference);
}

/**
* causes an exception if the given sam files aren't equal
Expand Down Expand Up @@ -145,6 +155,19 @@ public static String samsEqualLenient(final File actualSam, final File expectedS
* @return null if equal or message string if not equal.
*/
public static String samsEqualStringent(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
return samsEqualStringent(actualSam.toPath(), expectedSam.toPath(), validation, (null==reference?null:reference.toPath()));
}

/**
* Compares SAM/BAM files in a stringent way but not by byte identity (allow reorder of attributes)
* Comparing by MD5s is too strict and comparing by SamComparison is too lenient. So we need this method.
*
* This differs from a byte-to-byte comparison:
* - @PG and @CO lines in headers are ignored in the comparison.
* - each read in the actual file are allowed to have a superset of the attributes of the corresponding read in the expected set
* @return null if equal or message string if not equal.
*/
public static String samsEqualStringent(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
if (sameMD5s(actualSam, expectedSam)) {
return null;
}
Expand All @@ -168,7 +191,22 @@ private static boolean sameMD5s(final File actualSam, final File expectedSam) th
return fileMD5_1.equals(fileMD5_2);
}


private static boolean sameMD5s(final Path actualSam, final Path expectedSam) throws IOException {
final String fileMD5_1 = Utils.calculatePathMD5(actualSam);
final String fileMD5_2 = Utils.calculatePathMD5(expectedSam);
return fileMD5_1.equals(fileMD5_2);
}

private static String compareReads(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
return compareReads(
actualSam.toPath(),
expectedSam.toPath(),
validation,
(null==reference?null:reference.toPath()));
}

private static String compareReads(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
try(final SamReader reader1 = getReader(actualSam, validation, reference);
final SamReader reader2 = getReader(expectedSam, validation, reference)) {
final SAMRecordIterator it1 = reader1.iterator();
Expand All @@ -189,9 +227,9 @@ private static String compareReads(final File actualSam, final File expectedSam,
}
}

private static String equalHeadersIgnoreCOandPG(final File actualSam, final File expectedSam, final ValidationStringency validation, final File reference) throws IOException {
private static String equalHeadersIgnoreCOandPG(final Path actualSam, final Path expectedSam, final ValidationStringency validation, final Path reference) throws IOException {
try(final SamReader reader1 = getReader(actualSam, validation, reference);
final SamReader reader2 = getReader(expectedSam, validation, reference)){
final SamReader reader2 = getReader(expectedSam, validation, reference)){

final SAMFileHeader h1 = reader1.getFileHeader();
final SAMFileHeader h2 = reader2.getFileHeader();
Expand Down Expand Up @@ -376,16 +414,24 @@ public static void assertEqualBamFiles(
* Validate/assert that the contents are CRAM if the extension is .cram
*/
public static void assertCRAMContentsIfCRAM(final File putativeCRAMFile) {
if (IOUtils.isCramFile(putativeCRAMFile)) {
assertCRAMContents(putativeCRAMFile);
Path path = (null==putativeCRAMFile?null:putativeCRAMFile.toPath());
assertCRAMContentsIfCRAM(path);
}

/**
* Validate/assert that the contents are CRAM if the extension is .cram
*/
public static void assertCRAMContentsIfCRAM(final Path putativeCRAMPath) {
if (IOUtils.isCramFile(putativeCRAMPath)) {
assertCRAMContents(putativeCRAMPath);
}
}

/**
* Unconditionally validate/assert that the contents are CRAM
*/
public static void assertCRAMContents(final File putativeCRAMFile) {
Assert.assertTrue(ReadUtils.hasCRAMFileContents(putativeCRAMFile), "should have had CRAM contents: " + putativeCRAMFile.getName());
public static void assertCRAMContents(final Path putativeCRAMPath) {
Assert.assertTrue(ReadUtils.hasCRAMFileContents(putativeCRAMPath), "should have had CRAM contents: " + putativeCRAMPath.toUri().toString());
}

private static void sortSam(final File input, final File output, final File reference, final ValidationStringency stringency) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
package org.broadinstitute.hellbender.tools.walkers.bqsr;

import com.google.common.jimfs.Configuration;
import com.google.common.jimfs.Jimfs;
import htsjdk.samtools.SamReaderFactory;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec;
import org.broadinstitute.hellbender.utils.test.SamAssertionUtils;
import org.testng.Assert;
Expand Down Expand Up @@ -80,8 +86,18 @@ public Object[][] createABQSRTestData() {
return tests.toArray(new Object[][]{});
}

@DataProvider(name = "MiniApplyBQSRTest")
public Object[][] createMiniABQSRTestData() {
List<Object[]> tests = new ArrayList<>();

//Note: these outputs were created using GATK3
tests.add(new Object[]{new ABQSRTest(hiSeqBam, null, ".bam", null, resourceDir + "expected.HiSeq.1mb.1RG.2k_lines.alternate.recalibrated.DIQ.bam")});

return tests.toArray(new Object[][]{});
}

@Test(dataProvider = "ApplyBQSRTest")
public void testApplyBQSR(ABQSRTest params) throws IOException {
public void testApplyBQSRFile(ABQSRTest params) throws IOException {
File outFile = GATKBaseTest.createTempFile("applyBQSRTest", params.outputExtension);
final ArrayList<String> args = new ArrayList<>();
File refFile = null;
Expand All @@ -90,18 +106,77 @@ public void testApplyBQSR(ABQSRTest params) throws IOException {
args.add(new File(params.bam).getAbsolutePath());
args.add("--" + StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME);
args.add(new File(resourceDir + "HiSeq.20mb.1RG.table.gz").getAbsolutePath());
args.add("-O"); args.add(outFile.getAbsolutePath());
args.add("-O");
args.add(outFile.getAbsolutePath());
if (params.reference != null) {
refFile = new File(params.reference);
args.add("-R"); args.add(refFile.getAbsolutePath());
args.add("-R");
args.add(refFile.getAbsolutePath());
if (params.args != null) {
Stream.of(params.args).forEach(arg -> args.add(arg));
}

runCommandLine(args);

SamAssertionUtils.assertSamsEqual(outFile, new File(params.expectedFile), refFile);
}
}

@Test(dataProvider = "MiniApplyBQSRTest")
public void testApplyBQSRPath(ABQSRTest params) throws IOException {
try (FileSystem jimfs = Jimfs.newFileSystem(Configuration.unix())) {
final Path outPath = jimfs.getPath("applyBQSRTest"+params.outputExtension);

final ArrayList<String> args = new ArrayList<>();
Path refPath = null;

args.add("-I");
args.add(new File(params.bam).getAbsolutePath());
args.add("--" + StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME);
args.add(new File(resourceDir + "HiSeq.20mb.1RG.table.gz").getAbsolutePath());
args.add("-O"); args.add(outPath.toUri().toString());
if (params.reference != null) {
File refFile = new File(params.reference);
args.add("-R"); args.add(refFile.getAbsolutePath());
refPath = refFile.toPath();
}
if (params.args != null) {
Stream.of(params.args).forEach(arg -> args.add(arg));
}

runCommandLine(args);

SamAssertionUtils.assertSamsEqual(outPath, new File(params.expectedFile).toPath(), refPath);
}
}

@Test(dataProvider = "ApplyBQSRTest", groups={"bucket"})
public void testApplyBQSRCloud(ABQSRTest params) throws IOException {
// getTempFilePath also deletes the file on exit.
final String outString = BucketUtils.getTempFilePath(getGCPTestStaging() + "tmp/testApplyBQSRCloud", params.outputExtension);
final Path outPath = BucketUtils.getPathOnGcs(outString);
final ArrayList<String> args = new ArrayList<>();
Path refPath = null;

args.add("-I");
args.add(new File(params.bam).getAbsolutePath());
args.add("--" + StandardArgumentDefinitions.BQSR_TABLE_LONG_NAME);
args.add(new File(resourceDir + "HiSeq.20mb.1RG.table.gz").getAbsolutePath());
args.add("-O");
args.add(outString);
if (params.reference != null) {
File refFile = new File(params.reference);
args.add("-R");
args.add(refFile.getAbsolutePath());
refPath = refFile.toPath();
}
if (params.args != null) {
Stream.of(params.args).forEach(arg -> args.add(arg));
}

runCommandLine(args);

SamAssertionUtils.assertSamsEqual(outFile, new File(params.expectedFile), refFile);
SamAssertionUtils.assertSamsEqual(outPath, new File(params.expectedFile).toPath(), refPath);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ public Object[][] testCRAMContentsFailData() {

@Test(dataProvider = "testCRAMContentsSucceed")
public void testAssertCRAMContentsSucceed(File putativeCRAMFile) {
SamAssertionUtils.assertCRAMContents(putativeCRAMFile);
SamAssertionUtils.assertCRAMContents(putativeCRAMFile.toPath());
}

@Test(dataProvider = "testCRAMContentsFail", expectedExceptions=AssertionError.class)
public void testAssertCRAMContentsFail(File putativeCRAMFile) {
SamAssertionUtils.assertCRAMContents(putativeCRAMFile);
SamAssertionUtils.assertCRAMContents(putativeCRAMFile.toPath());
}

@DataProvider(name="testCRAMContentsIfCRAMSucceed")
Expand Down

0 comments on commit ca9b387

Please sign in to comment.