From c2b6beb002e7863d03918ff58b05289f47779b18 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 16 Jul 2024 11:51:04 -0400 Subject: [PATCH 1/4] Add bundle support for FeatureInputs. --- .../cmdline/StandardArgumentDefinitions.java | 4 + .../hellbender/engine/FeatureDataSource.java | 90 ++++++-- .../hellbender/engine/FeatureInput.java | 42 ++++ .../hellbender/engine/MultiVariantWalker.java | 57 ++++- .../hellbender/tools/CreateBundle.java | 202 ++++++++++++++++++ .../hellbender/utils/gcs/BucketUtils.java | 16 +- .../engine/FeatureDataSourceUnitTest.java | 18 +- .../engine/FeatureInputUnitTest.java | 24 ++- .../MultiVariantWalkerIntegrationTest.java | 129 +++++++++-- .../tools/CreateBundleIntegrationTest.java | 176 +++++++++++++++ .../SelectVariantsIntegrationTest.java | 81 +++++++ .../testSelectVariants_BundleWith_idx.vcf | 160 ++++++++++++++ .../testSelectVariants_BundleWith_tbi.vcf | 160 ++++++++++++++ 13 files changed, 1098 insertions(+), 61 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java index 1c8596eb91b..40d6ccde930 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java @@ -56,6 +56,10 @@ private StandardArgumentDefinitions(){} public static final String INTERVALS_SHORT_NAME = "L"; public static final String COMPARISON_SHORT_NAME = "comp"; public static final String READ_INDEX_SHORT_NAME = READ_INDEX_LONG_NAME; + public static final String PRIMARY_INPUT_LONG_NAME = "primary"; + public static final String PRIMARY_INPUT_SHORT_NAME = "PI"; + public static final String SECONDARY_INPUT_LONG_NAME = "secondaryI"; + public static final String SECONDARY_INPUT_SHORT_NAME = "SI"; public static final String LENIENT_SHORT_NAME = "LE"; public static final String READ_VALIDATION_STRINGENCY_SHORT_NAME = "VS"; public static final String SAMPLE_ALIAS_SHORT_NAME = "ALIAS"; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java index bd6fad2f6d2..85eb4d46a92 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java @@ -1,5 +1,10 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.io.bundle.BundleResource; +import htsjdk.beta.io.bundle.BundleResourceType; +import htsjdk.io.IOPath; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Locatable; @@ -148,7 +153,7 @@ public FeatureDataSource(final File featureFile) { * generated name, and will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) * during queries that produce cache misses. * - * @param featurePath path or URI to source of Features + * @param featurePath path or URI to source of Features (may be a Bundle) */ public FeatureDataSource(final String featurePath) { this(featurePath, null, DEFAULT_QUERY_LOOKAHEAD_BASES, null); @@ -159,7 +164,7 @@ public FeatureDataSource(final String featurePath) { * name. We will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries * that produce cache misses. * - * @param featureFile file containing Features + * @param featureFile file or Bundle containing Features * @param name logical name for this data source (may be null) */ public FeatureDataSource(final File featureFile, final String name) { @@ -170,7 +175,7 @@ public FeatureDataSource(final File featureFile, final String name) { * Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical * name. We will look ahead the specified number of bases during queries that produce cache misses. * - * @param featureFile file containing Features + * @param featureFile file or Bundle containing Features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses */ @@ -181,7 +186,7 @@ public FeatureDataSource(final File featureFile, final String name, final int qu /** * Creates a FeatureDataSource backed by the resource at the provided path. * - * @param featurePath path to file or GenomicsDB url containing features + * @param featurePath path to file or GenomicsDB url or Bundle containing features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs @@ -195,7 +200,7 @@ public FeatureDataSource(final String featurePath, final String name, final int * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (or a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -207,7 +212,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook /** * Creates a FeatureDataSource backed by the resource at the provided path. * - * @param featurePath path to file or GenomicsDB url containing features + * @param featurePath path to file or GenomicsDB url or Bundle containing features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs @@ -224,7 +229,7 @@ public FeatureDataSource(final String featurePath, final String name, final int * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -241,7 +246,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -259,7 +264,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -278,7 +283,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -296,7 +301,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -369,9 +374,26 @@ private static FeatureReader getFeatureReader(final Featu } catch (final ClassCastException e) { throw new UserException("GenomicsDB inputs can only be used to provide VariantContexts.", e); } + } else if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + // the feature input specifies a serialized json bundle file + final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new); + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + // to get the codec we have to use the path of the underlying vcf resource, not the bundle path + final FeatureInput fi = new FeatureInput(vcfPath.getRawInputString(), featureInput.getName()); + final FeatureCodec codec = getCodecForFeatureInput(fi, targetFeatureType, setNameOnCodec); + // propagate the bundle path, not the vcf path, to the reader, so that downstream code can retrieve + // the index path from the bundle + return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); + } else if (featureInput.getParentBundle() != null) { + // the featureInput was created from a bundle list expansion (i.e, MultiVariantWalkers). it has the + // primary resource as the underlying resource path, and the containing bundle attached as the + // "parent bundle". Use the original FI to get the codec, but to get the feature reader, we use + // the FI that contains the bundle path, since the feature reader may require acccess to the index + final FeatureCodec codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec); + return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); } else { final FeatureCodec codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec); - if ( featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION) ) { + if (featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION)) { return new Reader(featureInput, codec); } return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); @@ -419,18 +441,48 @@ private static FeatureReader getFeatureReader(final Featu private static AbstractFeatureReader getTribbleFeatureReader(final FeatureInput featureInput, final FeatureCodec codec, final Function cloudWrapper, final Function cloudIndexWrapper) { Utils.nonNull(codec); try { - // Must get the path to the data file from the codec here: - final String absoluteRawPath = featureInput.getRawInputString(); - // Instruct the reader factory to not require an index. We will require one ourselves as soon as // a query by interval is attempted. final boolean requireIndex = false; - // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. - if (BucketUtils.isEligibleForPrefetching(featureInput)) { - return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new); + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + final Optional vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX); + final String rawIndexResourcePath = + vcfIndexPath.isPresent() ? vcfIndexPath.get().getIOPath().get().getRawInputString() : null; + + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. + if (BucketUtils.isEligibleForPrefetching(vcfPath)) { + final String absoluteRawPath = vcfPath.getRawInputString(); + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(vcfPath.getRawInputString(), rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } + } else if (featureInput.getParentBundle() != null) { + final Bundle vcfBundle = featureInput.getParentBundle(); + // code path for when a user has specified multiple bundles on the command line, so there is no single + // serialized bundle file to access + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. + final Optional vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX); + final String rawIndexResourcePath = + vcfIndexPath.isPresent() ? vcfIndexPath.get().getIOPath().get().getRawInputString() : null; + final String absoluteRawPath = vcfPath.getRawInputString(); + if (BucketUtils.isEligibleForPrefetching(vcfPath)) { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } } else { - return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + final String absoluteRawPath = featureInput.getRawInputString(); + + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. + if (BucketUtils.isEligibleForPrefetching(featureInput)) { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } } } catch (final TribbleException e) { throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e); diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java index 53bcb117afc..801c92f13c6 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.engine; import com.google.common.annotations.VisibleForTesting; +import htsjdk.beta.io.bundle.Bundle; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import org.apache.logging.log4j.LogManager; @@ -52,6 +53,11 @@ public final class FeatureInput extends GATKPath implements S */ private transient Class> featureCodecClass; + /** + * retain any containing bundle in case we need to extract other resources from it + */ + private Bundle parentBundle; + /** * Delimiter between the logical name and the file name in the --argument_name logical_name:feature_file syntax */ @@ -129,6 +135,34 @@ public FeatureInput(final String rawInputSpecifier, final String name, final Map setTagAttributes(keyValueMap); } + /** + * Construct a FeatureInput from a Bundle. + * + * @param primaryResourcePath the path for the primary feature resource for this bundle + * @param featureBundle an existing Bundle object; resources in this bundle MUST be IOPathBundleResources (that is, + * they must be backed by an IOPath, not an in-memory object) + * @param name the tag name for this feature input - may be null + */ + public FeatureInput( + final GATKPath primaryResourcePath, + final Bundle featureBundle, + final String name) { + super(primaryResourcePath); + // retain the containing bundle for later so we can interrogate it for other resources, like the index + this.parentBundle = featureBundle; + if (name != null) { + if (primaryResourcePath.getTag() != null) { + logger.warn(String.format( + "FeatureInput: user-provided tag name %s will be replaced with %s", + primaryResourcePath.getTag(), + name)); + } + setTag(name); + } + + } + + /** * Remember the FeatureCodec class for this input the first time it is discovered so we can bypass dynamic codec * discovery when multiple FeatureDataSources are created for the same input. @@ -144,6 +178,14 @@ public void setFeatureCodecClass(final Class> featureCodecCla return this.featureCodecClass; } + /** + * @return the parent bundle for this FeatureInput, if this input was derived from a Bundle. May + * return {@code null}. The returned bundle can be interrogated for companion resources. + */ + public Bundle getParentBundle() { + return parentBundle; + } + /** * creates a name from the given filePath by finding the absolute path of the given input */ diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java index 7f12ff641e2..d1e63da1610 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java @@ -1,5 +1,10 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.io.bundle.BundleResource; +import htsjdk.beta.io.bundle.BundleResourceType; +import htsjdk.beta.plugin.IOUtils; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; @@ -74,17 +79,49 @@ public boolean doDictionaryCrossValidation() { @Override protected void initializeDrivingVariants() { multiVariantInputArgumentCollection.getDrivingVariantPaths().stream().forEach( - f -> { - FeatureInput featureInput = new FeatureInput<>(f); - if (drivingVariantsFeatureInputs.contains(featureInput)) { - throw new UserException.BadInput("Feature inputs must be unique: " + featureInput.toString()); + gatkPath -> { + if (gatkPath.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + // expand any bundle(s) into one or more FeatureInputs (depending on whether it is a single + // bundle or list) + final List bundles = BundleJSON.toBundleList(IOUtils.getStringFromPath(gatkPath), GATKPath::new); + for (final Bundle bundle : bundles) { + if (bundle.getPrimaryContentType().equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { + // use the bundle primary resource as the FeatureInput URI, and tear off and attach the + // individual bundle the bundle to the FI as the parent bundle so downstream code can + // extract other resources from it on demand + // note that if the original value from the user has a tag, we can't use it unless there + // is only one input, since FIs have to be unique + final FeatureInput bundleFI = new FeatureInput<>( + new GATKPath(bundle.getPrimaryResource().getIOPath().get().getURIString()), + bundle, + bundles.size() > 1 ? gatkPath.getTag() : "drivingVariants" + ); + if (drivingVariantsFeatureInputs.contains(bundleFI)) { + throw new UserException.BadInput("Feature inputs must be unique: " + gatkPath); + } + drivingVariantsFeatureInputs.add(bundleFI); + // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value + // of 0 to avoid caching because of windowed queries that need to "look behind" as well. + features.addToFeatureSources(0, bundleFI, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, + referenceArguments.getReferencePath()); + } else { + final BundleResource br = bundle.getPrimaryResource(); + throw new UserException.BadInput( + String.format( + "Feature input bundles must have a primary resource of type %s. Found content type %s with path %s", + BundleResourceType.CT_VARIANT_CONTEXTS, + bundle.getPrimaryContentType(), + br.getIOPath().get())); + } + } + } else { + final FeatureInput featureInput = new FeatureInput<>(gatkPath); + drivingVariantsFeatureInputs.add(featureInput); + // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value + // of 0 to avoid caching because of windowed queries that need to "look behind" as well. + features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, + referenceArguments.getReferencePath()); } - drivingVariantsFeatureInputs.add(featureInput); - - // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value - // of 0 to avoid caching because of windowed queries that need to "look behind" as well. - features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, - referenceArguments.getReferencePath()); } ); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java new file mode 100644 index 00000000000..cf943b0b62a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -0,0 +1,202 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.beta.io.bundle.*; +import htsjdk.beta.plugin.variants.VariantsBundle; +import htsjdk.samtools.util.FileExtensions; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import picard.cmdline.programgroups.OtherProgramGroup; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.StandardOpenOption; +import java.util.*; + +/** + * Create a bundle (JSON) file for use with a GATK tool. + * + * other inputs are NEVER inferred, and must always be provided with a content type tag. + */ +@DocumentedFeature +@CommandLineProgramProperties( + summary = "Create a bundle (JSON) file for use with a GATK tool", + oneLineSummary = "Create a bundle (JSON) file for use with a GATK tool", + programGroup = OtherProgramGroup.class +) +public class CreateBundle extends CommandLineProgram { + protected static final Logger logger = LogManager.getLogger(CreateBundle.class); + + public static final String SUPPRESS_INDEX_RESOLUTION_FULL_NAME = "suppress-index-resolution"; + public static final String OTHER_INPUT_FULL_NAME = "other-input"; + + @Argument(fullName = StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.PRIMARY_INPUT_SHORT_NAME, + doc="Path to the primary bundle input (content type will be inferred if no content type tag is specified)") + GATKPath primaryInput; + + @Argument(fullName = StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.SECONDARY_INPUT_SHORT_NAME, + doc = "Path to a secondary bundle input for" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + + "(usually an index). The type will be inferred if no content type tag is specified. If no "+ + " secondary input is specified, an index for the primary bundle file will be automatically inferred unless " + + SUPPRESS_INDEX_RESOLUTION_FULL_NAME + " is specified.", + optional = true) + GATKPath secondaryInput; + + @Argument(fullName = OTHER_INPUT_FULL_NAME, + shortName = OTHER_INPUT_FULL_NAME, + doc = "Path to other bundle inputs for " + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + + " A content type tag MUST be provided for each other input.", + optional = true) + List otherInputs; + + @Argument(fullName = SUPPRESS_INDEX_RESOLUTION_FULL_NAME, + doc ="Don't attempt to resolve the primary input's index file (defaults to false) if no secondary input is provided", + optional = true) + boolean suppressIndexResolution = false; + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Path the output bundle file (must end with the suffix .json.") + GATKPath outputBundlePath; + + private enum BundleType { + VCF, + REFERENCE, + OTHER + } + private BundleType outputBundleType; + + @Override + protected String[] customCommandLineValidation() { + if (!outputBundlePath.toString().endsWith(".json")){ + return new String[]{"Output bundle path must end with the suffix .json"}; + } + outputBundleType = determinePrimaryContentType(); + return super.customCommandLineValidation(); + } + + @Override + protected Object doWork() { + try (final BufferedWriter writer = Files.newBufferedWriter(outputBundlePath.toPath(), StandardOpenOption.CREATE)) { + final Bundle bundle = switch (outputBundleType) { + case VCF -> createVCFBundle(); + case REFERENCE -> + throw new IllegalArgumentException ("Reference bundles are not yet supported"); + case OTHER -> createOtherBundle(); + }; + writer.write(BundleJSON.toJSON(bundle)); + } catch (final IOException e) { + throw new RuntimeException(String.format("Failed writing bundle to output %s", outputBundlePath), e); + } + return null; + } + + private BundleType determinePrimaryContentType() { + BundleType bundleType; + + // determine the type of bundle to create; consult the tag attributes if any, otherwise try to infer from the + // primary input file extension + final String primaryContentTag = primaryInput.getTag(); + if (primaryContentTag != null && !primaryContentTag.isEmpty()) { + if (primaryContentTag.equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { + bundleType = BundleType.VCF; + } else if (primaryContentTag.equals(BundleResourceType.CT_HAPLOID_REFERENCE)) { + bundleType = BundleType.REFERENCE; + } else { + logger.info(String.format("Primary input content type %s for %s not recognized. A bundle will be created using content typse from the provided argument tags.", + primaryContentTag, + primaryInput)); + bundleType = BundleType.OTHER; + } + } else { + logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the content type from the %s extension.", primaryInput)); + bundleType = inferPrimaryContentType(primaryInput); + } + return bundleType; + } + + private BundleType inferPrimaryContentType(final GATKPath primaryInput) { + logger.info("Attempting to infer bundle content type from file extension."); + if (FileExtensions.VCF_LIST.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + return BundleType.VCF; + } else if (FileExtensions.FASTA.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + return BundleType.REFERENCE; + } else { + throw new IllegalArgumentException(String.format("Unable to infer bundle content type from file extension %s. A content type must be provided as part of the argument.", primaryInput)); + } + } + + private Bundle createVCFBundle() { + final Collection bundleResources = new ArrayList<>(); + + bundleResources.add(new IOPathResource(primaryInput, BundleResourceType.CT_VARIANT_CONTEXTS)); + if (secondaryInput != null) { + final String secondaryContentType = secondaryInput.getTag(); + if (secondaryContentType == null) { + logger.info(String.format("A content type for the secondary input was not provided. Assuming %s is an index.", secondaryInput)); + bundleResources.add(new IOPathResource(secondaryInput, BundleResourceType.CT_VARIANTS_INDEX)); + } else { + bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + } + } else if (!suppressIndexResolution) { + // secondary input is null, and index resolution suppression is off + final Optional indexPath = VariantsBundle.resolveIndex(primaryInput, GATKPath::new); + if (indexPath.isEmpty()) { + throw new IllegalArgumentException( + String.format( + "Could not infer an index for %s, you must either specify the index path as a secondary input on the command line or specify the %s argument.", + primaryInput.getRawInputString(), + SUPPRESS_INDEX_RESOLUTION_FULL_NAME)); + } + bundleResources.add(new IOPathResource(indexPath.get(), BundleResourceType.CT_VARIANTS_INDEX)); + } + if (otherInputs != null) { + for (final GATKPath otherInput : otherInputs) { + final String otherContentType = otherInput.getTag(); + if (otherContentType == null) { + throw new IllegalArgumentException( + String.format( + "A content must be provided for \"other\" input %s.", + otherInput.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(otherInput, otherContentType)); + } + } + } + return new VariantsBundle(bundleResources); + } + + private Bundle createOtherBundle() { + final Collection bundleResources = new ArrayList<>(); + bundleResources.add(new IOPathResource(primaryInput, primaryInput.getTag())); + if (secondaryInput != null) { + final String secondaryContentType = secondaryInput.getTag(); + if (secondaryContentType == null) { + throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided.")); + } else { + bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + } + } + if (otherInputs != null) { + for (final GATKPath otherInput : otherInputs) { + final String otherContentType = otherInput.getTag(); + if (otherContentType == null) { + throw new IllegalArgumentException( + String.format( + "A content type must be provided for \"other\" input %s.", + otherInput.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(otherInput, otherContentType)); + } + } + } + return new Bundle(primaryInput.getTag(), bundleResources); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java index 9eeaffb6223..88cdb2a058b 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java @@ -12,6 +12,7 @@ import com.google.cloud.storage.contrib.nio.SeekableByteChannelPrefetcher; import com.google.common.base.Strings; import com.google.common.io.ByteStreams; +import htsjdk.io.IOPath; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.RuntimeIOException; @@ -19,7 +20,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.Utils; @@ -68,20 +68,20 @@ public static boolean isGcsUrl(final String path) { } /** - * Return true if this {@code GATKPath} represents a gcs URI. + * Return true if this {@code IOPath} represents a gcs URI. * @param pathSpec specifier to inspect - * @return true if this {@code GATKPath} represents a gcs URI. + * @return true if this {@code IOPath} represents a gcs URI. */ - public static boolean isGcsUrl(final GATKPath pathSpec) { + public static boolean isGcsUrl(final IOPath pathSpec) { Utils.nonNull(pathSpec); return pathSpec.getScheme().equals(GoogleCloudStorageFileSystem.SCHEME); } /** * @param pathSpec specifier to inspect - * @return true if this {@code GATKPath} represents a remote storage system which may benefit from prefetching (gcs or http(s)) + * @return true if this {@code IOPath} represents a remote storage system which may benefit from prefetching (gcs or http(s)) */ - public static boolean isEligibleForPrefetching(final GATKPath pathSpec) { + public static boolean isEligibleForPrefetching(final IOPath pathSpec) { Utils.nonNull(pathSpec); return isEligibleForPrefetching(pathSpec.getScheme()); } @@ -320,10 +320,10 @@ public static long fileSize(String path) throws IOException { * Note that sub-directories are ignored - they are not recursed into. * Only supports HDFS and local paths. * - * @param pathSpecifier The URL to the file or directory whose size to return + * @param pathSpecifier The IOPath to the file or directory whose size to return * @return the total size of all files in bytes */ - public static long dirSize(final GATKPath pathSpecifier) { + public static long dirSize(final IOPath pathSpecifier) { try { // GCS case (would work with local too) if (isGcsUrl(pathSpecifier)) { diff --git a/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java index c8b0b42b9bf..634121291e4 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java @@ -1,5 +1,8 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.tribble.Feature; import htsjdk.variant.variantcontext.VariantContext; @@ -9,7 +12,6 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -413,6 +415,20 @@ public void testQueryGVCF( final SimpleInterval queryInterval, final List expectedVariantIDs ) { + final GATKPath vcfPath = new GATKPath(QUERY_TEST_GVCF.getAbsolutePath()); + final GATKPath indexPath = VariantsBundle.resolveIndex(vcfPath, GATKPath::new).get(); + final VariantsBundle vcfBundle = new VariantsBundle(vcfPath, indexPath); + final GATKPath bundleFile = new GATKPath(createTempFile("testQueryGVCFThroughBundle", ".json").toString()); + IOPathUtils.writeStringToPath(bundleFile, BundleJSON.toJSON(vcfBundle)); + + try ( FeatureDataSource featureSource = new FeatureDataSource<>(bundleFile.toString()) ) { + final List queryResults = featureSource.queryAndPrefetch(queryInterval); + checkVariantQueryResults(queryResults, expectedVariantIDs, queryInterval); + } + } + /************************************************** * Direct testing on the FeatureCache inner class **************************************************/ diff --git a/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java index 9c09edb1a28..87257d61f6f 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java @@ -1,5 +1,9 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import htsjdk.variant.variantcontext.VariantContext; @@ -7,7 +11,6 @@ import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineArgumentParser; import org.broadinstitute.barclay.argparser.CommandLineException; -import org.broadinstitute.barclay.argparser.CommandLineParser; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.testutils.SparkTestUtils; import org.testng.Assert; @@ -150,6 +153,25 @@ public void testHdfsPathAndName( final String argWithTags, final String inputVal Assert.assertEquals(hdfsInput.getName(), expectedLogicalName, "wrong logical name"); } + @Test(dataProvider = "GcsPathAndNameData", groups={"bucket"}) + public void testGcsBundlePathAndName( final String argWithTags, final String inputValue, final String unusedFeaturePath, final String expectedLogicalName ) { + // reuse the GcsPathAndNameData data provider, but with the input written to a bundle file + final Bundle bundleOfGcsPaths = new VariantsBundle(new GATKPath(inputValue)); + final GATKPath bundleFile = new GATKPath(createTempFile("testGcsBundlePathAndName", ".json").toString()); + IOPathUtils.writeStringToPath(new GATKPath(bundleFile.toString()), BundleJSON.toJSON(bundleOfGcsPaths)); + + final FeatureInput gcsInput = runCommandLineWithTaggedFeatureInput(argWithTags, bundleFile.toString()); + + Assert.assertEquals(gcsInput.getFeaturePath(), bundleFile.toString(), "wrong featurePath"); + if (argWithTags.contains(":")) { + Assert.assertEquals(gcsInput.getName(), expectedLogicalName, "wrong logical name"); + } else { + // if the input arg has no tags, then the expected logical name is the same as the input value, but + // in this test, the input value is actually the bundle json file name, not the original input value + Assert.assertEquals(gcsInput.getName(), bundleFile.toString(), "wrong logical name"); + } + } + @Test public void testFeatureNameSpecified() { final FeatureInput featureInput = runCommandLineWithTaggedFeatureInput("argName:myName", "myFile"); diff --git a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java index dca3fc7ee59..315fbe3458d 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java @@ -1,5 +1,8 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.*; +import htsjdk.io.IOPath; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -8,11 +11,16 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.gcs.BucketUtils; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -77,25 +85,25 @@ public void testDuplicateSources() throws Exception { @DataProvider(name="variantFiles") public Object[][] getVariantFiles() { return new Object[][] - { - { Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), null, 26 }, - { Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), null, 13 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), null, 26 }, - { Arrays.asList( - new File(getTestDataDir(), "splitVariants_1.vcf"), - new File(getTestDataDir(), "splitVariants_2.vcf")), null, 26 }, - - // with intervals - { Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), "1", 14 }, - { Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), "1", 7 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), "1", 14 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), "2:200-600", 3 }, + { + {Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), null, 26}, + {Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), null, 13}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), null, 26}, + {Arrays.asList( + new File(getTestDataDir(), "splitVariants_1.vcf"), + new File(getTestDataDir(), "splitVariants_2.vcf")), null, 26}, + + // with intervals + {Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), "1", 14}, + {Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), "1", 7}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), "1", 14}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), "2:200-600", 3}, }; } @@ -115,10 +123,87 @@ public void testVariantOrder(final List inputFiles, final String interval, Assert.assertEquals(tool.count, expectedCount); } + // tests using bundles + @DataProvider(name="variantBundles") + public Object[][] getVariantBundles() throws IOException { + return new Object[][] + { + {createRemoteBundleForFile( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_1.vcf.idx"), + new File(getTestDataDir(), "interleavedVariants_2.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf.idx") + ), "1", 14}, + {createRemoteBundleForFile( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_1.vcf.idx"), + new File(getTestDataDir(), "interleavedVariants_2.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf.idx") + ), "2:200-600", 3}, + }; + }; + + @Test(dataProvider = "variantBundles") + public void testVariantsFromBundleOrder(final IOPath inputBundleFile, final String interval, final int expectedCount) { + final TestMultiVariantWalker tool = new TestMultiVariantWalker(); + + final List args = new ArrayList<>(); + args.add("--variant"); + args.add(inputBundleFile.getRawInputString()); + if (interval != null) { + args.add("-L"); + args.add(interval); + } + + tool.instanceMain(args.toArray(new String[args.size()])); + Assert.assertEquals(tool.count, expectedCount); + } + + // copy two .vcfs to a temporary remote location; create a bundle list with two bundles, each containing + // a reference to the remote vcf and it's local companion index file, and then write the whole bundle out to + // a temporary remote bundle file + private static IOPath createRemoteBundleForFile( + final File vcf1, + final File index1, + final File vcf2, + final File index2) throws IOException { + //TODO: replace this path with getGCPTestStaging() + final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir"); + final Path remoteDirPath = IOUtils.getPath(remotePath + "/"); + + Files.createDirectory(remoteDirPath); + Assert.assertTrue(Files.exists(remoteDirPath)); + Assert.assertTrue(Files.isDirectory(remoteDirPath)); + final Path remoteVCF1 = IOUtils.getPath(remotePath + "/" + vcf1.getName()); + final Path remoteVCF2 = IOUtils.getPath(remotePath + "/" + vcf2.getName()); + final Path c_remoteVCF1 = Files.copy(vcf1.toPath(), remoteVCF1); + if (!c_remoteVCF1.equals(remoteVCF1)) { + throw new IOException("Not equal " + vcf1 + " to " + remoteVCF1); + } + final Path c_remoteVCF2 = Files.copy(vcf2.toPath(), remoteVCF2); + if (!c_remoteVCF2.equals(remoteVCF2)) { + throw new IOException("Not equal " + vcf2 + " to " + remoteVCF2); + } + + final Bundle variantsBundle1 = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteVCF1.toUri().toString()), BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(index1.toURI().toString()), BundleResourceType.CT_VARIANTS_INDEX)) + .build(); + final Bundle variantsBundle2 = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteVCF2.toUri().toString()), BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(index2.toURI().toString()), BundleResourceType.CT_VARIANTS_INDEX)) + .build(); + + final List bundles = Arrays.asList(variantsBundle1, variantsBundle2); + final GATKPath remoteBundlePath = new GATKPath(remoteDirPath.resolve("remote_bundle.json").toUri().toString()); + IOPathUtils.writeStringToPath(remoteBundlePath, BundleJSON.toJSON(bundles)); + return remoteBundlePath; + } + @CommandLineProgramProperties( - summary = "TestGATKToolWithFeatures", - oneLineSummary = "TestGATKToolWithFeatures", - programGroup = TestProgramGroup.class + summary = "TestGATKToolWithFeatures", + oneLineSummary = "TestGATKToolWithFeatures", + programGroup = TestProgramGroup.class ) private static final class TestMultiVariantWalkerIterator extends MultiVariantWalker { String expectedIDOrder[]; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java new file mode 100644 index 00000000000..a39cfd450f4 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java @@ -0,0 +1,176 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.beta.io.bundle.*; +import htsjdk.beta.plugin.IOUtils; +import htsjdk.beta.plugin.variants.VariantsBundle; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class CreateBundleIntegrationTest extends CommandLineProgramTest { + + // force our local paths to use absolute path names to make BundleResource and IOPath equality checks easier, + // since all serialized JSON bundles always contain absolute path names for local files + private final static String LOCAL_VCF = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf").getURIString(); + private final static String LOCAL_VCF_IDX = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf.idx").getURIString(); + private final static String LOCAL_VCF_GZIP = new GATKPath("src/test/resources/large/NA24385.vcf.gz").getURIString(); + private final static String LOCAL_VCF_TBI = new GATKPath("src/test/resources/large/NA24385.vcf.gz.tbi").getURIString(); + private final static String LOCAL_VCF_WITH_NO_INDEX = new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/count_variants_withSequenceDict_noIndex.vcf").getURIString(); + private final static String CLOUD_VCF = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; + private final static String CLOUD_VCF_IDX = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; + + private final static String PRIMARY_CT = "primary_ct"; + private final static String SECONDARY_CT = "secondary_ct"; + private final static String OTHER_CT = "other_ct"; + + @DataProvider(name = "bundleCases") + public Object[][] bundleCases() { + return new Object[][] { + // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle + + // common VCF bundle cases, with inferred content types + {LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + {LOCAL_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + {LOCAL_VCF_GZIP, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + + {CLOUD_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF))}, + {CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + + // common vcf bundle cases, with explicit content types + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + + // vcf bundle with a vcf, an index, and some other resource with an explicit content type + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, Arrays.asList("someVariantsCompanion.txt"), Arrays.asList("someVariantsCT"), false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), BundleResourceType.CT_VARIANTS_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(new GATKPath("someVariantsCompanion.txt").getURIString()), "someVariantsCT")) + .build()}, + + // "other" bundles + { + LOCAL_VCF, PRIMARY_CT, null, null, null, null, true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .build() + }, + { + LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, null, null, true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) + .build() + }, + { + // frankenbundle with multiple resources + LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, Arrays.asList(LOCAL_VCF_TBI), Arrays.asList(OTHER_CT), true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_TBI), OTHER_CT)) + .build() + }, + }; + } + + @DataProvider(name = "negativeBundleCases") + public Object[][] negativeBundleCases() { + return new Object[][] { + // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle + + // no index file can be inferred + {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + // primary content type not provided and cannot be inferred from the extension + {"primaryFile.ext", null, null, null, null, null, false, null}, + // secondary content type not provided + {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null}, + + // other bundle with other content type not provided + {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", SECONDARY_CT, Arrays.asList("other.txt"), null, false, null}, + // vcf bundle with other content type not provided + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + }; + } + + @Test(dataProvider = "bundleCases") + public void testBundleCases( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean resolveIndex, + final Bundle expectedBundle) { + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + } + + @Test(dataProvider = "negativeBundleCases", expectedExceptions = IllegalArgumentException.class) + public void testNegativeBundleCases( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean resolveIndex, + final Bundle expectedBundle) { + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + } + + private void doCreateBundleTest( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean suppressIndexResolution, + final Bundle expectedBundle) { + final GATKPath outputPath = new GATKPath(createTempFile("test", ".bundle.json").getAbsolutePath().toString()); + + final List args = new ArrayList<>(); + + args.add("--" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + (primaryInputTag != null ? ":" + primaryInputTag : "")); + args.add(primaryInput); + if (secondaryInput != null) { + args.add("--" + StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME + (secondaryInputTag != null ? ":" + secondaryInputTag : "")); + args.add(secondaryInput); + } + if (otherInputs != null) { + for (int i = 0; i < otherInputs.size(); i++) { + args.add("--" + CreateBundle.OTHER_INPUT_FULL_NAME + ((otherInputTags != null && otherInputTags.get(i) != null) ? ":" + otherInputTags.get(i) : "")); + args.add(otherInputs.get(i) != null ? otherInputs.get(i) : ""); + } + } + if (suppressIndexResolution == true) { + args.add("--" + CreateBundle.SUPPRESS_INDEX_RESOLUTION_FULL_NAME); + } + args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); + args.add(outputPath.toString()); + + runCommandLine(args); + + final Bundle actualBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(outputPath), GATKPath::new); + + // bundle resource order is not preserved when roundtripping through JSON, so compare ignoring order + Assert.assertTrue(Bundle.equalsIgnoreOrder(actualBundle, expectedBundle)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 3664e551a1b..4a50c4e9419 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -1,9 +1,15 @@ package org.broadinstitute.hellbender.tools.walkers.variantutils; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Comparator; import java.util.List; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; +import htsjdk.io.IOPath; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang3.tuple.Pair; @@ -1251,6 +1257,81 @@ public void testSampleSelectionOnNio() throws IOException { IntegrationTestSpec.assertEqualTextFiles(IOUtils.getPath(out), IOUtils.getPath(expectedFile), null); } + @DataProvider(name="cloudBucketBundles") + public Object[][] cloudBucketBundles() { + return new Object[][]{ + { + // cloud vcf with a .idx + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.1.1-65M.vcf"), + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.1.1-65M.vcf.idx"), + "1:20547", + "expected/testSelectVariants_BundleWith_idx.vcf" + }, + { + // cloud vcf with a .tbi + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf.blockgz.gz"), + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf.blockgz.gz.tbi"), + "21:10000128", + "expected/testSelectVariants_BundleWith_tbi.vcf" + } + }; + } + + @Test(dataProvider="cloudBucketBundles", groups = "bucket") + public void testBundleRemoteQueryWithVCFAndIndexInSeparateBuckets( + final IOPath vcfPath, + final IOPath indexPath, + final String queryInterval, + final String expectedOutputFile + ) throws IOException { + // test that a query using a bundle containing remote nio paths to a vcf works when the index is + // in a different bucket than the vcf + final String targetBucketName = BucketUtils.randomRemotePath(getGCPTestStaging(), "testSelectVariantsBundleOnNio", "") + "/"; + IOUtils.deleteOnExit(IOUtils.getPath(targetBucketName)); + final GATKPath targetVCFPath = new GATKPath(targetBucketName + vcfPath.toPath().getFileName()); + + // copy our test vcf (which is already in a bucket with it's accompanying index) to a different bucket where the + // index does not reside, and then create a bundle referencing the copy of the vcf and the original index, so + // the vcf and the index are in separate buckets + Files.copy(vcfPath.toPath(), targetVCFPath.toPath(), StandardCopyOption.REPLACE_EXISTING); + + // create a bundle with the copied vcf and the original index + final VariantsBundle vcfBundle = new VariantsBundle(new GATKPath(targetVCFPath), indexPath); + final GATKPath bundleFilePath = new GATKPath(createTempFile("testSelectVariantsBundleOnNio", ".json").toString()); + IOPathUtils.writeStringToPath(bundleFilePath, BundleJSON.toJSON(vcfBundle)); + + final IntegrationTestSpec spec = new IntegrationTestSpec( + " --variant " + bundleFilePath + + " -L " + queryInterval + + " --suppress-reference-path " // suppress reference file path in output for test differencing + + " -O %s " + + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE +" false", + Collections.singletonList(getToolTestDataDir() + expectedOutputFile) + ); + + spec.executeTest("testBundle", this); + } + + @Test(groups = "bucket") + public void testBundleRemoteWithQueryParams() throws IOException { + final VariantsBundle vcfBundle = new VariantsBundle( + new GATKPath(BucketUtils.createSignedUrlToGcsObject("gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf", 1)), + new GATKPath(BucketUtils.createSignedUrlToGcsObject("gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx", 1))); + final IOPath bundleFile = new GATKPath(createTempFile("testSelectVariantsThroughBundle", ".json").toString()); + IOPathUtils.writeStringToPath(bundleFile, BundleJSON.toJSON(vcfBundle)); + + final IntegrationTestSpec spec = new IntegrationTestSpec( + " --variant " + bundleFile.toString() + + " -L 20:10001365 " + + " --suppress-reference-path " // suppress reference file path in output for test differencing + + " -O %s " + + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE +" false", + Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_Bundle.vcf") + ); + + spec.executeTest("testBundle", this); + } + // the input test file is a somatic VCF with several many-allelic sites and no PLs. This tests that the tool does not attempt // to create a PL-to-alleles cache, which would cause the tool to freeze. See https://github.com/broadinstitute/gatk/issues/6291 @Test diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf new file mode 100644 index 00000000000..12d3cc32292 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf @@ -0,0 +1,160 @@ +##fileformat=VCFv4.2 +##FILTER= +##GATKCommandLine.SelectVariants= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency."> +##INFO= +##INFO= +##INFO=5% minor allele frequency in 1+ populations"> +##INFO=5% minor allele frequency in each and all populations"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=SubSNP->Batch.link_out"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##dbSNP_BUILD_ID=138 +##fileDate=20130806 +##phasing=partial +##source=SelectVariants +##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf +#CHROM POS ID REF ALT QUAL FILTER INFO +1 20547 rs202246159 A G . . INT;OTHERKG;RS=202246159;RSPOS=20547;SAO=0;SSR=0;VC=SNV;VP=0x050000080001000002000100;WGT=1;dbSNPBuildID=137 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf new file mode 100644 index 00000000000..4e814e12379 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf @@ -0,0 +1,160 @@ +##fileformat=VCFv4.2 +##FILTER= +##GATKCommandLine.SelectVariants= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency."> +##INFO= +##INFO= +##INFO=5% minor allele frequency in 1+ populations"> +##INFO=5% minor allele frequency in each and all populations"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=SubSNP->Batch.link_out"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##dbSNP_BUILD_ID=138 +##fileDate=20130806 +##phasing=partial +##source=SelectVariants +##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf +#CHROM POS ID REF ALT QUAL FILTER INFO +21 10000128 rs372777878 C T . . OTHERKG;RS=372777878;RSPOS=10000128;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=138 From 6dcdefff839d5fb10fb7bac8a1c1761ff27af441 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 6 Aug 2024 12:18:19 -0400 Subject: [PATCH 2/4] Add bundle support for references. --- .../cmdline/StandardArgumentDefinitions.java | 8 +- .../hellbender/engine/FeatureInput.java | 5 +- .../hellbender/engine/MultiVariantWalker.java | 14 +- .../hellbender/tools/CreateBundle.java | 330 ++++++++++++++---- .../CachingIndexedFastaSequenceFile.java | 25 +- .../engine/BundleSupportIntegrationTest.java | 34 ++ .../MultiVariantWalkerIntegrationTest.java | 3 +- .../AbstractPrintReadsIntegrationTest.java | 56 ++- .../tools/CreateBundleIntegrationTest.java | 145 +++++--- .../tools/PrintReadsIntegrationTest.java | 57 ++- .../engine/print_reads_bundle_do_not_use.json | 10 + 11 files changed, 546 insertions(+), 141 deletions(-) create mode 100644 src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java index 40d6ccde930..f31c8d55655 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java @@ -56,10 +56,10 @@ private StandardArgumentDefinitions(){} public static final String INTERVALS_SHORT_NAME = "L"; public static final String COMPARISON_SHORT_NAME = "comp"; public static final String READ_INDEX_SHORT_NAME = READ_INDEX_LONG_NAME; - public static final String PRIMARY_INPUT_LONG_NAME = "primary"; - public static final String PRIMARY_INPUT_SHORT_NAME = "PI"; - public static final String SECONDARY_INPUT_LONG_NAME = "secondaryI"; - public static final String SECONDARY_INPUT_SHORT_NAME = "SI"; + public static final String PRIMARY_RESOURCE_LONG_NAME = "primary-resource"; + public static final String PRIMARY_RESOURCE_SHORT_NAME = "PR"; + public static final String SECONDARY_RESOURCE_LONG_NAME = "secondary-resource"; + public static final String SECONDARY_RESOURCE_SHORT_NAME = "SR"; public static final String LENIENT_SHORT_NAME = "LE"; public static final String READ_VALIDATION_STRINGENCY_SHORT_NAME = "VS"; public static final String SAMPLE_ALIAS_SHORT_NAME = "ALIAS"; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java index 801c92f13c6..e0a108d58ed 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java @@ -54,7 +54,8 @@ public final class FeatureInput extends GATKPath implements S private transient Class> featureCodecClass; /** - * retain any containing bundle in case we need to extract other resources from it + * retain the parent (enclosing) bundle from which this feature input is derived, in case we need to extract + * other resources from it */ private Bundle parentBundle; @@ -148,7 +149,7 @@ public FeatureInput( final Bundle featureBundle, final String name) { super(primaryResourcePath); - // retain the containing bundle for later so we can interrogate it for other resources, like the index + // retain the enclosing bundle for later, so we can interrogate it for other resources such as the index this.parentBundle = featureBundle; if (name != null) { if (primaryResourcePath.getTag() != null) { diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java index d1e63da1610..34ae1cb3fb8 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java @@ -86,15 +86,17 @@ protected void initializeDrivingVariants() { final List bundles = BundleJSON.toBundleList(IOUtils.getStringFromPath(gatkPath), GATKPath::new); for (final Bundle bundle : bundles) { if (bundle.getPrimaryContentType().equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { - // use the bundle primary resource as the FeatureInput URI, and tear off and attach the - // individual bundle the bundle to the FI as the parent bundle so downstream code can - // extract other resources from it on demand - // note that if the original value from the user has a tag, we can't use it unless there - // is only one input, since FIs have to be unique + // use the bundle primary resource as the FeatureInput URI, and tear off and attach + // the enclosing bundle as the parent bundle for the FI so downstream code can extract + // other resources from it on demand. note that if the original value from the user has + // a tag, we can't propagate it unless there is only one input, since FIs have to be + // unique final FeatureInput bundleFI = new FeatureInput<>( new GATKPath(bundle.getPrimaryResource().getIOPath().get().getURIString()), bundle, - bundles.size() > 1 ? gatkPath.getTag() : "drivingVariants" + bundles.size() > 1 ? + gatkPath.getTag() : + "drivingVariants" ); if (drivingVariantsFeatureInputs.contains(bundleFI)) { throw new UserException.BadInput("Feature inputs must be unique: " + gatkPath); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java index cf943b0b62a..bf88a964595 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools; import htsjdk.beta.io.bundle.*; +import htsjdk.beta.plugin.registry.HaploidReferenceResolver; import htsjdk.beta.plugin.variants.VariantsBundle; import htsjdk.samtools.util.FileExtensions; import org.apache.logging.log4j.LogManager; @@ -19,9 +20,175 @@ import java.util.*; /** - * Create a bundle (JSON) file for use with a GATK tool. + *

+ * Create a single bundle (JSON) file for one or more related inputs, for use as input to a GATK tool. + *

Bundle Structure

+ * A bundle is a JSON file that contains references to one or more related resources (i.e., a VCF file + * and it's associated index file, or a .fasta reference file and it's associated index and dictionary files). + * Bundle files can be supplied as inputs for many GATK tools. The primary advantage to using a bundle input + * over an individual file input is that a bundle allows the individual resources to be located in different + * directories (ie., when providing a VCF input, it's corresponding index file generally need to be a sibling + * file in the same directory as the VCF, whereas using a bundle file, because you reference each resource + * explicitly, the index can be located in a different directory from the VCF). + *

+ * Each resource in a bundle has an associated content type, which is a string that identifies the type of data + * in that resource. One resource in the bundle is always designated as the "primary" resource, which determines + * the type of the bundle. * - * other inputs are NEVER inferred, and must always be provided with a content type tag. + *

An example bundle JSON file is show below. The bundle has 3 resources, with content types + * "HAPLOID_REFERENCE", "REFERENCE_DICTIONARY", and "REFERENCE_INDEX" respectively, with the "HAPLOID_REFERENCE" + * resource designated as the primary resource. This bundle is a reference bundle, and can be used as an + * input where ever a reference argument is required. + *

+ *  {
+ *    "schema": {
+ *     "schemaVersion": "0.1.0",
+ *     "schemaName": "htsbundle"
+ *    },
+ *    "HAPLOID_REFERENCE": {"path": "file:///projects/print_reads.fasta"},
+ *    "REFERENCE_DICTIONARY": {"path": "file:///projects/print_reads.dict"},
+ *    "REFERENCE_INDEX": {"path": "file:///projects/print_reads.fasta.fai"},
+ *    "primary": "HAPLOID_REFERENCE"
+ *  }
+ * 
+ *

Using CreateBundle

+ *

+ * CreateBundle requires at least one primary resource input. One or more optional secondary resource inputs + * may also be supplied. + *

+ * The simplest way to use CreateBundle is to specify only the primary resource. In this case, with no secondary + * resources are explicitly provided, CreateBundle will attempt to locate and infer "standard" secondary resources + * (see "Standard Secondary Resources" below) for the primary resource, as long as: + *

    + *
  • the content type of the primary resource can be determined from the resources's file extension
  • + *
  • the primary resource content type is a well known type (i.e., a variants or a fasta file)
  • + *
  • the secondary resources are siblings (in the same parent directory) as the primary resource
  • + *
+ * The "--suppress-resource-resolution" argument can be used to suppress this secondary + * resource inference behavior. If the type of the primary resource is not expliclty provided, or cannot be + * determined from the file extensions, or if the standard secondary resources for a standard primary resource + * cannot be found in the same director as the primary, an exception will be thrown. + *

+ * Alternatively, you can explicitly specify all of the resources and their content types. This is useful when + * the resources are not in the same directory, or when the content types are not standard. + *

+ * For the primary resource, if the content type is not specified on the command line (content types are + * supplied on the command line using argument tags - see "Standard Content Types" and the examples below), + * CreateBundle will attempt to determine the content type of the primary resource. Content types for explicitly + * supplied secondary resources are never inferred, and must always be supplied explicitly. + *

+ * Bundle output file names must end with the suffix ".json". + *

+ *

Standard Content Types

+ * In general, bundle content types can be any string, but many tools expect bundles to use standard, well known + * content types that are pre-defined, such as content types for a VCF, a VCF index, a .fasta file, or a reference + * dictionary file. The common well known content types are: + *

Standard VCF Content Types:

+ *
    + *
  • "CT_VARIANT_CONTEXTS": a VCF file
  • + *
  • "CT_VARIANTS_INDEX": a VCF index file
  • + *
+ *

Standard Reference Content Types

+ *
    + *
  • "CT_HAPLOID_REFERENCE": a fasta reference file
  • + *
  • "CT_HAPLOID_REFERENCE_INDEX": a fasta index file
  • + *
  • "CT_HAPLOID_REFERENCE_DICTIONARY": a fasta dictionary file
  • + *

+ *

Standard Secondary Resources

+ *

For VCFS, the standard (inferred) secondary resources are:

+ *
    + *
  • an index file
  • + *
+ *

For references, the standard secondary resources are:

+ *
    + *
  • an index file
  • + *
  • a dictionary file
  • + *
+ *

Common bundle creation examples:

+ *

+ *

VCF Bundle Examples

+ *

+ * 1) Create a resource bundle for a VCF from just the VCF, letting the tool resolve the secondary (index) resource by + * automatically finding the sibling index file, and letting the tool determine the content types. If the sibling index + * file cannot be found, an exception will be thrown. The resulting bundle contains the VCF and associated index. + *

+ *    CreateBundle \
+ *      --primary path/to/my.vcf \
+ *      --output mybundle.json
+ * 

+ * The exact same bundle could be created manually by specifying both the resources and the content types explicitly: + *

+ *     CreateBundle \
+ *      --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
+ *      --secondary:CT_VARIANTS_INDEX path/to/my.vcf.idx \
+ *      --output mybundle.json
+ * 

+ *

+ * 2) Create a resource bundle for a VCF from just the VCF, but suppress automatic resolution of the secondary + * resources. Let the tool determine the content type. The resulting bundle will contain only the VCF resource: + *

+ *    CreateBundle \
+ *      --primary path/to/my.vcf \
+ *      --suppress-resource-resolution \
+ *      --output mybundle.json
+ * 

+ * 3) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (which + * suppresses automatic secondary resolution). This is useful when the VCF and index are not in the same directory. + * Let the tool determine the primary content type. The resulting bundle will contain the VCF and index resources: + *

+ *     CreateBundle \
+ *      --primary path/to/my.vcf \
+ *      --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
+ *      --output mybundle.json
+ * 

+ * 4) Create a resource bundle for a VCF, but specify the VCF AND the secondary index resource explicitly (this + * is useful when the VCF and index are not in the same directory), and specify the content types explicitly via + * command line argument tags. The resulting bundle will contain the VCF and index resources. + *

+ *     CreateBundle \
+ *      --primary:CT_VARIANT_CONTEXTS path/to/my.vcf \
+ *      --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \
+ *      --output mybundle.json
+ * 

+ *

Reference Bundle Examples

+ *

+ * 1) Create a resource bundle for a reference from just the .fasta, letting the tool resolve the secondary + * (index and dictionary) resource by automatically finding the sibling files, and determining the content types. + * If the sibling index file cannot be found, an exception will be thrown. The resulting bundle will contain the + * reference, index, and dictionary. + *

+ *    CreateBundle \
+ *      --primary path/to/my.fasta \
+ *      --output mybundle.json
+ * 

+ * 2) Create a resource bundle for a reference from just the .fasta, but suppress resolution of the secondary index and + * dictionary resources. Let the tool determine the content type. The resulting bundle will contain only the .fasta + * resource: + *

+ *    CreateBundle \
+ *      --primary path/to/my.fasta \
+ *      --suppress-resource-resolution \
+ *      --output mybundle.json
+ * 

+ * 3) Create a resource bundle for a fasta, but specify the fasta AND the secondary index and dictionary resources + * explicitly (which suppresses automatic secondary resolution). Let the tool determine the content types. The + * resulting bundle will contain the fasta, index and dictionary resources: + *

+ *     CreateBundle \
+ *      --primary path/to/my.fasta \
+ *      --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
+ *      --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
+ *      --output mybundle.json
+ * 

+ * 4) Create a resource bundle for a fasta, but specify the fasta, index and dictionary resources and the content + * types explicitly. The resulting bundle will contain the fasta, index and dictionary resources: + *

+ *     CreateBundle \
+ *      --primary:CT_HAPLOID_REFERENCE path/to/my.fasta \
+ *      --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
+ *      --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
+ *      --output mybundle.json
+ * 

*/ @DocumentedFeature @CommandLineProgramProperties( @@ -32,51 +199,53 @@ public class CreateBundle extends CommandLineProgram { protected static final Logger logger = LogManager.getLogger(CreateBundle.class); - public static final String SUPPRESS_INDEX_RESOLUTION_FULL_NAME = "suppress-index-resolution"; - public static final String OTHER_INPUT_FULL_NAME = "other-input"; + @Argument(fullName = StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME, + shortName = StandardArgumentDefinitions.PRIMARY_RESOURCE_SHORT_NAME, + doc="Path to the primary resource (content type will be inferred if no content type tag is specified)") + GATKPath primaryResource; - @Argument(fullName = StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.PRIMARY_INPUT_SHORT_NAME, - doc="Path to the primary bundle input (content type will be inferred if no content type tag is specified)") - GATKPath primaryInput; - - @Argument(fullName = StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.SECONDARY_INPUT_SHORT_NAME, - doc = "Path to a secondary bundle input for" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME - + "(usually an index). The type will be inferred if no content type tag is specified. If no "+ - " secondary input is specified, an index for the primary bundle file will be automatically inferred unless " + - SUPPRESS_INDEX_RESOLUTION_FULL_NAME + " is specified.", + @Argument(fullName = StandardArgumentDefinitions.SECONDARY_RESOURCE_LONG_NAME, + shortName = StandardArgumentDefinitions.SECONDARY_RESOURCE_SHORT_NAME, + doc = "Path to a secondary resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + + " (usually an index). If no secondary resource is specified, standard secondary resources" + + " for the primary bundle resource will be automatically inferred unless " + + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME + + " is specified.", optional = true) - GATKPath secondaryInput; + GATKPath secondaryResource; - @Argument(fullName = OTHER_INPUT_FULL_NAME, - shortName = OTHER_INPUT_FULL_NAME, - doc = "Path to other bundle inputs for " + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME - + " A content type tag MUST be provided for each other input.", + public static final String OTHER_RESOURCE_FULL_NAME = "other-resource"; + @Argument(fullName = OTHER_RESOURCE_FULL_NAME, + shortName = OTHER_RESOURCE_FULL_NAME, + doc = "Path to other bundle resources for " + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + + ". The content is not inferred for \"other\" (non-primary and non-secondary) resources, so a" + + " content type tag MUST be provided for each \"other\" resource.", optional = true) - List otherInputs; + List otherResources; - @Argument(fullName = SUPPRESS_INDEX_RESOLUTION_FULL_NAME, - doc ="Don't attempt to resolve the primary input's index file (defaults to false) if no secondary input is provided", + public static final String SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME = "suppress-resource-resolution"; + @Argument(fullName = SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME, + doc ="Don't attempt to resolve the primary resource's secondary resources if no secondary resource is explicitly" + + " provided (defaults to false) ", optional = true) - boolean suppressIndexResolution = false; + boolean suppressResourceResolution = false; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - doc = "Path the output bundle file (must end with the suffix .json.") + doc = "Path the output bundle file (must end with the suffix " + BundleJSON.BUNDLE_EXTENSION + ")") GATKPath outputBundlePath; private enum BundleType { VCF, REFERENCE, - OTHER + CUSTOM } private BundleType outputBundleType; @Override protected String[] customCommandLineValidation() { - if (!outputBundlePath.toString().endsWith(".json")){ - return new String[]{"Output bundle path must end with the suffix .json"}; + if (!outputBundlePath.toString().endsWith(BundleJSON.BUNDLE_EXTENSION)){ + return new String[]{ String.format("Output bundle path must end with the suffix %s", BundleJSON.BUNDLE_EXTENSION) }; } outputBundleType = determinePrimaryContentType(); return super.customCommandLineValidation(); @@ -87,9 +256,8 @@ protected Object doWork() { try (final BufferedWriter writer = Files.newBufferedWriter(outputBundlePath.toPath(), StandardOpenOption.CREATE)) { final Bundle bundle = switch (outputBundleType) { case VCF -> createVCFBundle(); - case REFERENCE -> - throw new IllegalArgumentException ("Reference bundles are not yet supported"); - case OTHER -> createOtherBundle(); + case REFERENCE -> createHaploidReferenceBundle(); + case CUSTOM -> createOtherBundle(); }; writer.write(BundleJSON.toJSON(bundle)); } catch (final IOException e) { @@ -102,63 +270,63 @@ private BundleType determinePrimaryContentType() { BundleType bundleType; // determine the type of bundle to create; consult the tag attributes if any, otherwise try to infer from the - // primary input file extension - final String primaryContentTag = primaryInput.getTag(); - if (primaryContentTag != null && !primaryContentTag.isEmpty()) { + // primary resource's file extension + final String primaryContentTag = primaryResource.getTag(); + if (primaryContentTag != null) { if (primaryContentTag.equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { bundleType = BundleType.VCF; } else if (primaryContentTag.equals(BundleResourceType.CT_HAPLOID_REFERENCE)) { bundleType = BundleType.REFERENCE; } else { - logger.info(String.format("Primary input content type %s for %s not recognized. A bundle will be created using content typse from the provided argument tags.", + logger.info(String.format("Primary input content type %s for %s not recognized. A bundle will be created using content types from the provided argument tags.", primaryContentTag, - primaryInput)); - bundleType = BundleType.OTHER; + primaryResource)); + bundleType = BundleType.CUSTOM; } } else { - logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the content type from the %s extension.", primaryInput)); - bundleType = inferPrimaryContentType(primaryInput); + logger.info(String.format("A content type for the primary input was not provided. Attempting to infer the primary content type from the %s extension.", primaryResource)); + bundleType = inferPrimaryContentType(primaryResource); } return bundleType; } - private BundleType inferPrimaryContentType(final GATKPath primaryInput) { + private BundleType inferPrimaryContentType(final GATKPath primaryResource) { logger.info("Attempting to infer bundle content type from file extension."); - if (FileExtensions.VCF_LIST.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + if (FileExtensions.VCF_LIST.stream().anyMatch(ext -> primaryResource.hasExtension(ext))) { return BundleType.VCF; - } else if (FileExtensions.FASTA.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + } else if (FileExtensions.FASTA.stream().anyMatch(ext -> primaryResource.hasExtension(ext))) { return BundleType.REFERENCE; } else { - throw new IllegalArgumentException(String.format("Unable to infer bundle content type from file extension %s. A content type must be provided as part of the argument.", primaryInput)); + throw new IllegalArgumentException(String.format("Unable to infer bundle content type from file extension %s. A content type must be provided as part of the argument.", primaryResource)); } } private Bundle createVCFBundle() { final Collection bundleResources = new ArrayList<>(); - bundleResources.add(new IOPathResource(primaryInput, BundleResourceType.CT_VARIANT_CONTEXTS)); - if (secondaryInput != null) { - final String secondaryContentType = secondaryInput.getTag(); + bundleResources.add(new IOPathResource(primaryResource, BundleResourceType.CT_VARIANT_CONTEXTS)); + if (secondaryResource != null) { + final String secondaryContentType = secondaryResource.getTag(); if (secondaryContentType == null) { - logger.info(String.format("A content type for the secondary input was not provided. Assuming %s is an index.", secondaryInput)); - bundleResources.add(new IOPathResource(secondaryInput, BundleResourceType.CT_VARIANTS_INDEX)); + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); } else { - bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); } - } else if (!suppressIndexResolution) { - // secondary input is null, and index resolution suppression is off - final Optional indexPath = VariantsBundle.resolveIndex(primaryInput, GATKPath::new); + } else if (!suppressResourceResolution) { + // secondary resources is null, and resource resolution suppression is off + final Optional indexPath = VariantsBundle.resolveIndex(primaryResource, GATKPath::new); if (indexPath.isEmpty()) { throw new IllegalArgumentException( String.format( "Could not infer an index for %s, you must either specify the index path as a secondary input on the command line or specify the %s argument.", - primaryInput.getRawInputString(), - SUPPRESS_INDEX_RESOLUTION_FULL_NAME)); + primaryResource.getRawInputString(), + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME)); } bundleResources.add(new IOPathResource(indexPath.get(), BundleResourceType.CT_VARIANTS_INDEX)); } - if (otherInputs != null) { - for (final GATKPath otherInput : otherInputs) { + if (otherResources != null && !otherResources.isEmpty()) { + for (final GATKPath otherInput : otherResources) { final String otherContentType = otherInput.getTag(); if (otherContentType == null) { throw new IllegalArgumentException( @@ -173,19 +341,53 @@ private Bundle createVCFBundle() { return new VariantsBundle(bundleResources); } + private Bundle createHaploidReferenceBundle() { + final Collection bundleResources = new ArrayList<>(); + if (secondaryResource == null && (otherResources == null || otherResources.isEmpty())) { + // infer dictionary and index + return HaploidReferenceResolver.referenceBundleFromFastaPath(primaryResource, GATKPath::new); + } + + bundleResources.add(new IOPathResource(primaryResource, primaryResource.getTag())); + if (secondaryResource != null) { + final String secondaryContentType = secondaryResource.getTag(); + if (secondaryContentType == null) { + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); + } + } + if (otherResources != null && !otherResources.isEmpty()) { + for (final GATKPath otherInput : otherResources) { + final String otherContentType = otherInput.getTag(); + if (otherContentType == null) { + throw new IllegalArgumentException( + String.format( + "A content type must be provided for \"other\" input %s.", + otherInput.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(otherInput, otherContentType)); + } + } + } + return new Bundle(primaryResource.getTag(), bundleResources); + } + private Bundle createOtherBundle() { final Collection bundleResources = new ArrayList<>(); - bundleResources.add(new IOPathResource(primaryInput, primaryInput.getTag())); - if (secondaryInput != null) { - final String secondaryContentType = secondaryInput.getTag(); + bundleResources.add(new IOPathResource(primaryResource, primaryResource.getTag())); + if (secondaryResource != null) { + final String secondaryContentType = secondaryResource.getTag(); if (secondaryContentType == null) { - throw new IllegalArgumentException(String.format("A content type for the secondary input must be provided.")); + throw new IllegalArgumentException(String.format("A content type for the secondary input %s must be provided.", + secondaryResource.getRawInputString())); } else { - bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + bundleResources.add(new IOPathResource(secondaryResource, secondaryContentType)); } } - if (otherInputs != null) { - for (final GATKPath otherInput : otherInputs) { + if (otherResources != null && !otherResources.isEmpty()) { + for (final GATKPath otherInput : otherResources) { final String otherContentType = otherInput.getTag(); if (otherContentType == null) { throw new IllegalArgumentException( @@ -197,6 +399,6 @@ private Bundle createOtherBundle() { } } } - return new Bundle(primaryInput.getTag(), bundleResources); + return new Bundle(primaryResource.getTag(), bundleResources); } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/fasta/CachingIndexedFastaSequenceFile.java b/src/main/java/org/broadinstitute/hellbender/utils/fasta/CachingIndexedFastaSequenceFile.java index 6cd2d678347..88b07140257 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -1,5 +1,9 @@ package org.broadinstitute.hellbender.utils.fasta; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.IOUtils; +import htsjdk.io.IOPath; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; @@ -143,14 +147,24 @@ public CachingIndexedFastaSequenceFile(final Path fasta, boolean preserveAmbigui * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns */ public CachingIndexedFastaSequenceFile(final Path fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { - // Check the FASTA path: - checkFastaPath(fasta); Utils.validate(cacheSize > 0, () -> "Cache size must be > 0 but was " + cacheSize); // Read reference data by creating an IndexedFastaSequenceFile. try { - final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta, true, true); - sequenceFile = requireIndex(fasta, referenceSequenceFile); + final IOPath fastaPath = new GATKPath(fasta.toUri().toString()); + if (fastaPath.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + final Bundle referenceBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(fastaPath), GATKPath::new); + final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFileFromBundle( + referenceBundle, + true, + true); + sequenceFile = requireIndex(fasta, referenceSequenceFile); + } else { + // Check the FASTA path: + checkFastaPath(fasta); + final ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta, GATKPath::new, true, true); + sequenceFile = requireIndex(fasta, referenceSequenceFile); + } this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.preserveCase = preserveCase; @@ -159,9 +173,6 @@ public CachingIndexedFastaSequenceFile(final Path fasta, final long cacheSize, f catch (final IllegalArgumentException e) { throw new UserException.CouldNotReadInputFile(fasta, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); } - catch (final Exception e) { - throw new UserException.CouldNotReadInputFile(fasta, e); - } } /** diff --git a/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java new file mode 100644 index 00000000000..04566869e0d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/engine/BundleSupportIntegrationTest.java @@ -0,0 +1,34 @@ +package org.broadinstitute.hellbender.engine; + +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.io.IOPath; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.IOException; + +public class BundleSupportIntegrationTest extends GATKBaseTest { + + // this test uses a serialized bundle file to ensure that we don't unintentionally pick up any + // code (like, from htsjdk) that introduces backward compatibility issues + @Test + public void testReadWriteSerializedReferenceBundle() throws IOException { + // This test file contains absolute paths to files on a local dev machine, so it shouldn't really be used + // for anything other than this test, since the absolute paths are unlikely to work on any other machine. + // But here we just want to make sure we can consume and roundtrip it without error + final IOPath testBundleFilePath = new GATKPath("src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json"); + + // get our test bundle from the file (ensure we canparse it), then write it out to a temp file, read it back + // in, and compare + final Bundle testBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath)); + final IOPath roundTrippedBundleFilePath = new GATKPath( + createTempPath("testReadWriteSerializedReferenceBundle", ".json").toString()); + IOPathUtils.writeStringToPath(roundTrippedBundleFilePath, BundleJSON.toJSON(testBundle)); + final Bundle roundTrippedBundle = BundleJSON.toBundle(IOPathUtils.getStringFromPath(testBundleFilePath)); + Assert.assertTrue(Bundle.equalsIgnoreOrder(roundTrippedBundle, testBundle)); + } + +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java index 315fbe3458d..99f135f273a 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java @@ -167,8 +167,7 @@ private static IOPath createRemoteBundleForFile( final File index1, final File vcf2, final File index2) throws IOException { - //TODO: replace this path with getGCPTestStaging() - final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir"); + final String remotePath = BucketUtils.randomRemotePath(getGCPTestStaging() + "remoteBundles", "remote_bundle_test", "dir"); final Path remoteDirPath = IOUtils.getPath(remotePath + "/"); Files.createDirectory(remoteDirPath); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/AbstractPrintReadsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/AbstractPrintReadsIntegrationTest.java index 160d2e43380..220704744a8 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/AbstractPrintReadsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/AbstractPrintReadsIntegrationTest.java @@ -1,5 +1,9 @@ package org.broadinstitute.hellbender.tools; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.registry.HaploidReferenceResolver; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; @@ -9,6 +13,7 @@ import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.ReadsPathDataSource; import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter; @@ -19,6 +24,7 @@ import org.broadinstitute.hellbender.testutils.SamAssertionUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -37,14 +43,20 @@ public void doFileToFile(String fileIn, String extOut, String reference, boolean String samFile = fileIn; final File outFile = GATKBaseTest.createTempFile(samFile + ".", extOut); final File ORIG_BAM = new File(TEST_DATA_DIR, samFile); - final File refFile; + final GATKPath refFile; final ArrayList args = new ArrayList<>(); args.add("--input"); args.add(ORIG_BAM.getAbsolutePath()); args.add("--output"); args.add(outFile.getAbsolutePath()); if (reference != null) { - refFile = new File(TEST_DATA_DIR, reference); - args.add("-R"); args.add(refFile.getAbsolutePath()); + if (reference.endsWith(BundleJSON.BUNDLE_EXTENSION)) { + // the test json files are temporary files, not files in TEST_DATA_DIR + refFile = new GATKPath(reference); + args.add("-R"); args.add(reference); + } else { + refFile = new GATKPath(new File(TEST_DATA_DIR, reference).getAbsolutePath()); + args.add("-R"); args.add(refFile.toString()); + } } else { refFile = null; @@ -55,13 +67,33 @@ public void doFileToFile(String fileIn, String extOut, String reference, boolean } runCommandLine(args); - SamAssertionUtils.assertSamsEqual(outFile, ORIG_BAM, refFile); + SamAssertionUtils.assertSamsEqual(outFile, ORIG_BAM, refFile == null ? null : refFile.toPath().toFile()); if (testMD5) { checkMD5asExpected(outFile); } } + public void doFileToFileUsingReferenceBundle(String fileIn, String extOut, String reference, boolean testMD5) throws Exception { + final String referenceToUse; + if (reference != null) { + // create the bundle, using inference to find the sibling files, then write the bundle out to a temp file + final Bundle referenceBundle = HaploidReferenceResolver.referenceBundleFromFastaPath( + new GATKPath(new File(TEST_DATA_DIR, reference).toPath().toString()), + GATKPath::new); + final GATKPath tempBundlePath = new GATKPath( + IOUtils.createTempFile("printReadsRefBundle", ".json").getAbsolutePath() + ); + IOPathUtils.writeStringToPath(tempBundlePath, BundleJSON.toJSON(referenceBundle)); + referenceToUse = tempBundlePath.toString(); + } else { + referenceToUse = reference; + } + + // no run the regular test, but using the reference bundle + doFileToFile(fileIn, extOut, referenceToUse, testMD5); + } + private void checkMD5asExpected(final File outFile) throws IOException { final File md5File = new File(outFile.getAbsolutePath() + ".md5"); if (md5File.exists()) { @@ -74,8 +106,8 @@ private void checkMD5asExpected(final File outFile) throws IOException { } @Test(dataProvider="testingData") - public void testFileToFile(String fileIn, String extOut, String reference) throws Exception { - doFileToFile(fileIn, extOut, reference, false); + public void testFileToFileWithReferenceBundle(String fileIn, String extOut, String reference) throws Exception { + doFileToFileUsingReferenceBundle(fileIn, extOut, reference, false); } @DataProvider(name="testingData") @@ -120,6 +152,18 @@ public Object[][] testingData() { }; } + @Test(dataProvider="testingData") + public void testFileToFileUsingReferenceBundle(String fileIn, String extOut, String reference) throws Exception { + if (reference != null) { + doFileToFileUsingReferenceBundle(fileIn, extOut, reference, false); + } + } + + @Test(dataProvider="testingData") + public void testFileToFile(String fileIn, String extOut, String reference) throws Exception { + doFileToFile(fileIn, extOut, reference, false); + } + @Test public void testReadThatConsumesNoReferenceBases() throws IOException { final File zeroRefBasesReadBam = new File(TEST_DATA_DIR, "read_consumes_zero_ref_bases.bam"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java index a39cfd450f4..d9904cdeaab 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java @@ -3,6 +3,7 @@ import htsjdk.beta.io.bundle.*; import htsjdk.beta.plugin.IOUtils; import htsjdk.beta.plugin.variants.VariantsBundle; +import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.GATKPath; @@ -17,74 +18,102 @@ public class CreateBundleIntegrationTest extends CommandLineProgramTest { // force our local paths to use absolute path names to make BundleResource and IOPath equality checks easier, - // since all serialized JSON bundles always contain absolute path names for local files + // since once a bundle is round-tripped/serialized to JSON, the resources will always contain absolute path names + // for local files + + //NOTE: These variables are Strings, but they are initialized to Strings obtained by first creating a GATKPath, + // and then calling getURIString on the resulting object. This is just shortcut to normalize them so they + // match the strings that will be embedded in the bundles created by the CreateBundle tool (i.e., to have + // full/absolute paths and protocol schemes). private final static String LOCAL_VCF = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf").getURIString(); private final static String LOCAL_VCF_IDX = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf.idx").getURIString(); private final static String LOCAL_VCF_GZIP = new GATKPath("src/test/resources/large/NA24385.vcf.gz").getURIString(); private final static String LOCAL_VCF_TBI = new GATKPath("src/test/resources/large/NA24385.vcf.gz.tbi").getURIString(); private final static String LOCAL_VCF_WITH_NO_INDEX = new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/count_variants_withSequenceDict_noIndex.vcf").getURIString(); - private final static String CLOUD_VCF = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; - private final static String CLOUD_VCF_IDX = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; + private final static String CLOUD_VCF = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; + private final static String CLOUD_VCF_IDX = GCS_GATK_TEST_RESOURCES + "large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; + + private final static String LOCAL_FASTA = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta").getURIString(); + private final static String LOCAL_FASTA_INDEX = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.fasta.fai").getURIString(); + private final static String LOCAL_FASTA_DICT = new GATKPath("src/test/resources/large/Homo_sapiens_assembly38.20.21.dict").getURIString(); - private final static String PRIMARY_CT = "primary_ct"; - private final static String SECONDARY_CT = "secondary_ct"; - private final static String OTHER_CT = "other_ct"; + private final static String CUSTOM_PRIMARY_CT = "primary_ct"; + private final static String CUSTOM_SECONDARY_CT = "secondary_ct"; + private final static String CUSTOM_OTHER_CT = "other_ct"; @DataProvider(name = "bundleCases") public Object[][] bundleCases() { return new Object[][] { - // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle + // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressResourceResolution, expectedBundle - // common VCF bundle cases, with inferred content types - {LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + // VCF bundle cases, with AUTOMATIC secondary resolution, and INFERRED primary content types {LOCAL_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, {LOCAL_VCF_GZIP, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, - {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + // VCF bundle cases, with SUPPRESSED secondary resolution, and INFERRED primary content types + {LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + // local vcf that has no index, but since suppressSecondaryResourceResolution is true, we don't throw since we don't try to infer the index + {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, {CLOUD_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF))}, - {CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, - {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, - {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + // VCF bundle cases, with AUTOMATIC secondary resolution, and EXPLICIT primary content types + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - // common vcf bundle cases, with explicit content types + // VCF bundle cases, with SUPPRESSED secondary resolution, and EXPLICIT content types {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, - {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, - // vcf bundle with a vcf, an index, and some other resource with an explicit content type + // vcf bundle with a vcf, an index, and some other custom resource with an explicit content type {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, Arrays.asList("someVariantsCompanion.txt"), Arrays.asList("someVariantsCT"), false, new BundleBuilder() .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), BundleResourceType.CT_VARIANT_CONTEXTS)) .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), BundleResourceType.CT_VARIANTS_INDEX)) .addSecondary(new IOPathResource(new GATKPath(new GATKPath("someVariantsCompanion.txt").getURIString()), "someVariantsCT")) - .build()}, + .build() + }, - // "other" bundles + // reference bundles + { LOCAL_FASTA, null, null, null, null, null, false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build() + }, + { LOCAL_FASTA, BundleResourceType.CT_HAPLOID_REFERENCE, LOCAL_FASTA_INDEX, BundleResourceType.CT_REFERENCE_INDEX, Arrays.asList(LOCAL_FASTA_DICT), Arrays.asList(BundleResourceType.CT_REFERENCE_DICTIONARY), false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_FASTA), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_INDEX), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_FASTA_DICT), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build() + }, + + // "custom" bundles { - LOCAL_VCF, PRIMARY_CT, null, null, null, null, true, + LOCAL_VCF, CUSTOM_PRIMARY_CT, null, null, null, null, true, new BundleBuilder() - .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), CUSTOM_PRIMARY_CT)) .build() }, { - LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, null, null, true, + LOCAL_VCF, CUSTOM_PRIMARY_CT, LOCAL_VCF_IDX, CUSTOM_SECONDARY_CT, null, null, true, new BundleBuilder() - .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) - .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), CUSTOM_PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), CUSTOM_SECONDARY_CT)) .build() }, { // frankenbundle with multiple resources - LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, Arrays.asList(LOCAL_VCF_TBI), Arrays.asList(OTHER_CT), true, + LOCAL_VCF, CUSTOM_PRIMARY_CT, LOCAL_VCF_IDX, CUSTOM_SECONDARY_CT, Arrays.asList(LOCAL_VCF_TBI), Arrays.asList(CUSTOM_OTHER_CT), true, new BundleBuilder() - .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) - .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) - .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_TBI), OTHER_CT)) + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), CUSTOM_PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), CUSTOM_SECONDARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_TBI), CUSTOM_OTHER_CT)) .build() }, }; @@ -95,17 +124,26 @@ public Object[][] negativeBundleCases() { return new Object[][] { // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle - // no index file can be inferred + // no vcf index file can be inferred {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, - // primary content type not provided and cannot be inferred from the extension + // vcf bundle with secondary/other content type not explicitly provided + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + // primary content type not provided, and cannot be inferred from the extension {"primaryFile.ext", null, null, null, null, null, false, null}, // secondary content type not provided - {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null}, + {"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null}, + + // reference input with unknown content type specified + { LOCAL_FASTA, null, LOCAL_FASTA_INDEX, "unknown", null, null, false, null}, // other bundle with other content type not provided - {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", SECONDARY_CT, Arrays.asList("other.txt"), null, false, null}, - // vcf bundle with other content type not provided - {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + {"primaryFile.ext", CUSTOM_PRIMARY_CT, "secondaryFile.ext", CUSTOM_SECONDARY_CT, Arrays.asList("other.txt"), null, false, null}, + }; } @@ -117,9 +155,9 @@ public void testBundleCases( final String secondaryInputTag, final List otherInputs, final List otherInputTags, - final boolean resolveIndex, + final boolean suppressResourceResolution, final Bundle expectedBundle) { - doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, suppressResourceResolution, expectedBundle); } @Test(dataProvider = "negativeBundleCases", expectedExceptions = IllegalArgumentException.class) @@ -130,9 +168,20 @@ public void testNegativeBundleCases( final String secondaryInputTag, final List otherInputs, final List otherInputTags, - final boolean resolveIndex, + final boolean suppressResourceResolution, final Bundle expectedBundle) { - doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, suppressResourceResolution, expectedBundle); + } + + @Test(expectedExceptions={CommandLineException.class}) + public void testRequireBundleExtension() { + final GATKPath outputPath = new GATKPath(createTempFile("test", ".bundle.BOGUS").getAbsolutePath().toString()); + final List args = new ArrayList<>(); + args.add("--" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME); + args.add(LOCAL_FASTA); + args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); + args.add(outputPath.toString()); + runCommandLine(args); } private void doCreateBundleTest( @@ -142,26 +191,26 @@ private void doCreateBundleTest( final String secondaryInputTag, final List otherInputs, final List otherInputTags, - final boolean suppressIndexResolution, + final boolean suppressResourceResolution, final Bundle expectedBundle) { final GATKPath outputPath = new GATKPath(createTempFile("test", ".bundle.json").getAbsolutePath().toString()); final List args = new ArrayList<>(); - args.add("--" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + (primaryInputTag != null ? ":" + primaryInputTag : "")); + args.add("--" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + (primaryInputTag != null ? ":" + primaryInputTag : "")); args.add(primaryInput); if (secondaryInput != null) { - args.add("--" + StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME + (secondaryInputTag != null ? ":" + secondaryInputTag : "")); + args.add("--" + StandardArgumentDefinitions.SECONDARY_RESOURCE_LONG_NAME + (secondaryInputTag != null ? ":" + secondaryInputTag : "")); args.add(secondaryInput); } if (otherInputs != null) { for (int i = 0; i < otherInputs.size(); i++) { - args.add("--" + CreateBundle.OTHER_INPUT_FULL_NAME + ((otherInputTags != null && otherInputTags.get(i) != null) ? ":" + otherInputTags.get(i) : "")); + args.add("--" + CreateBundle.OTHER_RESOURCE_FULL_NAME + ((otherInputTags != null && otherInputTags.get(i) != null) ? ":" + otherInputTags.get(i) : "")); args.add(otherInputs.get(i) != null ? otherInputs.get(i) : ""); } } - if (suppressIndexResolution == true) { - args.add("--" + CreateBundle.SUPPRESS_INDEX_RESOLUTION_FULL_NAME); + if (suppressResourceResolution == true) { + args.add("--" + CreateBundle.SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME); } args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); args.add(outputPath.toString()); @@ -170,7 +219,7 @@ private void doCreateBundleTest( final Bundle actualBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(outputPath), GATKPath::new); - // bundle resource order is not preserved when roundtripping through JSON, so compare ignoring order + // bundle resource order is not preserved when round-tripping through JSON, so compare ignoring order Assert.assertTrue(Bundle.equalsIgnoreOrder(actualBundle, expectedBundle)); } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java index 028b5965efc..5fb7702b797 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/PrintReadsIntegrationTest.java @@ -1,9 +1,13 @@ package org.broadinstitute.hellbender.tools; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.*; +import htsjdk.io.IOPath; +import htsjdk.samtools.*; +import htsjdk.samtools.cram.ref.ReferenceSource; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.ReadsPathDataSource; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; @@ -11,12 +15,15 @@ import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Arrays; import java.util.List; @@ -104,4 +111,50 @@ public void testHttpPaths(String reads, String index, String nonHttpReads, Strin SamAssertionUtils.assertEqualBamFiles(out, out2, false, ValidationStringency.DEFAULT_STRINGENCY); } + @Test(groups = {"cloud"}) + public void testPrintReadsWithReferenceBundle() throws IOException { + // test that both reading and writing a cram work when the reference is specified via a bundle where the + // reference, reference index, and reference dictionary are all in different buckets + final IOPath testFastaFile = new GATKPath(getTestDataDir() + "/print_reads.fasta"); + final IOPath testIndexFile = new GATKPath(getTestDataDir() + "/print_reads.fasta.fai"); + final IOPath testDictFile = new GATKPath(getTestDataDir() + "/print_reads.dict"); + + final String targetBucketName = BucketUtils.randomRemotePath(getGCPTestStaging(), "testPrintReadsWithReferenceBundle", "") + "/"; + final IOPath targetBucket = new GATKPath(targetBucketName); + IOUtils.deleteOnExit(targetBucket.toPath()); + + final Path remoteFasta = Files.copy(testFastaFile.toPath(), new GATKPath(targetBucketName + "print_reads.fasta").toPath()); + final IOPath targetIndex = new GATKPath(targetBucketName + "refindex/print_reads.fasta.fai"); + final Path remoteFastaIndex = Files.copy(testIndexFile.toPath(), targetIndex.toPath()); + final IOPath targetDict = new GATKPath(targetBucketName + "refdict/print_reads.dict"); + final Path remoteFastaDict = Files.copy(testDictFile.toPath(), targetDict.toPath()); + + // create a bundle with the remote reference, index, and dict files + final Bundle refBundle = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteFasta.toUri().toString()), BundleResourceType.CT_HAPLOID_REFERENCE)) + .addSecondary(new IOPathResource(new GATKPath(remoteFastaIndex.toUri().toString()), BundleResourceType.CT_REFERENCE_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(remoteFastaDict.toUri().toString()), BundleResourceType.CT_REFERENCE_DICTIONARY)) + .build(); + final IOPath bundleFilePath = new GATKPath(targetBucketName + "refBundle.json"); + IOPathUtils.writeStringToPath(bundleFilePath, BundleJSON.toJSON(refBundle)); + + final IOPath targetOutCRAM = new GATKPath(IOUtils.createTempFile("testReferenceSequenceForNioBundle", ".cram").getAbsolutePath()); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addInput(getTestDataDir() + "/print_reads.cram") + .addReference(bundleFilePath.toString()) + .addOutput(targetOutCRAM.toString()); + runCommandLine(args); + + int count = 0; + try (final SamReader in = SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .referenceSource(new ReferenceSource(bundleFilePath.toPath())) + .open(targetOutCRAM.toPath())) { + for (@SuppressWarnings("unused") final SAMRecord rec : in) { + count++; + } + } + Assert.assertEquals(count, 8); + } + } \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json new file mode 100644 index 00000000000..66fe7bf0ecb --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/engine/print_reads_bundle_do_not_use.json @@ -0,0 +1,10 @@ +{ + "schema": { + "schemaVersion": "0.1.0", + "schemaName": "htsbundle" + }, + "HAPLOID_REFERENCE": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta"}, + "REFERENCE_DICTIONARY": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.dict"}, + "REFERENCE_INDEX": {"path": "file:///Users/cnorman/projects/gatk/src/test/resources/org/broadinstitute/hellbender/tools/print_reads.fasta.fai"}, + "primary": "HAPLOID_REFERENCE" +} From 1609281694d55f0e54ae4484c45a1bcf3f85d7b7 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Wed, 4 Sep 2024 09:32:17 -0400 Subject: [PATCH 3/4] FIx up javadoc. --- .../hellbender/tools/CreateBundle.java | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java index bf88a964595..520e32d764a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -22,7 +22,7 @@ /** *

* Create a single bundle (JSON) file for one or more related inputs, for use as input to a GATK tool. - *

Bundle Structure

+ *

Bundle File Structure

* A bundle is a JSON file that contains references to one or more related resources (i.e., a VCF file * and it's associated index file, or a .fasta reference file and it's associated index and dictionary files). * Bundle files can be supplied as inputs for many GATK tools. The primary advantage to using a bundle input @@ -34,8 +34,8 @@ * Each resource in a bundle has an associated content type, which is a string that identifies the type of data * in that resource. One resource in the bundle is always designated as the "primary" resource, which determines * the type of the bundle. - * - *

An example bundle JSON file is show below. The bundle has 3 resources, with content types + *

+ * An example bundle JSON file is show below. The bundle has 3 resources, with content types * "HAPLOID_REFERENCE", "REFERENCE_DICTIONARY", and "REFERENCE_INDEX" respectively, with the "HAPLOID_REFERENCE" * resource designated as the primary resource. This bundle is a reference bundle, and can be used as an * input where ever a reference argument is required. @@ -51,7 +51,7 @@ * "primary": "HAPLOID_REFERENCE" * } * - *

Using CreateBundle

+ *

Using CreateBundle

*

* CreateBundle requires at least one primary resource input. One or more optional secondary resource inputs * may also be supplied. @@ -79,7 +79,7 @@ *

* Bundle output file names must end with the suffix ".json". *

- *

Standard Content Types

+ *

Standard Content Types

* In general, bundle content types can be any string, but many tools expect bundles to use standard, well known * content types that are pre-defined, such as content types for a VCF, a VCF index, a .fasta file, or a reference * dictionary file. The common well known content types are: @@ -88,25 +88,25 @@ *
  • "CT_VARIANT_CONTEXTS": a VCF file
  • *
  • "CT_VARIANTS_INDEX": a VCF index file
  • * - *

    Standard Reference Content Types

    + *

    Standard Reference Content Types:

    *
      *
    • "CT_HAPLOID_REFERENCE": a fasta reference file
    • *
    • "CT_HAPLOID_REFERENCE_INDEX": a fasta index file
    • *
    • "CT_HAPLOID_REFERENCE_DICTIONARY": a fasta dictionary file
    • *

    - *

    Standard Secondary Resources

    - *

    For VCFS, the standard (inferred) secondary resources are:

    + *

    Standard Secondary Resources

    + *

    VCF standard secondary resources:

    *
      *
    • an index file
    • *
    - *

    For references, the standard secondary resources are:

    + *

    FASTA reference standard secondary resources:

    *
      *
    • an index file
    • *
    • a dictionary file
    • *
    - *

    Common bundle creation examples:

    + *

    Bundle Creation Examples

    *

    - *

    VCF Bundle Examples

    + *

    VCF Bundle Examples

    *

    * 1) Create a resource bundle for a VCF from just the VCF, letting the tool resolve the secondary (index) resource by * automatically finding the sibling index file, and letting the tool determine the content types. If the sibling index @@ -150,7 +150,7 @@ * --secondary:CT_VARIANTS_INDEX some/other/path/to/vcd.idx \ * --output mybundle.json *

    - *

    Reference Bundle Examples

    + *

    Reference Bundle Examples

    *

    * 1) Create a resource bundle for a reference from just the .fasta, letting the tool resolve the secondary * (index and dictionary) resource by automatically finding the sibling files, and determining the content types. @@ -179,7 +179,7 @@ * --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \ * --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \ * --output mybundle.json - *

    + * * 4) Create a resource bundle for a fasta, but specify the fasta, index and dictionary resources and the content * types explicitly. The resulting bundle will contain the fasta, index and dictionary resources: *

    @@ -188,7 +188,7 @@
      *      --secondary:CT_HAPLOID_REFERENCE_INDEX some/other/path/to/my.fai \
      *      --secondary:CT_HAPLOID_REFERENCE_DICTIONARY some/other/path/to/my.dict \
      *      --output mybundle.json
    - * 

    + * */ @DocumentedFeature @CommandLineProgramProperties( From 735b2ecd006353687b64cabdac1aa046850bb333 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Wed, 4 Sep 2024 09:42:18 -0400 Subject: [PATCH 4/4] More text cleanup. --- .../hellbender/tools/CreateBundle.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java index 520e32d764a..8c2567c3309 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -27,9 +27,9 @@ * and it's associated index file, or a .fasta reference file and it's associated index and dictionary files). * Bundle files can be supplied as inputs for many GATK tools. The primary advantage to using a bundle input * over an individual file input is that a bundle allows the individual resources to be located in different - * directories (ie., when providing a VCF input, it's corresponding index file generally need to be a sibling - * file in the same directory as the VCF, whereas using a bundle file, because you reference each resource - * explicitly, the index can be located in a different directory from the VCF). + * directories (ie., when providing a VCF input, the corresponding index file generally needs to be a sibling + * file in the same directory as the VCF, whereas using a bundle file, because each resource is specified + * explicitly, the index file can be located in a different directory than the VCF). *

    * Each resource in a bundle has an associated content type, which is a string that identifies the type of data * in that resource. One resource in the bundle is always designated as the "primary" resource, which determines @@ -57,19 +57,19 @@ * may also be supplied. *

    * The simplest way to use CreateBundle is to specify only the primary resource. In this case, with no secondary - * resources are explicitly provided, CreateBundle will attempt to locate and infer "standard" secondary resources + * resources are explicitly provided, CreateBundle will attempt to locate and resolve "standard" secondary resources * (see "Standard Secondary Resources" below) for the primary resource, as long as: *

      *
    • the content type of the primary resource can be determined from the resources's file extension
    • *
    • the primary resource content type is a well known type (i.e., a variants or a fasta file)
    • *
    • the secondary resources are siblings (in the same parent directory) as the primary resource
    • *
    - * The "--suppress-resource-resolution" argument can be used to suppress this secondary - * resource inference behavior. If the type of the primary resource is not expliclty provided, or cannot be - * determined from the file extensions, or if the standard secondary resources for a standard primary resource - * cannot be found in the same director as the primary, an exception will be thrown. + * The "--suppress-resource-resolution" argument can be used to suppress secondary resource resolution behavior. + * If the type of the primary resource is not explicitly provided, or cannot be determined from the file extension, + * or if a standard secondary resource for a standard primary resource cannot be found in the same directory as + * the primary, an exception will be thrown. *

    - * Alternatively, you can explicitly specify all of the resources and their content types. This is useful when + * Alternatively, you can specify all of the resources and their content types explicitly. This is useful when * the resources are not in the same directory, or when the content types are not standard. *

    * For the primary resource, if the content type is not specified on the command line (content types are @@ -82,7 +82,7 @@ *

    Standard Content Types

    * In general, bundle content types can be any string, but many tools expect bundles to use standard, well known * content types that are pre-defined, such as content types for a VCF, a VCF index, a .fasta file, or a reference - * dictionary file. The common well known content types are: + * dictionary file. The common well known content type strings that can be used as command line argument tags are: *

    Standard VCF Content Types:

    *
      *
    • "CT_VARIANT_CONTEXTS": a VCF file
    • @@ -208,7 +208,7 @@ public class CreateBundle extends CommandLineProgram { shortName = StandardArgumentDefinitions.SECONDARY_RESOURCE_SHORT_NAME, doc = "Path to a secondary resource for" + StandardArgumentDefinitions.PRIMARY_RESOURCE_LONG_NAME + " (usually an index). If no secondary resource is specified, standard secondary resources" + - " for the primary bundle resource will be automatically inferred unless " + + " for the primary bundle resource will be automatically resolved unless " + SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME + " is specified.", optional = true) @@ -319,7 +319,7 @@ private Bundle createVCFBundle() { if (indexPath.isEmpty()) { throw new IllegalArgumentException( String.format( - "Could not infer an index for %s, you must either specify the index path as a secondary input on the command line or specify the %s argument.", + "Could not resolve an index for %s, you must either specify the index path as a secondary input on the command line or specify the %s argument.", primaryResource.getRawInputString(), SUPPRESS_SECONDARY_RESOURCE_RESOLUTION_FULL_NAME)); } @@ -344,7 +344,7 @@ private Bundle createVCFBundle() { private Bundle createHaploidReferenceBundle() { final Collection bundleResources = new ArrayList<>(); if (secondaryResource == null && (otherResources == null || otherResources.isEmpty())) { - // infer dictionary and index + // resolve dictionary and index return HaploidReferenceResolver.referenceBundleFromFastaPath(primaryResource, GATKPath::new); }