From c2b6beb002e7863d03918ff58b05289f47779b18 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 16 Jul 2024 11:51:04 -0400 Subject: [PATCH] Add bundle support for FeatureInputs. --- .../cmdline/StandardArgumentDefinitions.java | 4 + .../hellbender/engine/FeatureDataSource.java | 90 ++++++-- .../hellbender/engine/FeatureInput.java | 42 ++++ .../hellbender/engine/MultiVariantWalker.java | 57 ++++- .../hellbender/tools/CreateBundle.java | 202 ++++++++++++++++++ .../hellbender/utils/gcs/BucketUtils.java | 16 +- .../engine/FeatureDataSourceUnitTest.java | 18 +- .../engine/FeatureInputUnitTest.java | 24 ++- .../MultiVariantWalkerIntegrationTest.java | 129 +++++++++-- .../tools/CreateBundleIntegrationTest.java | 176 +++++++++++++++ .../SelectVariantsIntegrationTest.java | 81 +++++++ .../testSelectVariants_BundleWith_idx.vcf | 160 ++++++++++++++ .../testSelectVariants_BundleWith_tbi.vcf | 160 ++++++++++++++ 13 files changed, 1098 insertions(+), 61 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java index 1c8596eb91b..40d6ccde930 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java @@ -56,6 +56,10 @@ private StandardArgumentDefinitions(){} public static final String INTERVALS_SHORT_NAME = "L"; public static final String COMPARISON_SHORT_NAME = "comp"; public static final String READ_INDEX_SHORT_NAME = READ_INDEX_LONG_NAME; + public static final String PRIMARY_INPUT_LONG_NAME = "primary"; + public static final String PRIMARY_INPUT_SHORT_NAME = "PI"; + public static final String SECONDARY_INPUT_LONG_NAME = "secondaryI"; + public static final String SECONDARY_INPUT_SHORT_NAME = "SI"; public static final String LENIENT_SHORT_NAME = "LE"; public static final String READ_VALIDATION_STRINGENCY_SHORT_NAME = "VS"; public static final String SAMPLE_ALIAS_SHORT_NAME = "ALIAS"; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java index bd6fad2f6d2..85eb4d46a92 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java @@ -1,5 +1,10 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.io.bundle.BundleResource; +import htsjdk.beta.io.bundle.BundleResourceType; +import htsjdk.io.IOPath; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Locatable; @@ -148,7 +153,7 @@ public FeatureDataSource(final File featureFile) { * generated name, and will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) * during queries that produce cache misses. 
* - * @param featurePath path or URI to source of Features + * @param featurePath path or URI to source of Features (may be a Bundle) */ public FeatureDataSource(final String featurePath) { this(featurePath, null, DEFAULT_QUERY_LOOKAHEAD_BASES, null); @@ -159,7 +164,7 @@ public FeatureDataSource(final String featurePath) { * name. We will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries * that produce cache misses. * - * @param featureFile file containing Features + * @param featureFile file or Bundle containing Features * @param name logical name for this data source (may be null) */ public FeatureDataSource(final File featureFile, final String name) { @@ -170,7 +175,7 @@ public FeatureDataSource(final File featureFile, final String name) { * Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical * name. We will look ahead the specified number of bases during queries that produce cache misses. * - * @param featureFile file containing Features + * @param featureFile file or Bundle containing Features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses */ @@ -181,7 +186,7 @@ public FeatureDataSource(final File featureFile, final String name, final int qu /** * Creates a FeatureDataSource backed by the resource at the provided path. * - * @param featurePath path to file or GenomicsDB url containing features + * @param featurePath path to file or GenomicsDB url or Bundle containing features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs @@ -195,7 +200,7 @@ public FeatureDataSource(final String featurePath, final String name, final int * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (or a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -207,7 +212,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook /** * Creates a FeatureDataSource backed by the resource at the provided path. * - * @param featurePath path to file or GenomicsDB url containing features + * @param featurePath path to file or GenomicsDB url or Bundle containing features * @param name logical name for this data source (may be null) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs @@ -224,7 +229,7 @@ public FeatureDataSource(final String featurePath, final String name, final int * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. 
* - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -241,7 +246,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -259,7 +264,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -278,7 +283,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. @@ -296,7 +301,7 @@ public FeatureDataSource(final FeatureInput featureInput, final int queryLook * Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases * during queries that produce cache misses. * - * @param featureInput a FeatureInput specifying a source of Features + * @param featureInput a FeatureInput specifying a source of Features (may be a Bundle) * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses * @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs * that produce this type of Feature. May be null, which results in an unrestricted search. 
@@ -369,9 +374,26 @@ private static FeatureReader getFeatureReader(final Featu } catch (final ClassCastException e) { throw new UserException("GenomicsDB inputs can only be used to provide VariantContexts.", e); } + } else if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + // the feature input specifies a serialized JSON bundle file + final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new); + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + // to get the codec we have to use the path of the underlying vcf resource, not the bundle path + final FeatureInput fi = new FeatureInput(vcfPath.getRawInputString(), featureInput.getName()); + final FeatureCodec codec = getCodecForFeatureInput(fi, targetFeatureType, setNameOnCodec); + // propagate the bundle path, not the vcf path, to the reader, so that downstream code can retrieve + // the index path from the bundle + return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); + } else if (featureInput.getParentBundle() != null) { + // the featureInput was created from a bundle list expansion (i.e., MultiVariantWalkers). It has the + // primary resource as the underlying resource path, and the containing bundle attached as the + // "parent bundle". Use the original FI to get the codec, but to get the feature reader, we use + // the FI that contains the bundle path, since the feature reader may require access to the index + final FeatureCodec codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec); + return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); } else { final FeatureCodec codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec); - if ( featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION) ) { + if (featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION)) { return new Reader(featureInput, codec); } return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper); @@ -419,18 +441,48 @@ private static FeatureReader getFeatureReader(final Featu private static AbstractFeatureReader getTribbleFeatureReader(final FeatureInput featureInput, final FeatureCodec codec, final Function cloudWrapper, final Function cloudIndexWrapper) { Utils.nonNull(codec); try { - // Must get the path to the data file from the codec here: - final String absoluteRawPath = featureInput.getRawInputString(); - // Instruct the reader factory to not require an index. We will require one ourselves as soon as // a query by interval is attempted. final boolean requireIndex = false; - // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. - if (BucketUtils.isEligibleForPrefetching(featureInput)) { - return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + if (featureInput.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + final Bundle vcfBundle = BundleJSON.toBundle(htsjdk.beta.plugin.IOUtils.getStringFromPath(featureInput), GATKPath::new); + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + final Optional vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX); + final String rawIndexResourcePath = + vcfIndexPath.isPresent() ?
vcfIndexPath.get().getIOPath().get().getRawInputString() : null; + + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. + if (BucketUtils.isEligibleForPrefetching(vcfPath)) { + final String absoluteRawPath = vcfPath.getRawInputString(); + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(vcfPath.getRawInputString(), rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } + } else if (featureInput.getParentBundle() != null) { + final Bundle vcfBundle = featureInput.getParentBundle(); + // code path for when a user has specified multiple bundles on the command line, so there is no single + // serialized bundle file to access + final IOPath vcfPath = vcfBundle.getOrThrow(BundleResourceType.CT_VARIANT_CONTEXTS).getIOPath().get(); + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. + final Optional vcfIndexPath = vcfBundle.get(BundleResourceType.CT_VARIANTS_INDEX); + final String rawIndexResourcePath = + vcfIndexPath.isPresent() ? vcfIndexPath.get().getIOPath().get().getRawInputString() : null; + final String absoluteRawPath = vcfPath.getRawInputString(); + if (BucketUtils.isEligibleForPrefetching(vcfPath)) { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, rawIndexResourcePath, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } } else { - return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + final String absoluteRawPath = featureInput.getRawInputString(); + + // Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching. 
+ if (BucketUtils.isEligibleForPrefetching(featureInput)) { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper); + } else { + return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction()); + } } } catch (final TribbleException e) { throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e); diff --git a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java index 53bcb117afc..801c92f13c6 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.engine; import com.google.common.annotations.VisibleForTesting; +import htsjdk.beta.io.bundle.Bundle; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import org.apache.logging.log4j.LogManager; @@ -52,6 +53,11 @@ public final class FeatureInput extends GATKPath implements S */ private transient Class> featureCodecClass; + /** + * retain any containing bundle in case we need to extract other resources from it + */ + private Bundle parentBundle; + /** * Delimiter between the logical name and the file name in the --argument_name logical_name:feature_file syntax */ @@ -129,6 +135,34 @@ public FeatureInput(final String rawInputSpecifier, final String name, final Map setTagAttributes(keyValueMap); } + /** + * Construct a FeatureInput from a Bundle. + * + * @param primaryResourcePath the path for the primary feature resource for this bundle + * @param featureBundle an existing Bundle object; resources in this bundle MUST be IOPathBundleResources (that is, + * they must be backed by an IOPath, not an in-memory object) + * @param name the tag name for this feature input - may be null + */ + public FeatureInput( + final GATKPath primaryResourcePath, + final Bundle featureBundle, + final String name) { + super(primaryResourcePath); + // retain the containing bundle for later so we can interrogate it for other resources, like the index + this.parentBundle = featureBundle; + if (name != null) { + if (primaryResourcePath.getTag() != null) { + logger.warn(String.format( + "FeatureInput: user-provided tag name %s will be replaced with %s", + primaryResourcePath.getTag(), + name)); + } + setTag(name); + } + + } + + /** * Remember the FeatureCodec class for this input the first time it is discovered so we can bypass dynamic codec * discovery when multiple FeatureDataSources are created for the same input. @@ -144,6 +178,14 @@ public void setFeatureCodecClass(final Class> featureCodecCla return this.featureCodecClass; } + /** + * @return the parent bundle for this FeatureInput, if this input was derived from a Bundle. May + * return {@code null}. The returned bundle can be interrogated for companion resources. 
*/ + public Bundle getParentBundle() { + return parentBundle; + } + /** * creates a name from the given filePath by finding the absolute path of the given input */ diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java index 7f12ff641e2..d1e63da1610 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java @@ -1,5 +1,10 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.io.bundle.BundleResource; +import htsjdk.beta.io.bundle.BundleResourceType; +import htsjdk.beta.plugin.IOUtils; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; @@ -74,17 +79,49 @@ public boolean doDictionaryCrossValidation() { @Override protected void initializeDrivingVariants() { multiVariantInputArgumentCollection.getDrivingVariantPaths().stream().forEach( - f -> { - FeatureInput featureInput = new FeatureInput<>(f); - if (drivingVariantsFeatureInputs.contains(featureInput)) { - throw new UserException.BadInput("Feature inputs must be unique: " + featureInput.toString()); + gatkPath -> { + if (gatkPath.hasExtension(BundleJSON.BUNDLE_EXTENSION)) { + // expand any bundle(s) into one or more FeatureInputs (depending on whether it is a single + // bundle or list) + final List bundles = BundleJSON.toBundleList(IOUtils.getStringFromPath(gatkPath), GATKPath::new); + for (final Bundle bundle : bundles) { + if (bundle.getPrimaryContentType().equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { + // use the bundle primary resource as the FeatureInput URI, and tear off and attach the + // individual bundle to the FI as the parent bundle so downstream code can + // extract other resources from it on demand + // note that if the original value from the user has a tag, we can't use it unless there + // is only one input, since FIs have to be unique + final FeatureInput bundleFI = new FeatureInput<>( + new GATKPath(bundle.getPrimaryResource().getIOPath().get().getURIString()), + bundle, + bundles.size() > 1 ? gatkPath.getTag() : "drivingVariants" + ); + if (drivingVariantsFeatureInputs.contains(bundleFI)) { + throw new UserException.BadInput("Feature inputs must be unique: " + gatkPath); + } + drivingVariantsFeatureInputs.add(bundleFI); + // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value + // of 0 to avoid caching because of windowed queries that need to "look behind" as well. + features.addToFeatureSources(0, bundleFI, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, + referenceArguments.getReferencePath()); + } else { + final BundleResource br = bundle.getPrimaryResource(); + throw new UserException.BadInput( + String.format( + "Feature input bundles must have a primary resource of type %s.
Found content type %s with path %s", + BundleResourceType.CT_VARIANT_CONTEXTS, + bundle.getPrimaryContentType(), + br.getIOPath().get())); + } + } + } else { + final FeatureInput featureInput = new FeatureInput<>(gatkPath); + drivingVariantsFeatureInputs.add(featureInput); + // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value + // of 0 to avoid caching because of windowed queries that need to "look behind" as well. + features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, + referenceArguments.getReferencePath()); } - drivingVariantsFeatureInputs.add(featureInput); - - // Add each driving variants FeatureInput to the feature manager so that it can be queried, using a lookahead value - // of 0 to avoid caching because of windowed queries that need to "look behind" as well. - features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, - referenceArguments.getReferencePath()); } ); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java new file mode 100644 index 00000000000..cf943b0b62a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/CreateBundle.java @@ -0,0 +1,202 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.beta.io.bundle.*; +import htsjdk.beta.plugin.variants.VariantsBundle; +import htsjdk.samtools.util.FileExtensions; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import picard.cmdline.programgroups.OtherProgramGroup; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.StandardOpenOption; +import java.util.*; + +/** + * Create a bundle (JSON) file for use with a GATK tool. + * + * other inputs are NEVER inferred, and must always be provided with a content type tag. + */ +@DocumentedFeature +@CommandLineProgramProperties( + summary = "Create a bundle (JSON) file for use with a GATK tool", + oneLineSummary = "Create a bundle (JSON) file for use with a GATK tool", + programGroup = OtherProgramGroup.class +) +public class CreateBundle extends CommandLineProgram { + protected static final Logger logger = LogManager.getLogger(CreateBundle.class); + + public static final String SUPPRESS_INDEX_RESOLUTION_FULL_NAME = "suppress-index-resolution"; + public static final String OTHER_INPUT_FULL_NAME = "other-input"; + + @Argument(fullName = StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.PRIMARY_INPUT_SHORT_NAME, + doc="Path to the primary bundle input (content type will be inferred if no content type tag is specified)") + GATKPath primaryInput; + + @Argument(fullName = StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.SECONDARY_INPUT_SHORT_NAME, + doc = "Path to a secondary bundle input for" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + + "(usually an index). The type will be inferred if no content type tag is specified. 
If no "+ + "secondary input is specified, an index for the primary bundle file will be automatically inferred unless " + SUPPRESS_INDEX_RESOLUTION_FULL_NAME + " is specified.", + optional = true) + GATKPath secondaryInput; + + @Argument(fullName = OTHER_INPUT_FULL_NAME, + shortName = OTHER_INPUT_FULL_NAME, + doc = "Path to other bundle inputs for " + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + + ". A content type tag MUST be provided for each other input.", + optional = true) + List otherInputs; + + @Argument(fullName = SUPPRESS_INDEX_RESOLUTION_FULL_NAME, + doc = "Don't attempt to resolve the primary input's index file if no secondary input is provided (defaults to false)", + optional = true) + boolean suppressIndexResolution = false; + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Path to the output bundle file (must end with the suffix .json).") + GATKPath outputBundlePath; + + private enum BundleType { + VCF, + REFERENCE, + OTHER + } + private BundleType outputBundleType; + + @Override + protected String[] customCommandLineValidation() { + if (!outputBundlePath.toString().endsWith(".json")) { + return new String[]{"Output bundle path must end with the suffix .json"}; + } + outputBundleType = determinePrimaryContentType(); + return super.customCommandLineValidation(); + } + + @Override + protected Object doWork() { + try (final BufferedWriter writer = Files.newBufferedWriter(outputBundlePath.toPath(), StandardOpenOption.CREATE)) { + final Bundle bundle = switch (outputBundleType) { + case VCF -> createVCFBundle(); + case REFERENCE -> + throw new IllegalArgumentException("Reference bundles are not yet supported"); + case OTHER -> createOtherBundle(); + }; + writer.write(BundleJSON.toJSON(bundle)); + } catch (final IOException e) { + throw new RuntimeException(String.format("Failed writing bundle to output %s", outputBundlePath), e); + } + return null; + } + + private BundleType determinePrimaryContentType() { + BundleType bundleType; + + // determine the type of bundle to create; consult the tag attributes if any, otherwise try to infer from the + // primary input file extension + final String primaryContentTag = primaryInput.getTag(); + if (primaryContentTag != null && !primaryContentTag.isEmpty()) { + if (primaryContentTag.equals(BundleResourceType.CT_VARIANT_CONTEXTS)) { + bundleType = BundleType.VCF; + } else if (primaryContentTag.equals(BundleResourceType.CT_HAPLOID_REFERENCE)) { + bundleType = BundleType.REFERENCE; + } else { + logger.info(String.format("Primary input content type %s for %s not recognized. A bundle will be created using content types from the provided argument tags.", + primaryContentTag, + primaryInput)); + bundleType = BundleType.OTHER; + } + } else { + logger.info(String.format("A content type for the primary input was not provided.
Attempting to infer the content type from the %s extension.", primaryInput)); + bundleType = inferPrimaryContentType(primaryInput); + } + return bundleType; + } + + private BundleType inferPrimaryContentType(final GATKPath primaryInput) { + logger.info("Attempting to infer bundle content type from file extension."); + if (FileExtensions.VCF_LIST.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + return BundleType.VCF; + } else if (FileExtensions.FASTA.stream().anyMatch(ext -> primaryInput.hasExtension(ext))) { + return BundleType.REFERENCE; + } else { + throw new IllegalArgumentException(String.format("Unable to infer bundle content type from file extension %s. A content type must be provided as part of the argument.", primaryInput)); + } + } + + private Bundle createVCFBundle() { + final Collection bundleResources = new ArrayList<>(); + + bundleResources.add(new IOPathResource(primaryInput, BundleResourceType.CT_VARIANT_CONTEXTS)); + if (secondaryInput != null) { + final String secondaryContentType = secondaryInput.getTag(); + if (secondaryContentType == null) { + logger.info(String.format("A content type for the secondary input was not provided. Assuming %s is an index.", secondaryInput)); + bundleResources.add(new IOPathResource(secondaryInput, BundleResourceType.CT_VARIANTS_INDEX)); + } else { + bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + } + } else if (!suppressIndexResolution) { + // secondary input is null, and index resolution suppression is off + final Optional indexPath = VariantsBundle.resolveIndex(primaryInput, GATKPath::new); + if (indexPath.isEmpty()) { + throw new IllegalArgumentException( + String.format( + "Could not infer an index for %s; you must either specify the index path as a secondary input on the command line or specify the %s argument.", + primaryInput.getRawInputString(), + SUPPRESS_INDEX_RESOLUTION_FULL_NAME)); + } + bundleResources.add(new IOPathResource(indexPath.get(), BundleResourceType.CT_VARIANTS_INDEX)); + } + if (otherInputs != null) { + for (final GATKPath otherInput : otherInputs) { + final String otherContentType = otherInput.getTag(); + if (otherContentType == null) { + throw new IllegalArgumentException( + String.format( + "A content type must be provided for \"other\" input %s.", + otherInput.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(otherInput, otherContentType)); + } + } + } + return new VariantsBundle(bundleResources); + } + + private Bundle createOtherBundle() { + final Collection bundleResources = new ArrayList<>(); + bundleResources.add(new IOPathResource(primaryInput, primaryInput.getTag())); + if (secondaryInput != null) { + final String secondaryContentType = secondaryInput.getTag(); + if (secondaryContentType == null) { + throw new IllegalArgumentException("A content type for the secondary input must be provided."); + } else { + bundleResources.add(new IOPathResource(secondaryInput, secondaryContentType)); + } + } + if (otherInputs != null) { + for (final GATKPath otherInput : otherInputs) { + final String otherContentType = otherInput.getTag(); + if (otherContentType == null) { + throw new IllegalArgumentException( + String.format( + "A content type must be provided for \"other\" input %s.", + otherInput.getRawInputString())); + } else { + bundleResources.add(new IOPathResource(otherInput, otherContentType)); + } + } + } + return new Bundle(primaryInput.getTag(), bundleResources); + } +} diff --git
a/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java index 9eeaffb6223..88cdb2a058b 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/gcs/BucketUtils.java @@ -12,6 +12,7 @@ import com.google.cloud.storage.contrib.nio.SeekableByteChannelPrefetcher; import com.google.common.base.Strings; import com.google.common.io.ByteStreams; +import htsjdk.io.IOPath; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.RuntimeIOException; @@ -19,7 +20,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.Utils; @@ -68,20 +68,20 @@ public static boolean isGcsUrl(final String path) { } /** - * Return true if this {@code GATKPath} represents a gcs URI. + * Return true if this {@code IOPath} represents a gcs URI. * @param pathSpec specifier to inspect - * @return true if this {@code GATKPath} represents a gcs URI. + * @return true if this {@code IOPath} represents a gcs URI. */ - public static boolean isGcsUrl(final GATKPath pathSpec) { + public static boolean isGcsUrl(final IOPath pathSpec) { Utils.nonNull(pathSpec); return pathSpec.getScheme().equals(GoogleCloudStorageFileSystem.SCHEME); } /** * @param pathSpec specifier to inspect - * @return true if this {@code GATKPath} represents a remote storage system which may benefit from prefetching (gcs or http(s)) + * @return true if this {@code IOPath} represents a remote storage system which may benefit from prefetching (gcs or http(s)) */ - public static boolean isEligibleForPrefetching(final GATKPath pathSpec) { + public static boolean isEligibleForPrefetching(final IOPath pathSpec) { Utils.nonNull(pathSpec); return isEligibleForPrefetching(pathSpec.getScheme()); } @@ -320,10 +320,10 @@ public static long fileSize(String path) throws IOException { * Note that sub-directories are ignored - they are not recursed into. * Only supports HDFS and local paths. 
* - * @param pathSpecifier The URL to the file or directory whose size to return + * @param pathSpecifier The IOPath to the file or directory whose size to return * @return the total size of all files in bytes */ - public static long dirSize(final GATKPath pathSpecifier) { + public static long dirSize(final IOPath pathSpecifier) { try { // GCS case (would work with local too) if (isGcsUrl(pathSpecifier)) { diff --git a/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java index c8b0b42b9bf..634121291e4 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/FeatureDataSourceUnitTest.java @@ -1,5 +1,8 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.tribble.Feature; import htsjdk.variant.variantcontext.VariantContext; @@ -9,7 +12,6 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -413,6 +415,20 @@ public void testQueryGVCF( final SimpleInterval queryInterval, final List expectedVariantIDs ) { + final GATKPath vcfPath = new GATKPath(QUERY_TEST_GVCF.getAbsolutePath()); + final GATKPath indexPath = VariantsBundle.resolveIndex(vcfPath, GATKPath::new).get(); + final VariantsBundle vcfBundle = new VariantsBundle(vcfPath, indexPath); + final GATKPath bundleFile = new GATKPath(createTempFile("testQueryGVCFThroughBundle", ".json").toString()); + IOPathUtils.writeStringToPath(bundleFile, BundleJSON.toJSON(vcfBundle)); + + try ( FeatureDataSource featureSource = new FeatureDataSource<>(bundleFile.toString()) ) { + final List queryResults = featureSource.queryAndPrefetch(queryInterval); + checkVariantQueryResults(queryResults, expectedVariantIDs, queryInterval); + } + } + /************************************************** * Direct testing on the FeatureCache inner class **************************************************/ diff --git a/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java index 9c09edb1a28..87257d61f6f 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/FeatureInputUnitTest.java @@ -1,5 +1,9 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.Bundle; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; import htsjdk.tribble.Feature; import htsjdk.tribble.FeatureCodec; import htsjdk.variant.variantcontext.VariantContext; @@ -7,7 +11,6 @@ import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineArgumentParser; import org.broadinstitute.barclay.argparser.CommandLineException; -import org.broadinstitute.barclay.argparser.CommandLineParser; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.testutils.SparkTestUtils; import 
org.testng.Assert; @@ -150,6 +153,25 @@ public void testHdfsPathAndName( final String argWithTags, final String inputVal Assert.assertEquals(hdfsInput.getName(), expectedLogicalName, "wrong logical name"); } + @Test(dataProvider = "GcsPathAndNameData", groups={"bucket"}) + public void testGcsBundlePathAndName( final String argWithTags, final String inputValue, final String unusedFeaturePath, final String expectedLogicalName ) { + // reuse the GcsPathAndNameData data provider, but with the input written to a bundle file + final Bundle bundleOfGcsPaths = new VariantsBundle(new GATKPath(inputValue)); + final GATKPath bundleFile = new GATKPath(createTempFile("testGcsBundlePathAndName", ".json").toString()); + IOPathUtils.writeStringToPath(new GATKPath(bundleFile.toString()), BundleJSON.toJSON(bundleOfGcsPaths)); + + final FeatureInput gcsInput = runCommandLineWithTaggedFeatureInput(argWithTags, bundleFile.toString()); + + Assert.assertEquals(gcsInput.getFeaturePath(), bundleFile.toString(), "wrong featurePath"); + if (argWithTags.contains(":")) { + Assert.assertEquals(gcsInput.getName(), expectedLogicalName, "wrong logical name"); + } else { + // if the input arg has no tags, then the expected logical name is the same as the input value, but + // in this test, the input value is actually the bundle json file name, not the original input value + Assert.assertEquals(gcsInput.getName(), bundleFile.toString(), "wrong logical name"); + } + } + @Test public void testFeatureNameSpecified() { final FeatureInput featureInput = runCommandLineWithTaggedFeatureInput("argName:myName", "myFile"); diff --git a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java index dca3fc7ee59..315fbe3458d 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerIntegrationTest.java @@ -1,5 +1,8 @@ package org.broadinstitute.hellbender.engine; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.*; +import htsjdk.io.IOPath; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -8,11 +11,16 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.gcs.BucketUtils; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -77,25 +85,25 @@ public void testDuplicateSources() throws Exception { @DataProvider(name="variantFiles") public Object[][] getVariantFiles() { return new Object[][] - { - { Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), null, 26 }, - { Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), null, 13 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), null, 26 }, - { Arrays.asList( - new File(getTestDataDir(), "splitVariants_1.vcf"), - new File(getTestDataDir(), 
"splitVariants_2.vcf")), null, 26 }, - - // with intervals - { Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), "1", 14 }, - { Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), "1", 7 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), "1", 14 }, - { Arrays.asList( - new File(getTestDataDir(), "interleavedVariants_1.vcf"), - new File(getTestDataDir(), "interleavedVariants_2.vcf")), "2:200-600", 3 }, + { + {Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), null, 26}, + {Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), null, 13}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), null, 26}, + {Arrays.asList( + new File(getTestDataDir(), "splitVariants_1.vcf"), + new File(getTestDataDir(), "splitVariants_2.vcf")), null, 26}, + + // with intervals + {Arrays.asList(new File(getTestDataDir(), "baseVariants.vcf")), "1", 14}, + {Arrays.asList(new File(getTestDataDir(), "interleavedVariants_1.vcf")), "1", 7}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), "1", 14}, + {Arrays.asList( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf")), "2:200-600", 3}, }; } @@ -115,10 +123,87 @@ public void testVariantOrder(final List inputFiles, final String interval, Assert.assertEquals(tool.count, expectedCount); } + // tests using bundles + @DataProvider(name="variantBundles") + public Object[][] getVariantBundles() throws IOException { + return new Object[][] + { + {createRemoteBundleForFile( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_1.vcf.idx"), + new File(getTestDataDir(), "interleavedVariants_2.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf.idx") + ), "1", 14}, + {createRemoteBundleForFile( + new File(getTestDataDir(), "interleavedVariants_1.vcf"), + new File(getTestDataDir(), "interleavedVariants_1.vcf.idx"), + new File(getTestDataDir(), "interleavedVariants_2.vcf"), + new File(getTestDataDir(), "interleavedVariants_2.vcf.idx") + ), "2:200-600", 3}, + }; + }; + + @Test(dataProvider = "variantBundles") + public void testVariantsFromBundleOrder(final IOPath inputBundleFile, final String interval, final int expectedCount) { + final TestMultiVariantWalker tool = new TestMultiVariantWalker(); + + final List args = new ArrayList<>(); + args.add("--variant"); + args.add(inputBundleFile.getRawInputString()); + if (interval != null) { + args.add("-L"); + args.add(interval); + } + + tool.instanceMain(args.toArray(new String[args.size()])); + Assert.assertEquals(tool.count, expectedCount); + } + + // copy two .vcfs to a temporary remote location; create a bundle list with two bundles, each containing + // a reference to the remote vcf and it's local companion index file, and then write the whole bundle out to + // a temporary remote bundle file + private static IOPath createRemoteBundleForFile( + final File vcf1, + final File index1, + final File vcf2, + final File index2) throws IOException { + //TODO: replace this path with getGCPTestStaging() + final String remotePath = BucketUtils.randomRemotePath("gs://hellbender/test/staging/remoteBundles", "remote_bundle_test", "dir"); + final Path remoteDirPath = IOUtils.getPath(remotePath + "/"); + 
+ Files.createDirectory(remoteDirPath); + Assert.assertTrue(Files.exists(remoteDirPath)); + Assert.assertTrue(Files.isDirectory(remoteDirPath)); + final Path remoteVCF1 = IOUtils.getPath(remotePath + "/" + vcf1.getName()); + final Path remoteVCF2 = IOUtils.getPath(remotePath + "/" + vcf2.getName()); + final Path c_remoteVCF1 = Files.copy(vcf1.toPath(), remoteVCF1); + if (!c_remoteVCF1.equals(remoteVCF1)) { + throw new IOException("Not equal " + vcf1 + " to " + remoteVCF1); + } + final Path c_remoteVCF2 = Files.copy(vcf2.toPath(), remoteVCF2); + if (!c_remoteVCF2.equals(remoteVCF2)) { + throw new IOException("Not equal " + vcf2 + " to " + remoteVCF2); + } + + final Bundle variantsBundle1 = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteVCF1.toUri().toString()), BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(index1.toURI().toString()), BundleResourceType.CT_VARIANTS_INDEX)) + .build(); + final Bundle variantsBundle2 = new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(remoteVCF2.toUri().toString()), BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(index2.toURI().toString()), BundleResourceType.CT_VARIANTS_INDEX)) + .build(); + + final List bundles = Arrays.asList(variantsBundle1, variantsBundle2); + final GATKPath remoteBundlePath = new GATKPath(remoteDirPath.resolve("remote_bundle.json").toUri().toString()); + IOPathUtils.writeStringToPath(remoteBundlePath, BundleJSON.toJSON(bundles)); + return remoteBundlePath; + } + @CommandLineProgramProperties( - summary = "TestGATKToolWithFeatures", - oneLineSummary = "TestGATKToolWithFeatures", - programGroup = TestProgramGroup.class + summary = "TestGATKToolWithFeatures", + oneLineSummary = "TestGATKToolWithFeatures", + programGroup = TestProgramGroup.class ) private static final class TestMultiVariantWalkerIterator extends MultiVariantWalker { String expectedIDOrder[]; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java new file mode 100644 index 00000000000..a39cfd450f4 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/CreateBundleIntegrationTest.java @@ -0,0 +1,176 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.beta.io.bundle.*; +import htsjdk.beta.plugin.IOUtils; +import htsjdk.beta.plugin.variants.VariantsBundle; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class CreateBundleIntegrationTest extends CommandLineProgramTest { + + // force our local paths to use absolute path names to make BundleResource and IOPath equality checks easier, + // since all serialized JSON bundles always contain absolute path names for local files + private final static String LOCAL_VCF = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf").getURIString(); + private final static String LOCAL_VCF_IDX = new GATKPath(getTestDataDir() + "/count_variants_withSequenceDict.vcf.idx").getURIString(); + private final static String LOCAL_VCF_GZIP = new GATKPath("src/test/resources/large/NA24385.vcf.gz").getURIString(); + private final 
static String LOCAL_VCF_TBI = new GATKPath("src/test/resources/large/NA24385.vcf.gz.tbi").getURIString(); + private final static String LOCAL_VCF_WITH_NO_INDEX = new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/count_variants_withSequenceDict_noIndex.vcf").getURIString(); + private final static String CLOUD_VCF = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf"; + private final static String CLOUD_VCF_IDX = "gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx"; + + private final static String PRIMARY_CT = "primary_ct"; + private final static String SECONDARY_CT = "secondary_ct"; + private final static String OTHER_CT = "other_ct"; + + @DataProvider(name = "bundleCases") + public Object[][] bundleCases() { + return new Object[][] { + // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle + + // common VCF bundle cases, with inferred content types + {LOCAL_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + {LOCAL_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, null, LOCAL_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + {LOCAL_VCF_GZIP, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + {LOCAL_VCF_GZIP, null, LOCAL_VCF_TBI, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF_GZIP), new GATKPath(LOCAL_VCF_TBI))}, + + {CLOUD_VCF, null, null, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF))}, + {CLOUD_VCF, null, null, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, true, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + {CLOUD_VCF, null, CLOUD_VCF_IDX, null, null, null, false, new VariantsBundle(new GATKPath(CLOUD_VCF), new GATKPath(CLOUD_VCF_IDX))}, + + // common vcf bundle cases, with explicit content types + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, true, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF), new GATKPath(LOCAL_VCF_IDX))}, + + // vcf bundle with a vcf, an index, and some other resource with an explicit content type + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, LOCAL_VCF_IDX, BundleResourceType.CT_VARIANTS_INDEX, Arrays.asList("someVariantsCompanion.txt"), Arrays.asList("someVariantsCT"), false, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), 
BundleResourceType.CT_VARIANT_CONTEXTS)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), BundleResourceType.CT_VARIANTS_INDEX)) + .addSecondary(new IOPathResource(new GATKPath(new GATKPath("someVariantsCompanion.txt").getURIString()), "someVariantsCT")) + .build()}, + + // "other" bundles + { + LOCAL_VCF, PRIMARY_CT, null, null, null, null, true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .build() + }, + { + LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, null, null, true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) + .build() + }, + { + // frankenbundle with multiple resources + LOCAL_VCF, PRIMARY_CT, LOCAL_VCF_IDX, SECONDARY_CT, Arrays.asList(LOCAL_VCF_TBI), Arrays.asList(OTHER_CT), true, + new BundleBuilder() + .addPrimary(new IOPathResource(new GATKPath(LOCAL_VCF), PRIMARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_IDX), SECONDARY_CT)) + .addSecondary(new IOPathResource(new GATKPath(LOCAL_VCF_TBI), OTHER_CT)) + .build() + }, + }; + } + + @DataProvider(name = "negativeBundleCases") + public Object[][] negativeBundleCases() { + return new Object[][] { + // primary, primary tag, secondary, secondary tag, other(s), other tag(s), suppressIndexResolution, expectedBundle + + // no index file can be inferred + {LOCAL_VCF_WITH_NO_INDEX, null, null, null, null, null, false, new VariantsBundle(new GATKPath(LOCAL_VCF_WITH_NO_INDEX))}, + // primary content type not provided and cannot be inferred from the extension + {"primaryFile.ext", null, null, null, null, null, false, null}, + // secondary content type not provided + {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", null, null, null, false, null}, + + // other bundle with other content type not provided + {"primaryFile.ext", PRIMARY_CT, "secondaryFile.ext", SECONDARY_CT, Arrays.asList("other.txt"), null, false, null}, + // vcf bundle with other content type not provided + {LOCAL_VCF, BundleResourceType.CT_VARIANT_CONTEXTS, null, null, Arrays.asList("other.txt"), null, false, null}, + }; + } + + @Test(dataProvider = "bundleCases") + public void testBundleCases( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean resolveIndex, + final Bundle expectedBundle) { + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + } + + @Test(dataProvider = "negativeBundleCases", expectedExceptions = IllegalArgumentException.class) + public void testNegativeBundleCases( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean resolveIndex, + final Bundle expectedBundle) { + doCreateBundleTest (primaryInput, primaryInputTag, secondaryInput, secondaryInputTag, otherInputs, otherInputTags, resolveIndex, expectedBundle); + } + + private void doCreateBundleTest( + final String primaryInput, + final String primaryInputTag, + final String secondaryInput, + final String secondaryInputTag, + final List otherInputs, + final List otherInputTags, + final boolean suppressIndexResolution, + final Bundle expectedBundle) { + final GATKPath outputPath = new GATKPath(createTempFile("test", 
".bundle.json").getAbsolutePath().toString()); + + final List args = new ArrayList<>(); + + args.add("--" + StandardArgumentDefinitions.PRIMARY_INPUT_LONG_NAME + (primaryInputTag != null ? ":" + primaryInputTag : "")); + args.add(primaryInput); + if (secondaryInput != null) { + args.add("--" + StandardArgumentDefinitions.SECONDARY_INPUT_LONG_NAME + (secondaryInputTag != null ? ":" + secondaryInputTag : "")); + args.add(secondaryInput); + } + if (otherInputs != null) { + for (int i = 0; i < otherInputs.size(); i++) { + args.add("--" + CreateBundle.OTHER_INPUT_FULL_NAME + ((otherInputTags != null && otherInputTags.get(i) != null) ? ":" + otherInputTags.get(i) : "")); + args.add(otherInputs.get(i) != null ? otherInputs.get(i) : ""); + } + } + if (suppressIndexResolution == true) { + args.add("--" + CreateBundle.SUPPRESS_INDEX_RESOLUTION_FULL_NAME); + } + args.add("--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME); + args.add(outputPath.toString()); + + runCommandLine(args); + + final Bundle actualBundle = BundleJSON.toBundle(IOUtils.getStringFromPath(outputPath), GATKPath::new); + + // bundle resource order is not preserved when roundtripping through JSON, so compare ignoring order + Assert.assertTrue(Bundle.equalsIgnoreOrder(actualBundle, expectedBundle)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 3664e551a1b..4a50c4e9419 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -1,9 +1,15 @@ package org.broadinstitute.hellbender.tools.walkers.variantutils; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Comparator; import java.util.List; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.beta.io.bundle.BundleJSON; +import htsjdk.beta.plugin.variants.VariantsBundle; +import htsjdk.io.IOPath; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang3.tuple.Pair; @@ -1251,6 +1257,81 @@ public void testSampleSelectionOnNio() throws IOException { IntegrationTestSpec.assertEqualTextFiles(IOUtils.getPath(out), IOUtils.getPath(expectedFile), null); } + @DataProvider(name="cloudBucketBundles") + public Object[][] cloudBucketBundles() { + return new Object[][]{ + { + // cloud vcf with a .idx + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.1.1-65M.vcf"), + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.1.1-65M.vcf.idx"), + "1:20547", + "expected/testSelectVariants_BundleWith_idx.vcf" + }, + { + // cloud vcf with a .tbi + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf.blockgz.gz"), + new GATKPath("gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf.blockgz.gz.tbi"), + "21:10000128", + "expected/testSelectVariants_BundleWith_tbi.vcf" + } + }; + } + + @Test(dataProvider="cloudBucketBundles", groups = "bucket") + public void testBundleRemoteQueryWithVCFAndIndexInSeparateBuckets( + final IOPath vcfPath, + final IOPath indexPath, + final String queryInterval, + final String expectedOutputFile + ) throws IOException { + // test that a query using a bundle containing remote nio paths to a vcf works when the index is + // in a different bucket 
+        // in a different bucket than the vcf
+        final String targetBucketName = BucketUtils.randomRemotePath(getGCPTestStaging(), "testSelectVariantsBundleOnNio", "") + "/";
+        IOUtils.deleteOnExit(IOUtils.getPath(targetBucketName));
+        final GATKPath targetVCFPath = new GATKPath(targetBucketName + vcfPath.toPath().getFileName());
+
+        // copy our test vcf (which is already in a bucket with its accompanying index) to a different bucket where the
+        // index does not reside, and then create a bundle referencing the copy of the vcf and the original index, so
+        // the vcf and the index are in separate buckets
+        Files.copy(vcfPath.toPath(), targetVCFPath.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+        // create a bundle with the copied vcf and the original index
+        final VariantsBundle vcfBundle = new VariantsBundle(new GATKPath(targetVCFPath), indexPath);
+        final GATKPath bundleFilePath = new GATKPath(createTempFile("testSelectVariantsBundleOnNio", ".json").toString());
+        IOPathUtils.writeStringToPath(bundleFilePath, BundleJSON.toJSON(vcfBundle));
+
+        final IntegrationTestSpec spec = new IntegrationTestSpec(
+                " --variant " + bundleFilePath
+                + " -L " + queryInterval
+                + " --suppress-reference-path " // suppress reference file path in output for test differencing
+                + " -O %s "
+                + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false",
+                Collections.singletonList(getToolTestDataDir() + expectedOutputFile)
+        );
+
+        spec.executeTest("testBundle", this);
+    }
+
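+    // note: the signed GCS URLs used in the next test carry query parameters, so an index generally
+    // cannot be located by appending ".idx"/".tbi" to the VCF URL; the bundle therefore names the
+    // index resource explicitly alongside the VCF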
+    @Test(groups = "bucket")
+    public void testBundleRemoteWithQueryParams() throws IOException {
+        final VariantsBundle vcfBundle = new VariantsBundle(
+                new GATKPath(BucketUtils.createSignedUrlToGcsObject("gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf", 1)),
+                new GATKPath(BucketUtils.createSignedUrlToGcsObject("gs://hellbender/test/resources/large/1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.idx", 1)));
+        final IOPath bundleFile = new GATKPath(createTempFile("testSelectVariantsThroughBundle", ".json").toString());
+        IOPathUtils.writeStringToPath(bundleFile, BundleJSON.toJSON(vcfBundle));
+
+        final IntegrationTestSpec spec = new IntegrationTestSpec(
+                " --variant " + bundleFile.toString()
+                + " -L 20:10001365 "
+                + " --suppress-reference-path " // suppress reference file path in output for test differencing
+                + " -O %s "
+                + " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false",
+                Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_Bundle.vcf")
+        );
+
+        spec.executeTest("testBundle", this);
+    }
+
     // the input test file is a somatic VCF with several multi-allelic sites and no PLs. This tests that the tool does not attempt
     // to create a PL-to-alleles cache, which would cause the tool to freeze. See https://github.com/broadinstitute/gatk/issues/6291
     @Test
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf
new file mode 100644
index 00000000000..12d3cc32292
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_idx.vcf
@@ -0,0 +1,160 @@
+##fileformat=VCFv4.2
+##FILTER=
+##GATKCommandLine.SelectVariants=
+##GATKCommandLine=
+##INFO=  [a run of dbSNP INFO definitions whose angle-bracket contents were lost in extraction]
+##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency.">
+##INFO=5% minor allele frequency in 1+ populations">
+##INFO=5% minor allele frequency in each and all populations">
+##INFO=SubSNP->Batch.link_out">
+##INFO=  [further INFO definitions, contents likewise lost in extraction]
+##contig=  [contig definitions, contents likewise lost in extraction]
+##dbSNP_BUILD_ID=138
+##fileDate=20130806
+##phasing=partial
+##source=SelectVariants
+##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
+#CHROM POS ID REF ALT QUAL FILTER INFO
+1 20547 rs202246159 A G . . INT;OTHERKG;RS=202246159;RSPOS=20547;SAO=0;SSR=0;VC=SNV;VP=0x050000080001000002000100;WGT=1;dbSNPBuildID=137
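The expected outputs above and below come from queries driven through a serialized variants bundle. As a point of reference only (this sketch is not part of the patch, the bucket and file names are hypothetical placeholders, and the exact JSON layout depends on the htsjdk version in use), a bundle of the kind the tests write can be produced with the same htsjdk classes the patch imports:

    import htsjdk.beta.io.IOPathUtils;
    import htsjdk.beta.io.bundle.BundleJSON;
    import htsjdk.beta.plugin.variants.VariantsBundle;
    import htsjdk.io.HtsPath;
    import htsjdk.io.IOPath;

    public class VariantsBundleSketch {
        public static void main(String[] args) {
            // pair a VCF with an index that lives somewhere unrelated, for example in another bucket
            final IOPath vcf = new HtsPath("gs://bucket-a/sample.vcf.gz");        // hypothetical path
            final IOPath index = new HtsPath("gs://bucket-b/sample.vcf.gz.tbi");  // hypothetical path
            final VariantsBundle bundle = new VariantsBundle(vcf, index);

            // serialize the bundle; the JSON names a primary variants resource plus its index,
            // which is the form the tests hand to SelectVariants via --variant
            final IOPath bundleFile = new HtsPath("sample.bundle.json");          // hypothetical path
            IOPathUtils.writeStringToPath(bundleFile, BundleJSON.toJSON(bundle));
        }
    }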
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf
new file mode 100644
index 00000000000..4e814e12379
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/SelectVariants/expected/testSelectVariants_BundleWith_tbi.vcf
@@ -0,0 +1,160 @@
+##fileformat=VCFv4.2
+##FILTER=
+##GATKCommandLine.SelectVariants=
+##GATKCommandLine=
+##INFO=  [a run of dbSNP INFO definitions whose angle-bracket contents were lost in extraction]
+##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency.">
+##INFO=5% minor allele frequency in 1+ populations">
+##INFO=5% minor allele frequency in each and all populations">
+##INFO=SubSNP->Batch.link_out">
+##INFO=  [further INFO definitions, contents likewise lost in extraction]
+##contig=  [contig definitions, contents likewise lost in extraction]
+##dbSNP_BUILD_ID=138
+##fileDate=20130806
+##phasing=partial
+##source=SelectVariants
+##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
+#CHROM POS ID REF ALT QUAL FILTER INFO
+21 10000128 rs372777878 C T . . OTHERKG;RS=372777878;RSPOS=10000128;SAO=0;SSR=0;VC=SNV;VP=0x050000000001000002000100;WGT=1;dbSNPBuildID=138
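Once such a bundle exists it can be supplied wherever a variant input path is accepted; the integration tests above pass the .json bundle path to SelectVariants through --variant. The following sketch is likewise illustrative rather than part of the patch (the bundle file name is a hypothetical placeholder); it opens a bundle programmatically through FeatureDataSource and runs the same kind of point query the tests perform:

    import htsjdk.variant.variantcontext.VariantContext;
    import org.broadinstitute.hellbender.engine.FeatureDataSource;
    import org.broadinstitute.hellbender.utils.SimpleInterval;

    public class QueryThroughBundleSketch {
        public static void main(String[] args) {
            // "sample.bundle.json" is a hypothetical bundle written as in the sketch above
            try (final FeatureDataSource<VariantContext> variants =
                         new FeatureDataSource<>("sample.bundle.json")) {
                // query exactly as if the source had been constructed from a plain VCF path
                variants.query(new SimpleInterval("21", 10000128, 10000128))
                        .forEachRemaining(vc -> System.out.println(vc.getContig() + ":" + vc.getStart()));
            }
        }
    }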