diff --git a/bioindex/src/main/resources/compression.sh b/bioindex/src/main/resources/compression.sh
new file mode 100644
index 00000000..e143bb44
--- /dev/null
+++ b/bioindex/src/main/resources/compression.sh
@@ -0,0 +1,3 @@
+#!/bin/bash -xe
+
+sudo aws s3 cp s3://dig-data-registry/hail.jar /usr/lib/spark/jars/
diff --git a/bioindex/src/main/resources/geneExpression.py b/bioindex/src/main/resources/geneExpression.py
index 7f40f801..382df089 100644
--- a/bioindex/src/main/resources/geneExpression.py
+++ b/bioindex/src/main/resources/geneExpression.py
@@ -16,6 +16,7 @@ def main():
     # sort and write
     df.orderBy(['gene']) \
         .write \
+        .option("compression", "is.hail.io.compress.BGzipCodec") \
         .mode('overwrite') \
         .json(outdir)
 
diff --git a/bioindex/src/main/scala/GeneExpression.scala b/bioindex/src/main/scala/GeneExpressionStage.scala
similarity index 88%
rename from bioindex/src/main/scala/GeneExpression.scala
rename to bioindex/src/main/scala/GeneExpressionStage.scala
index 0f1e052e..e0042a5e 100644
--- a/bioindex/src/main/scala/GeneExpression.scala
+++ b/bioindex/src/main/scala/GeneExpressionStage.scala
@@ -1,8 +1,6 @@
 package org.broadinstitute.dig.aggregator.methods.bioindex
 
 import org.broadinstitute.dig.aggregator.core._
-import org.broadinstitute.dig.aws._
-import org.broadinstitute.dig.aws.config.emr._
 import org.broadinstitute.dig.aws.emr._
 
 /** The final result of all aggregator methods is building the BioIndex. All
@@ -21,7 +19,8 @@ class GeneExpressionStage(implicit context: Context) extends Stage {
 
   /** Use latest EMR release. */
   override val cluster: ClusterDef = super.cluster.copy(
-    releaseLabel = ReleaseLabel.emrLatest
+    releaseLabel = ReleaseLabel.emrLatest,
+    bootstrapScripts = Seq(new BootstrapScript(resourceUri("compression.sh")))
   )
 
   /** Output to Job steps. */