Skip to content

Commit

Permalink
Add a script to generate TPC-DS data on docker-image(Ubuntu 20.04)
Browse files Browse the repository at this point in the history
  • Loading branch information
maropu committed Apr 30, 2021
1 parent be9f62f commit 70244e4
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 0 deletions.
Empty file added dev/create-release/do-release
Empty file.
104 changes: 104 additions & 0 deletions dev/tpcds-datagen/do-datagen-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Generate TPC-DS data for TPCDSQueryTestSuite.
# Run with "-h" for options.
#

# Abort on the first failing command.
set -e

# Absolute path of the directory that contains this script. Both "$0" and the
# inner command substitution are quoted so that a checkout path containing
# whitespace or glob characters still resolves correctly.
SELF=$(cd "$(dirname "$0")" && pwd)

# Re-uses helper funcs for the release scripts (error, run_silent and
# fcreate_secure are called below but not defined in this file, so they are
# presumably provided by this helper file).
. "$SELF/../create-release/release-util.sh"

# Prints the command-line help text to stdout.
# Arguments: none ($0 is used to display the script name).
function usage {
  # Declared separately from the assignment so that `local` does not mask the
  # exit status of the command substitution; "$0" is quoted so that a script
  # path containing whitespace is handled.
  local NAME
  NAME=$(basename "$0")
  cat <<EOF
Usage: $NAME [options]
This script generates TPC-DS data for TPCDSQueryTestSuite inside a docker image. The image is hardcoded to be called
"spark-tpcds" and will be re-generated (as needed) on every invocation of this script.
Options are:
-d [path] : required: working directory (output will be written to an "output" directory in
the working directory).
EOF
}

# Option state: WORKDIR is filled in from -d; IMGTAG is the tag used for the
# "spark-tpcds" docker image.
WORKDIR=
IMGTAG=latest

# The leading ":" in the optstring selects getopts' silent error reporting:
# an unknown option is reported as "?" and an option whose argument is missing
# is reported as ":" (OPTARG holds the offending option letter in both cases).
while getopts ":d:h" opt; do
  case "$opt" in
    d) WORKDIR="$OPTARG" ;;
    h) usage; exit 0 ;;
    # Without this arm, "-d" given without a value fell through silently.
    :) error "Option -$OPTARG requires an argument. Run with -h for help." ;;
    \?) error "Invalid option. Run with -h for help." ;;
  esac
done

# The working directory must have been supplied with -d and must already exist.
if [[ -z "$WORKDIR" || ! -d "$WORKDIR" ]]; then
  error "Work directory (-d) must be defined and exist. Run with -h for help."
fi

# The output directory of an earlier run is removed further down in this
# script, so ask for confirmation before continuing.
if [[ -d "$WORKDIR/output" ]]; then
  # -r keeps backslashes in the reply literal, so only an exact "y" confirms
  # (without it, a reply such as "\y" would be read as "y").
  read -r -p "Output directory already exists. Overwrite and continue? [y/n] " ANSWER
  if [[ "$ANSWER" != "y" ]]; then
    error "Exiting."
  fi
fi

# Resolve the working directory to an absolute path before changing into it.
# With a relative -d value, "$WORKDIR/output" would otherwise be interpreted
# relative to the new current directory (i.e. WORKDIR/WORKDIR/output) once the
# cd below has happened, and the later copy and docker volume mount also need
# a path that does not depend on the current directory.
WORKDIR=$(cd "$WORKDIR" && pwd)
cd "$WORKDIR"

# Start from an empty output directory. ":?" aborts instead of expanding to
# "/output" should WORKDIR ever be empty.
rm -rf "${WORKDIR:?}/output"
mkdir "$WORKDIR/output"

# Copy every regular file that sits next to this script into the working
# directory given on the command line; that directory is mounted into the image.
for script in "$SELF"/*; do
  [[ -f "$script" ]] || continue
  cp "$script" "$WORKDIR"
done

# `release-util.sh` lives with the release scripts; copy it as well for reuse.
cp "$SELF/../create-release/release-util.sh" "$WORKDIR"

# Build the image from the spark-tpcds directory next to this script, passing
# the invoking user's UID as a build argument. run_silent receives a banner,
# a log file name and the command to run.
run_silent "Building spark-tpcds image with tag $IMGTAG..." "docker-build.log" \
  docker build \
    -t "spark-tpcds:$IMGTAG" \
    --build-arg "UID=$UID" \
    "$SELF/spark-tpcds"

# Environment variables for the container are handed over through a file that
# is passed to docker when running the image.
ENVFILE="$WORKDIR/env.list"

# Removes the environment file. Installed as the EXIT trap below.
function cleanup {
  rm -f "$ENVFILE"
}

# Registered before the file is created so that it is removed on every exit
# path, including a failure while creating or writing it.
trap cleanup EXIT

# fcreate_secure is defined outside this file; presumably it creates the file
# with restrictive permissions - confirm in release-util.sh.
fcreate_secure "$ENVFILE"

# The redirect target is quoted so a working directory containing whitespace
# does not split into several words.
cat > "$ENVFILE" <<EOF
RUNNING_IN_DOCKER=1
EOF

# Run the image interactively with a TTY. The working directory is mounted at
# /opt/spark-tpcds and the variables listed in $ENVFILE are passed to the
# container.
printf '%s\n' "Building Spark to generate TPC-DS data; output will be at $WORKDIR/output/tpcds-data"
docker run --tty --interactive \
  --env-file "$ENVFILE" \
  -v "$WORKDIR:/opt/spark-tpcds" \
  "spark-tpcds:$IMGTAG"
51 changes: 51 additions & 0 deletions dev/tpcds-datagen/do-datagen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Abort on the first failing command.
set -e

# Absolute path of the directory that contains this script. Both "$0" and the
# inner command substitution are quoted so that a path containing whitespace
# or glob characters still resolves correctly.
SELF=$(cd "$(dirname "$0")" && pwd)

# Re-uses helper funcs for the release scripts. When RUNNING_IN_DOCKER is "1"
# the helper file is read from the script's own directory; otherwise it is
# read from the sibling create-release directory.
if [ "$RUNNING_IN_DOCKER" = "1" ]; then
  . "$SELF/release-util.sh"
else
  . "$SELF/../create-release/release-util.sh"
fi

# Fetch a fresh tpcds-kit checkout into the current directory and compile
# dsdgen inside its tools directory; build output goes to dsdgen-build.log
# next to this script.
rm -rf tpcds-kit
git clone https://github.com/databricks/tpcds-kit
cd tpcds-kit/tools
run_silent "Building dsdgen in tpcds-kit..." "$SELF/dsdgen-build.log" \
  make OS=LINUX
# Remember the absolute path of the tools directory for the data generator,
# then return to the directory the checkout was created in.
DSDGEN="$(pwd)"
cd ../..

# Builds Spark to generate TPC-DS data.
# The data is written to "tpcds-data" under the current directory.
OUTPUT_PATH="$(pwd)/tpcds-data"

# SCALE_FACTOR may be supplied through the environment; default to 1 when it
# is unset or empty.
: "${SCALE_FACTOR:=1}"

# Only the tip of the default branch is needed: the checkout is built once and
# deleted below, so a shallow clone avoids downloading the full history.
rm -rf spark
git clone --depth 1 https://github.com/apache/spark
cd spark
# NOTE(review): DSDGEN and OUTPUT_PATH are interpolated into a single sbt
# command string, so paths containing whitespace would presumably be split
# into separate arguments by sbt - confirm before using such paths.
run_silent "Building Spark to generate TPC-DS data in $OUTPUT_PATH..." "$SELF/spark-build.log" \
  ./build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir $DSDGEN --location $OUTPUT_PATH --scaleFactor $SCALE_FACTOR --numPartitions 1 --overwrite"
cd ..

# Remove both checkouts; only the generated data and the build logs are kept.
rm -rf spark tpcds-kit
56 changes: 56 additions & 0 deletions dev/tpcds-datagen/spark-tpcds/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image to generate TPC-DS data for TPCDSQueryTestSuite.
# Based on Ubuntu 20.04 so that the generated data matches the data generated on GitHub Actions.
#
# Includes:
# * Java 8
#
# You can test it as below:
# cd dev/tpcds-datagen/spark-tpcds
# docker build -t spark-tpcds --build-arg UID=$UID .

FROM ubuntu:20.04

# For apt to be noninteractive
# (written in the key=value form; the space-separated ENV form is legacy syntax)
ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true

# These arguments are just for reuse and not really meant to be customized.
ARG APT_INSTALL="apt-get install --no-install-recommends -y"

# This is all in a single "RUN" command so that if anything changes, "apt-get update" is run to fetch
# the most current package versions (instead of potentially using old versions cached by docker).
#
# The JDK directory name carries the Debian architecture (e.g. amd64, arm64), so it is looked up
# with dpkg instead of being hard-coded; this keeps the build working on non-amd64 hosts.
RUN apt-get clean && apt-get update && \
    $APT_INSTALL software-properties-common && \
    apt-get update && \
    # Install openjdk 8
    $APT_INSTALL openjdk-8-jdk && \
    update-alternatives --set java "/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)/jre/bin/java" && \
    # Install build-related tools
    $APT_INSTALL curl wget git build-essential bison flex && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Commands in the container start in the "output" directory below /opt/spark-tpcds.
WORKDIR /opt/spark-tpcds/output

# Create a user whose numeric id is taken from the UID build argument and run as that user.
ARG UID
RUN useradd -m -s /bin/bash -p spark-tpcds -u $UID spark-tpcds
USER spark-tpcds:spark-tpcds

ENTRYPOINT [ "/opt/spark-tpcds/do-datagen.sh" ]

0 comments on commit 70244e4

Please sign in to comment.