Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
  • Loading branch information
maropu committed Apr 30, 2021
1 parent be9f62f commit ef68b61
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 0 deletions.
Empty file added dev/create-release/do-release
Empty file.
104 changes: 104 additions & 0 deletions dev/tpcds-datagen/do-datagen-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Generate TPC-DS data for TPCDSQueryTestSuite.
# Run with "-h" for options.
#

# Abort on the first command that fails.
set -e

# Absolute path of the directory holding this script. Every expansion is quoted so that a
# checkout path containing whitespace or glob characters still resolves to the right directory.
SELF="$(cd "$(dirname "$0")" && pwd)"

# Re-uses helper funcs for the release scripts
. "$SELF/../create-release/release-util.sh"

# Prints the command-line help for this script to stdout.
# Takes no arguments; reads $0 to show the name the script was invoked as.
function usage {
  # Declaration and assignment are split so a failing `basename` is not masked by `local`,
  # and $0 is quoted so a script path with spaces is handled.
  local NAME
  NAME="$(basename "$0")"
  cat <<EOF
Usage: $NAME [options]
This script generates TPC-DS data for TPCDSQueryTestSuite inside a docker image. The image is hardcoded to be called
"spark-tpcds" and will be re-generated (as needed) on every invocation of this script.
Options are:
-d [path] : required: working directory (output will be written to an "output" directory in
the working directory).
EOF
}

# ---------------------------------------------------------------------------------------------
# Main flow: parse options, prepare the working directory, build the "spark-tpcds" image and
# run it with the working directory mounted at /opt/spark-tpcds.
# Relies on `error`, `run_silent` and `fcreate_secure` from the sourced release-util.sh.
# ---------------------------------------------------------------------------------------------

WORKDIR=
IMGTAG=latest
while getopts ":d:h" opt; do
  case "$opt" in
    d) WORKDIR="$OPTARG" ;;
    h) usage; exit 0 ;;
    # With a leading ":" in the optstring, a missing option argument is reported as ":".
    :) error "Option -$OPTARG requires an argument. Run with -h for help." ;;
    \?) error "Invalid option. Run with -h for help." ;;
  esac
done

if [ -z "$WORKDIR" ] || [ ! -d "$WORKDIR" ]; then
  error "Work directory (-d) must be defined and exist. Run with -h for help."
fi

# Resolve to an absolute path. The script changes into $WORKDIR below and keeps using
# "$WORKDIR/..." afterwards, and passes it to `docker run --volume`; both only work as
# intended with an absolute host path.
WORKDIR="$(cd "$WORKDIR" >/dev/null && pwd)"

if [ -d "$WORKDIR/output" ]; then
  # -r keeps backslashes in the answer literal.
  read -r -p "Output directory already exists. Overwrite and continue? [y/n] " ANSWER
  if [ "$ANSWER" != "y" ]; then
    error "Exiting."
  fi
fi

cd "$WORKDIR"
rm -rf "$WORKDIR/output"
mkdir "$WORKDIR/output"

# Place all scripts in a local directory that must be defined in the command
# line. This directory is mounted into the image.
for f in "$SELF"/*; do
  if [ -f "$f" ]; then
    cp "$f" "$WORKDIR"
  fi
done

# Place `release-util.sh` for reuse
cp "$SELF/../create-release/release-util.sh" "$WORKDIR"

run_silent "Building spark-tpcds image with tag $IMGTAG..." "docker-build.log" \
  docker build -t "spark-tpcds:$IMGTAG" --build-arg "UID=$UID" "$SELF/spark-tpcds"

# Write the release information to a file with environment variables to be used when running the
# image.
ENVFILE="$WORKDIR/env.list"

# Removes the environment file; installed as an EXIT trap so it runs on every exit path.
function cleanup {
  rm -f "$ENVFILE"
}

trap cleanup EXIT

fcreate_secure "$ENVFILE"

# Quoted redirection target so a working directory containing spaces is written correctly.
cat > "$ENVFILE" <<EOF
RUNNING_IN_DOCKER=1
EOF

# NOTE(review): the real output location is decided by the --location argument inside
# do-datagen.sh; confirm that it matches the path announced here.
echo "Building Spark to generate TPC-DS data; output will be at $WORKDIR/output/tpcds-data"
docker run -ti \
  --env-file "$ENVFILE" \
  --volume "$WORKDIR:/opt/spark-tpcds" \
  "spark-tpcds:$IMGTAG"
56 changes: 56 additions & 0 deletions dev/tpcds-datagen/do-datagen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Builds dsdgen from tpcds-kit, then builds Spark and runs GenTPCDSData to produce TPC-DS data.
#
# Environment:
#   RUNNING_IN_DOCKER  when "1", release-util.sh is loaded from this script's own directory.
#   SCALE_FACTOR       TPC-DS scale factor passed to GenTPCDSData (defaults to 1).
#
# All checkouts and the generated "tpcds-data" directory live in the directory the script is
# started from; build logs are written next to the script.

set -e
# Quoted so that a script path containing whitespace still resolves correctly.
SELF="$(cd "$(dirname "$0")" && pwd)"

# Directory the script was started from, captured before anything changes it. The clones below
# are created here, so the paths handed to sbt must be derived from this directory as well.
# The original passed "$SELF/tpcds-kit/tools", which only exists when the script is started
# from its own directory (and is not the case for the Docker image, whose working directory is
# /opt/spark-tpcds/output while the script sits in /opt/spark-tpcds).
DATAGEN_CWD="$(pwd)"

# Re-uses helper funcs for the release scripts
if [ "$RUNNING_IN_DOCKER" = "1" ]; then
  . "$SELF/release-util.sh"
else
  . "$SELF/../create-release/release-util.sh"
fi

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Checks out tpcds-kit and builds dsdgen
rm -rf "$DATAGEN_CWD/tpcds-kit"
git clone https://github.com/databricks/tpcds-kit "$DATAGEN_CWD/tpcds-kit"
cd "$DATAGEN_CWD/tpcds-kit/tools"
run_silent "Building dsdgen in tpcds-kit..." "$SELF/dsdgen-build.log" make OS=LINUX
cd "$DATAGEN_CWD"

# Builds Spark to generate TPC-DS data
SCALE_FACTOR="${SCALE_FACTOR:-1}"

rm -rf "$DATAGEN_CWD/spark"
# git clone https://github.com/apache/spark
# Exported so the setting actually reaches the ./build/sbt child process; a plain assignment
# stays local to this shell.
export SBT_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -XX:+UseG1GC"
git clone https://github.com/maropu/spark "$DATAGEN_CWD/spark"
cd "$DATAGEN_CWD/spark"
git checkout tpcdsDatagen
# The whole sbt command is a single argument, so the embedded paths must not contain spaces.
./build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir $DATAGEN_CWD/tpcds-kit/tools --location $DATAGEN_CWD/tpcds-data --scaleFactor $SCALE_FACTOR"
# run_silent "Building Spark to generate TPC-DS data in $DATAGEN_CWD/tpcds-data..." "$SELF/spark-build.log" \
#   ./build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir $DATAGEN_CWD/tpcds-kit/tools --location $DATAGEN_CWD/tpcds-data --scaleFactor $SCALE_FACTOR"
cd "$DATAGEN_CWD"

# Drop the checkouts; only the generated data and the build logs are kept.
rm -rf "$DATAGEN_CWD/spark" "$DATAGEN_CWD/tpcds-kit"
63 changes: 63 additions & 0 deletions dev/tpcds-datagen/spark-tpcds/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image to generate TPC-DS data for TPCDSQueryTestSuite.
# Based on Ubuntu 20.04 so that the generated data is the same as the data generated on GitHub Actions.
#
# Includes:
# * Java 8
#
# You can test it as below:
# cd dev/tpcds-datagen/spark-tpcds
# docker build -t spark-tpcds --build-arg UID=$UID .

FROM ubuntu:20.04

# For apt to be noninteractive
# (key=value form; the whitespace-separated "ENV key value" form is the legacy syntax.)
ENV DEBIAN_FRONTEND=noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN=true

# These arguments are just for reuse and not really meant to be customized.
ARG APT_INSTALL="apt-get install --no-install-recommends -y"

# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch
# the most current package versions (instead of potentially using old versions cached by docker).
#
# NOTE(review): no apt source that needs the imported GPG key is added in this file, and no
# package is installed from the nodesource repository configured near the end; both steps look
# like carry-overs and could probably be dropped -- confirm before removing.
RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
    gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
    gpg -a --export E084DAB9 | apt-key add - && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    apt-get clean && \
    apt-get update && \
    $APT_INSTALL software-properties-common && \
    apt-get update && \
    # Install openjdk 8.
    $APT_INSTALL openjdk-8-jdk && \
    update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java && \
    # Install build / source control tools
    $APT_INSTALL curl wget git maven subversion make gcc bison flex lsof libffi-dev \
      libssl-dev libcurl4-openssl-dev libxml2-dev && \
    curl -sL https://deb.nodesource.com/setup_12.x | bash && \
    $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev

# The container starts in the "output" directory of the mounted working directory.
WORKDIR /opt/spark-tpcds/output

# UID of the invoking host user, passed with --build-arg, so that files written to the mounted
# directory are owned by that user.
ARG UID
# NOTE(review): useradd -p expects an already-encrypted password; the literal is kept as is.
RUN useradd -m -s /bin/bash -p spark-tpcds -u "$UID" spark-tpcds
USER spark-tpcds:spark-tpcds

# do-datagen.sh is not baked into the image; it is expected at this path via the volume mounted
# at /opt/spark-tpcds when the container is run.
ENTRYPOINT [ "/opt/spark-tpcds/do-datagen.sh" ]

0 comments on commit ef68b61

Please sign in to comment.