[HUDI-4982] Add Utilities and Utilities Slim + Spark Bundle testing to GH Actions (apache#7005)


Co-authored-by: Raymond Xu <[email protected]>
2 people authored and Alexey Kudinkin committed Dec 14, 2022
1 parent 51d1f94 commit d5745cc
Showing 12 changed files with 274 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
-        ./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
+        ./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
29 changes: 29 additions & 0 deletions packaging/bundle-validation/Dockerfile
@@ -0,0 +1,29 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

ARG IMAGE_TAG=spark313hive313
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG

# configure the stack
ADD . .
ENV HUDI_CONF_DIR=$WORKDIR/conf
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
RUN if [[ $SPARK_HOME == *"spark-3.2"* ]] || [[ $SPARK_HOME == *"spark-3.3"* ]]; \
then printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> $SPARK_HOME/conf/spark-defaults.conf; fi
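The final RUN appends the HoodieCatalog setting only for Spark 3.2/3.3 images, since that catalog class ships only with those Spark versions. A minimal sketch of verifying this by hand, assuming the image was built from this directory the way ci_run.sh (below) does; the tag is illustrative:

docker build --build-arg IMAGE_TAG=spark330hive313 -t hudi-ci-bundle-validation:spark330hive313 .
# SPARK_HOME is set by the base image, so resolve it inside the container
docker run --rm hudi-ci-bundle-validation:spark330hive313 \
    sh -c 'grep HoodieCatalog $SPARK_HOME/conf/spark-defaults.conf'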
@@ -18,8 +18,8 @@ FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

-RUN mkdir /opt/hudi-bundles
-ENV WORKDIR=/opt/hudi-bundles
+RUN mkdir /opt/bundle-validation
+ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
@@ -47,10 +47,3 @@ RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION

-RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
-COPY hive-site.xml $HIVE_HOME/conf/
-RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
-COPY spark-defaults.conf $SPARK_HOME/conf/
-COPY validate.scala .
-COPY validate.sh .
@@ -18,15 +18,16 @@
# under the License.

# Note:
-# this script is to run by GitHub Actions CI tasks from the project root directory
-# and contains environment-specific variables
+#
+# This script is to
+# - set the corresponding variables based on CI job's build profiles
+# - prepare Hudi bundle jars for mounting into Docker container for validation
+# - prepare test datasets for mounting into Docker container for validation
+#
+# This is to run by GitHub Actions CI tasks from the project root directory
+# and it contains the CI environment-specific variables.

HUDI_VERSION=$1
-# to store bundle jars for validation
-mkdir ${GITHUB_WORKSPACE}/jars
-cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar ${GITHUB_WORKSPACE}/jars
-echo 'Validating jars below:'
-ls -l ${GITHUB_WORKSPACE}/jars

# choose versions based on build profiles
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
@@ -59,13 +60,33 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi

-cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+# Copy bundle jars to temp dir for mounting
+TMP_JARS_DIR=/tmp/jars/$(date +%s)
+mkdir -p $TMP_JARS_DIR
+cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+echo 'Validating jars below:'
+ls -l $TMP_JARS_DIR
+
+# Copy test dataset
+TMP_DATA_DIR=/tmp/data/$(date +%s)
+mkdir -p $TMP_DATA_DIR/stocks/data
+cp ${GITHUB_WORKSPACE}/docker/demo/data/*.json $TMP_DATA_DIR/stocks/data/
+cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc $TMP_DATA_DIR/stocks/
+
+# build docker image
+cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
docker build \
--build-arg HADOOP_VERSION=$HADOOP_VERSION \
--build-arg HIVE_VERSION=$HIVE_VERSION \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.
-docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
+
+# run validation script in docker
+docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -v $TMP_DATA_DIR:/opt/bundle-validation/data \
+    -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
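For reference, a minimal sketch of driving this script outside GH Actions, assuming the bundle jars were already built with Maven; GITHUB_WORKSPACE and SPARK_PROFILE are normally provided by the CI runner:

export GITHUB_WORKSPACE="$(pwd)"   # run from the root of a Hudi checkout
export SPARK_PROFILE=spark3.3      # drives the version selection above
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/ci_run.sh "$HUDI_VERSION"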
@@ -15,6 +15,8 @@
# limitations under the License.
#

-spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
-spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
+hoodie.upsert.shuffle.parallelism 8
+hoodie.insert.shuffle.parallelism 8
+hoodie.delete.shuffle.parallelism 8
+hoodie.bulkinsert.shuffle.parallelism 8
+hoodie.finalize.write.parallelism 8
22 changes: 22 additions & 0 deletions packaging/bundle-validation/conf/spark-defaults.conf
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
spark.default.parallelism 8
spark.sql.shuffle.partitions 8
30 changes: 0 additions & 30 deletions packaging/bundle-validation/spark-write-hive-sync/validate.sh

This file was deleted.

23 changes: 23 additions & 0 deletions packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
hoodie.datasource.write.precombine.field=ts
hoodie.metadata.enable=true
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/data/stocks/data
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
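validate.sh (below) feeds this file to deltastreamer via --props; as a sketch, the standalone equivalent inside the container would look like the following, with the jar path per the symlinks validate.sh creates:

$SPARK_HOME/bin/spark-submit \
    --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
    /opt/bundle-validation/jars/utilities.jar \
    --props /opt/bundle-validation/utilities/hoodieapp.properties \
    --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
    --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
    --source-ordering-field ts --table-type MERGE_ON_READ \
    --target-base-path /tmp/hudi-utilities-test/ \
    --target-table utilities_tbl --op UPSERT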
25 changes: 25 additions & 0 deletions packaging/bundle-validation/utilities/validate.scala
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
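The script exits 0 only when the distinct (date, key) counts of the Hudi table and the source JSON match, which is how validate.sh detects success. A sketch of running it by hand inside the container, after a deltastreamer pass has populated /tmp/hudi-utilities-test/:

$SPARK_HOME/bin/spark-shell \
    --jars /opt/bundle-validation/jars/utilities.jar \
    -i /opt/bundle-validation/utilities/validate.scala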
137 changes: 137 additions & 0 deletions packaging/bundle-validation/validate.sh
@@ -0,0 +1,137 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#################################################################################################
# NOTE: this script runs inside the hudi-ci-bundle-validation container
# $WORKDIR/jars/ is mounted from a host directory containing the bundle jars to validate
# $WORKDIR/data/ is mounted from a host directory containing test datasets laid out as
#   - <dataset name>/schema.avsc
#   - <dataset name>/data/<data files>
#################################################################################################
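# For instance, the stocks dataset staged by ci_run.sh mounts as (file names illustrative):
#   /opt/bundle-validation/data/stocks/schema.avsc
#   /opt/bundle-validation/data/stocks/data/batch_1.json ...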

WORKDIR=/opt/bundle-validation
JARS_DIR=${WORKDIR}/jars
# symlink the bundle jars to shorter, easier-to-use names
ln -sf $JARS_DIR/hudi-spark*.jar $JARS_DIR/spark.jar
ln -sf $JARS_DIR/hudi-utilities-bundle*.jar $JARS_DIR/utilities.jar
ln -sf $JARS_DIR/hudi-utilities-slim*.jar $JARS_DIR/utilities-slim.jar


##
# Function to test the spark bundle with hive sync.
#
# env vars (defined in container):
# HIVE_HOME: path to the hive directory
# DERBY_HOME: path to the derby directory
# SPARK_HOME: path to the spark directory
##
test_spark_bundle () {
echo "::warning::validate.sh setting up hive metastore for spark bundle validation"

$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
$HIVE_HOME/bin/hiveserver2 &
echo "::warning::validate.sh hive metastore setup complete. Testing"
$SPARK_HOME/bin/spark-shell --jars $JARS_DIR/spark.jar < $WORKDIR/spark/validate.scala
if [ "$?" -ne 0 ]; then
echo "::error::validate.sh failed hive testing"
exit 1
fi
echo "::warning::validate.sh spark bundle validation successful"
}


##
# Function to test the utilities bundle and utilities slim bundle + spark bundle.
# It runs deltastreamer and then verifies that deltastreamer worked correctly.
#
# 1st arg: main jar to run with spark-submit, usually it's the utilities(-slim) bundle
# 2nd arg and beyond: any additional jars to pass to --jars option
#
# env vars (defined in container):
# SPARK_HOME: path to the spark directory
##
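# Example (mirroring the driver calls at the bottom of this script):
#   test_utilities_bundle $JARS_DIR/utilities.jar
#   test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar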
test_utilities_bundle () {
MAIN_JAR=$1
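# join the 2nd..Nth args into a comma-separated list for --jars;
# printf leaves a trailing comma, which the next line strips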
printf -v EXTRA_JARS '%s,' "${@:2}"
EXTRA_JARS="${EXTRA_JARS%,}"
OPT_JARS=""
if [[ -n $EXTRA_JARS ]]; then
OPT_JARS="--jars $EXTRA_JARS"
fi
OUTPUT_DIR=/tmp/hudi-utilities-test/
rm -r $OUTPUT_DIR
echo "::warning::validate.sh running deltastreamer"
$SPARK_HOME/bin/spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
$OPT_JARS $MAIN_JAR \
--props $WORKDIR/utilities/hoodieapp.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.JsonDFSSource \
--source-ordering-field ts --table-type MERGE_ON_READ \
--target-base-path ${OUTPUT_DIR} \
--target-table utilities_tbl --op UPSERT
if [ "$?" -ne 0 ]; then
echo "::error::validate.sh deltastreamer failed with exit code $?"
exit 1
fi
echo "::warning::validate.sh done with deltastreamer"

OUTPUT_SIZE=$(du -s ${OUTPUT_DIR} | awk '{print $1}')
if [[ -z $OUTPUT_SIZE || "$OUTPUT_SIZE" -lt "580" ]]; then
echo "::error::validate.sh deltastreamer output folder ($OUTPUT_SIZE) is smaller than minimum expected (580)"
exit 1
fi

echo "::warning::validate.sh validating deltastreamer in spark shell"
SHELL_COMMAND="$SPARK_HOME/bin/spark-shell $OPT_JARS $MAIN_JAR -i $WORKDIR/utilities/validate.scala"
echo "::debug::this is the shell command: $SHELL_COMMAND"
LOGFILE="$WORKDIR/${FUNCNAME[0]}.log"
$SHELL_COMMAND >> $LOGFILE
if [ "$?" -ne 0 ]; then
SHELL_RESULT=$(cat $LOGFILE | grep "Counts don't match")
echo "::error::validate.sh $SHELL_RESULT"
exit 1
fi
echo "::warning::validate.sh done validating deltastreamer in spark shell"
}


test_spark_bundle
if [ "$?" -ne 0 ]; then
exit 1
fi

if [[ $SPARK_HOME == *"spark-2.4"* ]] || [[ $SPARK_HOME == *"spark-3.1"* ]]
then
echo "::warning::validate.sh testing utilities bundle"
test_utilities_bundle $JARS_DIR/utilities.jar
if [ "$?" -ne 0 ]; then
exit 1
fi
echo "::warning::validate.sh done testing utilities bundle"
else
echo "::warning::validate.sh skip testing utilities bundle for non-spark2.4 & non-spark3.1 build"
fi

echo "::warning::validate.sh testing utilities slim bundle"
test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar
if [ "$?" -ne 0 ]; then
exit 1
fi
echo "::warning::validate.sh done testing utilities slim bundle"
