From 2c78df51161ff92207a506f1e542a3571e41d854 Mon Sep 17 00:00:00 2001
From: Gera Shegalov
Date: Mon, 23 Oct 2023 22:25:58 -0700
Subject: [PATCH] Add a spark-shell smoke test to premerge and nightly
 [databricks] (#9504)

Contributes to #5704

This PR aims to catch issues like #9500. It modifies run_pyspark_from_build.sh,
mostly to avoid recreating the existing logic for figuring out the jar
location, etc.

CI may not catch this class of issue yet because we do not have a Spark 3.5.0
pipeline, but this is how the smoke test reproduces #9500:

```Bash
$ SPARK_HOME=~/dist/spark-3.1.1-bin-hadoop3.2 SPARK_SHELL_SMOKE_TEST=1 \
    ./integration_tests/run_pyspark_from_build.sh
...
+ grep -F 'res0: Array[org.apache.spark.sql.Row] = Array([4950])'
res0: Array[org.apache.spark.sql.Row] = Array([4950])
+ echo 'SUCCESS spark-shell smoke test'
SUCCESS spark-shell smoke test
$ echo $?
0

$ SPARK_HOME=~/dist/spark-3.5.0-bin-hadoop3 SPARK_SHELL_SMOKE_TEST=1 \
    ./integration_tests/run_pyspark_from_build.sh
$ echo $?
1
```

The same check with the RAPIDS shuffle manager enabled:

```Bash
$ SPARK_SHELL_SMOKE_TEST=1 \
    PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.spark311.RapidsShuffleManager \
    SPARK_HOME=~/dist/spark-3.1.1-bin-hadoop3.2 \
    ./integration_tests/run_pyspark_from_build.sh
...
+ echo 'SUCCESS spark-shell smoke test'
SUCCESS spark-shell smoke test
$ echo $?
0

$ SPARK_SHELL_SMOKE_TEST=1 \
    PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.spark350.RapidsShuffleManager \
    SPARK_HOME=~/dist/spark-3.5.0-bin-hadoop3 \
    ./integration_tests/run_pyspark_from_build.sh
$ echo $?
1
```
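The grep check in the new code is load-bearing: spark-shell catches all
failures, so the REPL prints the exception but still exits 0 once it reads EOF
on stdin and injects ":quit". A minimal sketch of that behavior (the thrown
exception is only an illustrative stand-in, not part of this patch):

```Bash
$ <<< 'throw new Exception("boom")' "${SPARK_HOME}"/bin/spark-shell
...
java.lang.Exception: boom
...
$ echo $?
0
```

Without grepping for the expected result, the smoke test would therefore pass
vacuously.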
Signed-off-by: Gera Shegalov
---
 integration_tests/run_pyspark_from_build.sh | 30 ++++++++++++++++++++-
 jenkins/spark-premerge-build.sh             |  3 +++
 jenkins/spark-tests.sh                      |  4 ++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 853fae66316..e31e462e1a8 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -309,7 +309,35 @@ EOF
     fi
     export PYSP_TEST_spark_rapids_memory_gpu_allocSize=${PYSP_TEST_spark_rapids_memory_gpu_allocSize:-'1536m'}
 
-    if ((${#TEST_PARALLEL_OPTS[@]} > 0));
+    SPARK_SHELL_SMOKE_TEST="${SPARK_SHELL_SMOKE_TEST:-0}"
+    if [[ "${SPARK_SHELL_SMOKE_TEST}" != "0" ]]; then
+        echo "Running spark-shell smoke test..."
+        SPARK_SHELL_ARGS_ARR=(
+            --master local-cluster[1,2,1024]
+            --conf spark.plugins=com.nvidia.spark.SQLPlugin
+            --conf spark.deploy.maxExecutorRetries=0
+        )
+        if [[ "${PYSP_TEST_spark_shuffle_manager}" != "" ]]; then
+            SPARK_SHELL_ARGS_ARR+=(
+                --conf spark.shuffle.manager="${PYSP_TEST_spark_shuffle_manager}"
+                --driver-class-path "${PYSP_TEST_spark_driver_extraClassPath}"
+                --conf spark.executor.extraClassPath="${PYSP_TEST_spark_driver_extraClassPath}"
+            )
+        else
+            SPARK_SHELL_ARGS_ARR+=(--jars "${PYSP_TEST_spark_jars}")
+        fi
+
+        # NOTE grep is used not only for checking the output but also
+        # to work around the fact that spark-shell catches all failures.
+        # In this test it exits not because of a failure but because it
+        # encounters an EOF on stdin and injects a ":quit" command. Without
+        # the grep check the exit code would be 0 regardless of any exceptions.
+        #
+        <<< 'spark.range(100).agg(Map("id" -> "sum")).collect()' \
+            "${SPARK_HOME}"/bin/spark-shell "${SPARK_SHELL_ARGS_ARR[@]}" 2>/dev/null \
+            | grep -F 'res0: Array[org.apache.spark.sql.Row] = Array([4950])'
+        echo "SUCCESS spark-shell smoke test"
+    elif ((${#TEST_PARALLEL_OPTS[@]} > 0));
     then
         exec python "${RUN_TESTS_COMMAND[@]}" "${TEST_PARALLEL_OPTS[@]}" "${TEST_COMMON_OPTS[@]}"
     else
diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 9b509208986..15c5166001b 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -88,6 +88,9 @@ mvn_verify() {
 
     # Triggering here until we change the jenkins file
     rapids_shuffle_smoke_test
+    SPARK_SHELL_SMOKE_TEST=1 \
+        PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.${SHUFFLE_SPARK_SHIM}.RapidsShuffleManager \
+        ./integration_tests/run_pyspark_from_build.sh
 }
 
 rapids_shuffle_smoke_test() {
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 4a062f63871..e28799c28d4 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -270,6 +270,10 @@ TEST_MODE=${TEST_MODE:-'DEFAULT'}
 if [[ $TEST_MODE == "DEFAULT" ]]; then
     ./run_pyspark_from_build.sh
 
+    SPARK_SHELL_SMOKE_TEST=1 \
+        PYSP_TEST_spark_shuffle_manager=com.nvidia.spark.rapids.${SHUFFLE_SPARK_SHIM}.RapidsShuffleManager \
+        ./integration_tests/run_pyspark_from_build.sh
+
     # ParquetCachedBatchSerializer cache_test
     PYSP_TEST_spark_sql_cache_serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
         ./run_pyspark_from_build.sh -k cache_test
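A note on the PYSP_TEST_* variables used throughout: they follow
run_pyspark_from_build.sh's existing convention of mapping environment
variables to Spark confs, with the PYSP_TEST_ prefix dropped and underscores
turned into dots (e.g. PYSP_TEST_spark_shuffle_manager sets
spark.shuffle.manager, as the diff above makes explicit). A minimal sketch of
the idea, not the script's verbatim implementation:

```Bash
# Sketch: turn PYSP_TEST_* env vars into --conf arguments.
# PYSP_TEST_spark_shuffle_manager -> --conf spark.shuffle.manager=...
for var in "${!PYSP_TEST_@}"; do
    key="${var#PYSP_TEST_}"   # drop the PYSP_TEST_ prefix
    key="${key//_/.}"         # underscores become dots
    echo "--conf ${key}=${!var}"
done
```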