Merge branch 'apache:master' into SPARK-49162
IvanK-db authored Sep 10, 2024
2 parents 4b0865a + 8f66272 commit e12cd61
Showing 1,424 changed files with 71,351 additions and 55,699 deletions.
26 changes: 15 additions & 11 deletions .github/workflows/build_and_test.yml
@@ -304,7 +304,7 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
path: "**/target/*.log"

infra-image:
name: "Base image build"
@@ -723,7 +723,7 @@ jobs:
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy==1.26.4' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
@@ -867,20 +867,26 @@ jobs:
# - docs/README.md
gem install bundler -v 2.4.22
cd docs
bundle install
bundle install --retry=100
- name: Run documentation build
run: |
# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages.
ln -s "$(which python3.9)" "/usr/local/bin/python3"
# Build docs first with SKIP_API to ensure they are buildable without requiring any
# language docs to be built beforehand.
cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
fi
# Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
echo "SKIP_SCALADOC: $SKIP_SCALADOC"
echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
echo "SKIP_RDOC: $SKIP_RDOC"
echo "SKIP_SQLDOC: $SKIP_SQLDOC"
cd docs
bundle exec jekyll build
- name: Tar documentation
@@ -1106,14 +1112,12 @@ jobs:
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: start minikube
run: |
# See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
sudo install minikube-linux-amd64 /usr/local/bin/minikube
rm minikube-linux-amd64
- name: Start Minikube
uses: medyagh/[email protected]
with:
# Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
minikube start --cpus 2 --memory 6144
cpus: 2
memory: 6144m
- name: Print K8S pods and nodes info
run: |
kubectl get pods -A
5 changes: 0 additions & 5 deletions .github/workflows/build_maven.yml
@@ -30,8 +30,3 @@ jobs:
name: Run
uses: ./.github/workflows/maven_test.yml
if: github.repository == 'apache/spark'
with:
envs: >-
{
"SKIP_SPARK_RELEASE_VERSIONS": "3.4.2"
}
1 change: 1 addition & 0 deletions .github/workflows/build_sparkr_window.yml
@@ -85,6 +85,7 @@ jobs:
shell: cmd
env:
NOT_CRAN: true
SPARKR_SUPPRESS_DEPRECATION_WARNING: 1
# See SPARK-27848. Currently installing some dependent packages causes
# "(converted from warning) unable to identify current timezone 'C':" for an unknown reason.
# This environment variable works around to test SparkR against a higher version.
4 changes: 2 additions & 2 deletions .github/workflows/test_report.yml
@@ -30,14 +30,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@09385b76de790122f4da9c82b17bccf858b9557c # pin@v2
uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # pin @v6
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
workflow_conclusion: completed
- name: Publish test report
uses: scacap/action-surefire-report@482f012643ed0560e23ef605a79e8e87ca081648 # pin@v1
uses: scacap/action-surefire-report@a2911bd1a4412ec18dde2d93b1758b3e56d2a880 # pin @v1.8.0
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@
*.swp
*~
.java-version
.python-version
.DS_Store
.ammonite
.bloop
@@ -26,6 +27,7 @@
.scala_dependencies
.settings
.vscode
artifacts/
/lib/
R-unit-tests.log
R/unit-tests.out
12 changes: 5 additions & 7 deletions LICENSE-binary
@@ -422,6 +422,11 @@ Python Software Foundation License
python/pyspark/loose_version.py


BSD 0-Clause
------------
org.tukaani:xz


BSD 2-Clause
------------
com.github.luben:zstd-jni
@@ -508,7 +513,6 @@ Eclipse Distribution License (EDL) 1.0
com.sun.istack:istack-commons-runtime
jakarta.xml.bind:jakarta.xml.bind-api
org.glassfish.jaxb:jaxb-runtime
org.glassfish.jaxb:txw2

Eclipse Public License (EPL) 2.0
--------------------------------
@@ -521,12 +525,6 @@ org.glassfish.hk2:hk2-locator
org.glassfish.hk2:hk2-utils
org.glassfish.hk2:osgi-resource-locator


Public Domain
-------------
org.tukaani:xz


Creative Commons CC0 1.0 Universal Public Domain Dedication
-----------------------------------------------------------
(see LICENSE-CC0.txt)
75 changes: 30 additions & 45 deletions R/pkg/R/functions.R
@@ -3965,19 +3965,11 @@ setMethod("row_number",
#' yields unresolved \code{a.b.c}
#' @return Column object wrapping JVM UnresolvedNamedLambdaVariable
#' @keywords internal
unresolved_named_lambda_var <- function(...) {
jc <- newJObject(
"org.apache.spark.sql.Column",
newJObject(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
lapply(list(...), function(x) {
handledCallJStatic(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
"freshVarName",
x)
})
)
)
unresolved_named_lambda_var <- function(name) {
jc <- handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"unresolvedNamedLambdaVariable",
name)
column(jc)
}

@@ -3990,7 +3982,6 @@ unresolved_named_lambda_var <- function(...) {
#' @return JVM \code{LambdaFunction} object
#' @keywords internal
create_lambda <- function(fun) {
as_jexpr <- function(x) callJMethod(x@jc, "expr")

# Process function arguments
parameters <- formals(fun)
@@ -4011,22 +4002,18 @@ create_lambda <- function(fun) {
stopifnot(class(result) == "Column")

# Convert both Columns to Scala expressions
jexpr <- as_jexpr(result)

jargs <- handledCallJStatic(
"org.apache.spark.api.python.PythonUtils",
"toSeq",
handledCallJStatic(
"java.util.Arrays", "asList", lapply(args, as_jexpr)
)
handledCallJStatic("java.util.Arrays", "asList", lapply(args, function(x) { x@jc }))
)

# Create Scala LambdaFunction
newJObject(
"org.apache.spark.sql.catalyst.expressions.LambdaFunction",
jexpr,
jargs,
FALSE
handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"lambdaFunction",
result@jc,
jargs
)
}
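For reference, a minimal sketch (not part of this commit) of what `create_lambda` expects from callers: the R function must accept Column arguments and return a single Column, otherwise the `stopifnot(class(result) == "Column")` check above fails.

library(SparkR)
# Illustrative lambdas only; `upper` and Column arithmetic come from SparkR's Column API.
f1 <- function(x) upper(x)      # one-argument form: Column -> Column
f2 <- function(x, y) x + y      # two-argument form, e.g. for arrays_zip_with
# Something like function(x) 42 would be rejected, since 42 is not a Column.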

@@ -4039,20 +4026,18 @@ create_lambda <- function(fun) {
#' @return a \code{Column} representing name applied to cols with funs
#' @keywords internal
invoke_higher_order_function <- function(name, cols, funs) {
as_jexpr <- function(x) {
as_col <- function(x) {
if (class(x) == "character") {
x <- column(x)
}
callJMethod(x@jc, "expr")
x@jc
}

jexpr <- do.call(newJObject, c(
paste("org.apache.spark.sql.catalyst.expressions", name, sep = "."),
lapply(cols, as_jexpr),
lapply(funs, create_lambda)
))

column(newJObject("org.apache.spark.sql.Column", jexpr))
jcol <- handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"fn",
name,
c(lapply(cols, as_col), lapply(funs, create_lambda))) # check varargs invocation
column(jcol)
}

#' @details
@@ -4068,7 +4053,7 @@ setMethod("array_aggregate",
signature(x = "characterOrColumn", initialValue = "Column", merge = "function"),
function(x, initialValue, merge, finish = NULL) {
invoke_higher_order_function(
"ArrayAggregate",
"aggregate",
cols = list(x, initialValue),
funs = if (is.null(finish)) {
list(merge)
@@ -4129,7 +4114,7 @@ setMethod("array_exists",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayExists",
"exists",
cols = list(x),
funs = list(f)
)
@@ -4145,7 +4130,7 @@ setMethod("array_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayFilter",
"filter",
cols = list(x),
funs = list(f)
)
@@ -4161,7 +4146,7 @@ setMethod("array_forall",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayForAll",
"forall",
cols = list(x),
funs = list(f)
)
@@ -4291,7 +4276,7 @@ setMethod("array_sort",
column(callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc))
} else {
invoke_higher_order_function(
"ArraySort",
"array_sort",
cols = list(x),
funs = list(comparator)
)
@@ -4309,7 +4294,7 @@ setMethod("array_transform",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayTransform",
"transform",
cols = list(x),
funs = list(f)
)
@@ -4374,7 +4359,7 @@ setMethod("arrays_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
"ZipWith",
"zip_with",
cols = list(x, y),
funs = list(f)
)
@@ -4447,7 +4432,7 @@ setMethod("map_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"MapFilter",
"map_filter",
cols = list(x),
funs = list(f))
})
@@ -4504,7 +4489,7 @@ setMethod("transform_keys",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"TransformKeys",
"transform_keys",
cols = list(x),
funs = list(f)
)
@@ -4521,7 +4506,7 @@ setMethod("transform_values",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"TransformValues",
"transform_values",
cols = list(x),
funs = list(f)
)
@@ -4552,7 +4537,7 @@ setMethod("map_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
"MapZipWith",
"map_zip_with",
cols = list(x, y),
funs = list(f)
)
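The net effect of the functions.R changes above is that the SparkR higher-order wrappers now delegate to `org.apache.spark.sql.api.python.PythonSQLUtils` and are invoked by SQL function name ("transform", "exists", "aggregate", ...) rather than by Catalyst expression class ("ArrayTransform", "ArrayExists", "ArrayAggregate", ...); the user-facing API is unchanged. A minimal usage sketch (illustrative example, not part of the diff):

library(SparkR)
sparkR.session(master = "local[1]")

df <- createDataFrame(data.frame(id = 1))
df <- withColumn(df, "xs", create_array(lit(1L), lit(2L), lit(3L)))

# Both calls go through invoke_higher_order_function, now by SQL function name.
head(select(df,
            array_transform("xs", function(x) x * 2L),
            array_exists("xs", function(x) x > lit(2L))))

sparkR.session.stop()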
5 changes: 5 additions & 0 deletions R/pkg/R/sparkR.R
@@ -404,6 +404,11 @@ sparkR.session <- function(
enableHiveSupport = TRUE,
...) {

if (Sys.getenv("SPARKR_SUPPRESS_DEPRECATION_WARNING") == "") {
warning(
"SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.")
}

sparkConfigMap <- convertNamedListToEnv(sparkConfig)
namedParams <- list(...)
if (length(namedParams) > 0) {
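The new warning can be silenced by setting `SPARKR_SUPPRESS_DEPRECATION_WARNING` to any non-empty value before the session starts, which is what build_sparkr_window.yml above and R/run-tests.sh below now do. A minimal sketch for an interactive session:

Sys.setenv(SPARKR_SUPPRESS_DEPRECATION_WARNING = "1")
library(SparkR)
sparkR.session(master = "local[1]")   # starts without the deprecation warning
sparkR.session.stop()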
2 changes: 1 addition & 1 deletion R/pkg/README.md
@@ -1,4 +1,4 @@
# R on Spark
# R on Spark (deprecated)

SparkR is an R package that provides a light-weight frontend to use Spark from R.

3 changes: 1 addition & 2 deletions R/pkg/tests/fulltests/test_streaming.R
@@ -147,8 +147,7 @@ test_that("Unsupported operation", {
# memory sink without aggregation
df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
paste0(".*(start : analysis error - Complete output mode not supported when there ",
"are no streaming aggregations on streaming DataFrames/Datasets).*"))
".*analysis error.*complete.*not supported.*no streaming aggregations*")
})

test_that("Terminated by error", {
2 changes: 2 additions & 0 deletions R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -52,6 +52,8 @@ old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt, sep = " "))
```

SparkR is deprecated from Apache Spark 4.0.0 and will be removed in a future version.

## Overview

SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](https://spark.apache.org/mllib/).
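As a quick illustration of the selection, filtering and aggregation operations the overview mentions (example code, not part of the diff):

library(SparkR)
sparkR.session(master = "local[1]")

faithful_df <- createDataFrame(faithful)                          # built-in R dataset
head(select(filter(faithful_df, faithful_df$waiting > 70),        # filtering + selection
            "eruptions", "waiting"))
head(summarize(groupBy(faithful_df, faithful_df$waiting),         # aggregation
               count = n(faithful_df$eruptions)))

sparkR.session.stop()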
4 changes: 2 additions & 2 deletions R/run-tests.sh
@@ -30,9 +30,9 @@ if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then
fi

if [ -z "$SPARK_JARS" ]; then
SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
else
SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
SPARKR_SUPPRESS_DEPRECATION_WARNING=1 SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
fi

FAILED=$((PIPESTATUS[0]||$FAILED))