diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index bb9390fd82c2e..782ee58188934 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -75,6 +75,13 @@ More details on parameters can be found in the [Python API documentation](api/py {% include_example python/ml/logistic_regression_with_elastic_net.py %} +
+
+More details on parameters can be found in the [R API documentation](api/R/spark.logit.html).
+
+{% include_example binomial r/ml/logit.R %}
+
+ The `spark.ml` implementation of logistic regression also supports @@ -171,6 +178,13 @@ model with elastic net regularization. {% include_example python/ml/multiclass_logistic_regression_with_elastic_net.py %} +
+
+More details on parameters can be found in the [R API documentation](api/R/spark.logit.html).
+
+{% include_example multinomial r/ml/logit.R %}
+
+ @@ -242,6 +256,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat {% include_example python/ml/random_forest_classifier_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.randomForest.html) for more details.
+
+{% include_example classification r/ml/randomForest.R %}
+
+ ## Gradient-boosted tree classifier @@ -275,6 +297,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat {% include_example python/ml/gradient_boosted_tree_classifier_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.gbt.html) for more details.
+
+{% include_example classification r/ml/gbt.R %}
+
+ ## Multilayer perceptron classifier @@ -324,6 +354,13 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat {% include_example python/ml/multilayer_perceptron_classification.py %} +
+
+Refer to the [R API docs](api/R/spark.mlp.html) for more details.
+
+{% include_example r/ml/mlp.R %}
+
+ @@ -400,7 +437,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat Refer to the [R API docs](api/R/spark.naiveBayes.html) for more details. -{% include_example naiveBayes r/ml.R %} +{% include_example r/ml/naiveBayes.R %} @@ -584,7 +621,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. Refer to the [R API docs](api/R/spark.glm.html) for more details. -{% include_example glm r/ml.R %} +{% include_example r/ml/glm.R %} @@ -656,6 +693,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. {% include_example python/ml/random_forest_regressor_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.randomForest.html) for more details.
+
+{% include_example regression r/ml/randomForest.R %}
+
+ ## Gradient-boosted tree regression @@ -689,6 +734,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. {% include_example python/ml/gradient_boosted_tree_regressor_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.gbt.html) for more details.
+
+{% include_example regression r/ml/gbt.R %}
+
+ @@ -780,7 +833,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. Refer to the [R API docs](api/R/spark.survreg.html) for more details. -{% include_example survreg r/ml.R %} +{% include_example r/ml/survreg.R %} @@ -853,6 +906,14 @@ Refer to the [`IsotonicRegression` Python docs](api/python/pyspark.ml.html#pyspa {% include_example python/ml/isotonic_regression_example.py %} + +
+
+Refer to the [`IsotonicRegression` R API docs](api/R/spark.isoreg.html) for more details on the API.
+
+{% include_example r/ml/isoreg.R %}
+
+ # Linear methods diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index da23442555aad..d8b6553c5b846 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -91,7 +91,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. Refer to the [R API docs](api/R/spark.kmeans.html) for more details. -{% include_example kmeans r/ml.R %} +{% include_example r/ml/kmeans.R %} @@ -126,6 +126,14 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/lda_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.lda.html) for more details.
+
+{% include_example r/ml/lda.R %}
+
+ ## Bisecting k-means @@ -241,4 +249,12 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering. {% include_example python/ml/gaussian_mixture_example.py %} + +
+
+Refer to the [R API docs](api/R/spark.gaussianMixture.html) for more details.
+
+{% include_example r/ml/gaussianMixture.R %}
+
+ diff --git a/docs/ml-collaborative-filtering.md b/docs/ml-collaborative-filtering.md index 4d19b4069a1f2..cfe835172ab45 100644 --- a/docs/ml-collaborative-filtering.md +++ b/docs/ml-collaborative-filtering.md @@ -149,4 +149,12 @@ als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True, {% endhighlight %} + +
+
+Refer to the [R API docs](api/R/spark.als.html) for more details.
+
+{% include_example r/ml/als.R %}
+
+
+
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 60cd01a9fea71..d2db78282aa8f 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -512,39 +512,33 @@ head(teenagers)
 # Machine Learning
-SparkR supports the following machine learning algorithms currently: `Generalized Linear Model`, `Accelerated Failure Time (AFT) Survival Regression Model`, `Naive Bayes Model` and `KMeans Model`.
-Under the hood, SparkR uses MLlib to train the model.
-Users can call `summary` to print a summary of the fitted model, [predict](api/R/predict.html) to make predictions on new data, and [write.ml](api/R/write.ml.html)/[read.ml](api/R/read.ml.html) to save/load fitted models.
-SparkR supports a subset of the available R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-‘.
-
 ## Algorithms
-### Generalized Linear Model
-
-[spark.glm()](api/R/spark.glm.html) or [glm()](api/R/glm.html) fits generalized linear model against a Spark DataFrame.
-Currently "gaussian", "binomial", "poisson" and "gamma" families are supported.
-{% include_example glm r/ml.R %}
-
-### Accelerated Failure Time (AFT) Survival Regression Model
-
-[spark.survreg()](api/R/spark.survreg.html) fits an accelerated failure time (AFT) survival regression model on a SparkDataFrame.
-Note that the formula of [spark.survreg()](api/R/spark.survreg.html) does not support operator '.' currently.
-{% include_example survreg r/ml.R %}
-
-### Naive Bayes Model
-
-[spark.naiveBayes()](api/R/spark.naiveBayes.html) fits a Bernoulli naive Bayes model against a SparkDataFrame. Only categorical data is supported.
-{% include_example naiveBayes r/ml.R %}
-
-### KMeans Model
+SparkR currently supports the following machine learning algorithms:
+
+* [`spark.glm`](api/R/spark.glm.html) or [`glm`](api/R/glm.html): [`Generalized Linear Model`](ml-classification-regression.html#generalized-linear-regression)
+* [`spark.survreg`](api/R/spark.survreg.html): [`Accelerated Failure Time (AFT) Survival Regression Model`](ml-classification-regression.html#survival-regression)
+* [`spark.naiveBayes`](api/R/spark.naiveBayes.html): [`Naive Bayes Model`](ml-classification-regression.html#naive-bayes)
+* [`spark.kmeans`](api/R/spark.kmeans.html): [`K-Means Model`](ml-clustering.html#k-means)
+* [`spark.logit`](api/R/spark.logit.html): [`Logistic Regression Model`](ml-classification-regression.html#logistic-regression)
+* [`spark.isoreg`](api/R/spark.isoreg.html): [`Isotonic Regression Model`](ml-classification-regression.html#isotonic-regression)
+* [`spark.gaussianMixture`](api/R/spark.gaussianMixture.html): [`Gaussian Mixture Model`](ml-clustering.html#gaussian-mixture-model-gmm)
+* [`spark.lda`](api/R/spark.lda.html): [`Latent Dirichlet Allocation (LDA) Model`](ml-clustering.html#latent-dirichlet-allocation-lda)
+* [`spark.mlp`](api/R/spark.mlp.html): [`Multilayer Perceptron Classification Model`](ml-classification-regression.html#multilayer-perceptron-classifier)
+* [`spark.gbt`](api/R/spark.gbt.html): `Gradient Boosted Tree Model` for [`Regression`](ml-classification-regression.html#gradient-boosted-tree-regression) and [`Classification`](ml-classification-regression.html#gradient-boosted-tree-classifier)
+* [`spark.randomForest`](api/R/spark.randomForest.html): `Random Forest Model` for [`Regression`](ml-classification-regression.html#random-forest-regression) and [`Classification`](ml-classification-regression.html#random-forest-classifier)
+* [`spark.als`](api/R/spark.als.html): [`Alternating Least Squares (ALS) matrix factorization Model`](ml-collaborative-filtering.html#collaborative-filtering)
+* [`spark.kstest`](api/R/spark.kstest.html): `Kolmogorov-Smirnov Test`
+
+Under the hood, SparkR uses MLlib to train the model. Please refer to the corresponding section of the MLlib user guide for example code.
+Users can call `summary` to print a summary of the fitted model, [predict](api/R/predict.html) to make predictions on new data, and [write.ml](api/R/write.ml.html)/[read.ml](api/R/read.ml.html) to save/load fitted models.
+SparkR supports a subset of the available R formula operators for model fitting, including ‘~’, ‘.’, ‘:’, ‘+’, and ‘-’.
-
-[spark.kmeans()](api/R/spark.kmeans.html) fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans().
-{% include_example kmeans r/ml.R %}
 ## Model persistence
 The following example shows how to save/load a MLlib model by SparkR.
-{% include_example read_write r/ml.R %}
+{% include_example read_write r/ml/ml.R %}
 # R Function Name Conflicts
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
deleted file mode 100644
index a8a1274ac902a..0000000000000
--- a/examples/src/main/r/ml.R
+++ /dev/null
@@ -1,148 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -# To run this example use -# ./bin/spark-submit examples/src/main/r/ml.R - -# Load SparkR library into your R session -library(SparkR) - -# Initialize SparkSession -sparkR.session(appName = "SparkR-ML-example") - -############################ spark.glm and glm ############################################## -# $example on:glm$ -irisDF <- suppressWarnings(createDataFrame(iris)) -# Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") - -# Model summary -summary(gaussianGLM) - -# Prediction -gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) -showDF(gaussianPredictions) - -# Fit a generalized linear model with glm (R-compliant) -gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") -summary(gaussianGLM2) - -# Fit a generalized linear model of family "binomial" with spark.glm -binomialDF <- filter(irisDF, irisDF$Species != "setosa") -binomialTestDF <- binomialDF -binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") - -# Model summary -summary(binomialGLM) - -# Prediction -binomialPredictions <- predict(binomialGLM, binomialTestDF) -showDF(binomialPredictions) -# $example off:glm$ -############################ spark.survreg ############################################## -# $example on:survreg$ -# Use the ovarian dataset available in R survival package -library(survival) - -# Fit an accelerated failure time (AFT) survival regression model with spark.survreg -ovarianDF <- suppressWarnings(createDataFrame(ovarian)) -aftDF <- ovarianDF -aftTestDF <- ovarianDF -aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx) - -# Model summary -summary(aftModel) - -# Prediction -aftPredictions <- predict(aftModel, aftTestDF) -showDF(aftPredictions) -# $example off:survreg$ -############################ spark.naiveBayes ############################################## -# $example on:naiveBayes$ -# Fit a Bernoulli naive Bayes model with spark.naiveBayes -titanic <- as.data.frame(Titanic) -titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) -nbDF <- titanicDF -nbTestDF <- titanicDF -nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age) - -# Model summary -summary(nbModel) - -# Prediction -nbPredictions <- predict(nbModel, nbTestDF) -showDF(nbPredictions) -# $example off:naiveBayes$ -############################ spark.kmeans ############################################## -# $example on:kmeans$ -# Fit a k-means model with spark.kmeans -irisDF <- suppressWarnings(createDataFrame(iris)) -kmeansDF <- irisDF -kmeansTestDF <- irisDF -kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, - k = 3) - -# Model summary -summary(kmeansModel) - -# Get fitted result from the k-means model -showDF(fitted(kmeansModel)) - -# Prediction -kmeansPredictions <- predict(kmeansModel, kmeansTestDF) -showDF(kmeansPredictions) -# $example off:kmeans$ -############################ model read/write ############################################## -# $example on:read_write$ -irisDF <- suppressWarnings(createDataFrame(iris)) -# Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") - -# Save and then load a fitted MLlib model -modelPath <- tempfile(pattern = "ml", 
fileext = ".tmp") -write.ml(gaussianGLM, modelPath) -gaussianGLM2 <- read.ml(modelPath) - -# Check model summary -summary(gaussianGLM2) - -# Check model prediction -gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF) -showDF(gaussianPredictions) - -unlink(modelPath) -# $example off:read_write$ -############################ fit models with spark.lapply ##################################### - -# Perform distributed training of multiple models with spark.lapply -families <- c("gaussian", "poisson") -train <- function(family) { - model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family) - summary(model) -} -model.summaries <- spark.lapply(families, train) - -# Print the summary of each model -print(model.summaries) - - -# Stop the SparkSession now -sparkR.session.stop() diff --git a/examples/src/main/r/ml/als.R b/examples/src/main/r/ml/als.R new file mode 100644 index 0000000000000..383bbba1908eb --- /dev/null +++ b/examples/src/main/r/ml/als.R @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/als.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-als-example") + +# $example on$ +# Load training data +data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), + list(1, 2, 4.0), list(2, 1, 1.0), list(2, 2, 5.0)) +df <- createDataFrame(data, c("userId", "movieId", "rating")) +training <- df +test <- df + +# Fit a recommendation model using ALS with spark.als +model <- spark.als(training, maxIter = 5, regParam = 0.01, userCol = "userId", + itemCol = "movieId", ratingCol = "rating") + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off$ diff --git a/examples/src/main/r/ml/gaussianMixture.R b/examples/src/main/r/ml/gaussianMixture.R new file mode 100644 index 0000000000000..54b69acc83d97 --- /dev/null +++ b/examples/src/main/r/ml/gaussianMixture.R @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/gaussianMixture.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-gaussianMixture-example") + +# $example on$ +# Load training data +df <- read.df("data/mllib/sample_kmeans_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a gaussian mixture clustering model with spark.gaussianMixture +model <- spark.gaussianMixture(training, ~ features, k = 2) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off$ diff --git a/examples/src/main/r/ml/gbt.R b/examples/src/main/r/ml/gbt.R new file mode 100644 index 0000000000000..be16c2aa66330 --- /dev/null +++ b/examples/src/main/r/ml/gbt.R @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/gbt.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-gbt-example") + +# GBT classification model + +# $example on:classification$ +# Load training data +df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a GBT classification model with spark.gbt +model <- spark.gbt(training, label ~ features, "classification", maxIter = 10) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off:classification$ + +# GBT regression model + +# $example on:regression$ +# Load training data +df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a GBT regression model with spark.gbt +model <- spark.gbt(training, label ~ features, "regression", maxIter = 10) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off:regression$ diff --git a/examples/src/main/r/ml/glm.R b/examples/src/main/r/ml/glm.R new file mode 100644 index 0000000000000..599071790a2c3 --- /dev/null +++ b/examples/src/main/r/ml/glm.R @@ -0,0 +1,57 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/glm.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-glm-example") + +# $example on$ +irisDF <- suppressWarnings(createDataFrame(iris)) +# Fit a generalized linear model of family "gaussian" with spark.glm +gaussianDF <- irisDF +gaussianTestDF <- irisDF +gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") + +# Model summary +summary(gaussianGLM) + +# Prediction +gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) +showDF(gaussianPredictions) + +# Fit a generalized linear model with glm (R-compliant) +gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") +summary(gaussianGLM2) + +# Fit a generalized linear model of family "binomial" with spark.glm +# Note: Filter out "setosa" from label column (two labels left) to match "binomial" family. +binomialDF <- filter(irisDF, irisDF$Species != "setosa") +binomialTestDF <- binomialDF +binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") + +# Model summary +summary(binomialGLM) + +# Prediction +binomialPredictions <- predict(binomialGLM, binomialTestDF) +showDF(binomialPredictions) +# $example off$ diff --git a/examples/src/main/r/ml/isoreg.R b/examples/src/main/r/ml/isoreg.R new file mode 100644 index 0000000000000..75dce97ed9931 --- /dev/null +++ b/examples/src/main/r/ml/isoreg.R @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/isoreg.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-isoreg-example") + +# $example on$ +# Load training data +df <- read.df("data/mllib/sample_isotonic_regression_libsvm_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit an isotonic regression model with spark.isoreg +model <- spark.isoreg(training, label ~ features, isotonic = FALSE) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off$ diff --git a/examples/src/main/r/ml/kmeans.R b/examples/src/main/r/ml/kmeans.R new file mode 100644 index 0000000000000..043b21b0385d7 --- /dev/null +++ b/examples/src/main/r/ml/kmeans.R @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/kmeans.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-kmeans-example") + +# $example on$ +# Fit a k-means model with spark.kmeans +irisDF <- suppressWarnings(createDataFrame(iris)) +kmeansDF <- irisDF +kmeansTestDF <- irisDF +kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, + k = 3) + +# Model summary +summary(kmeansModel) + +# Get fitted result from the k-means model +showDF(fitted(kmeansModel)) + +# Prediction +kmeansPredictions <- predict(kmeansModel, kmeansTestDF) +showDF(kmeansPredictions) +# $example off$ diff --git a/examples/src/main/r/ml/kstest.R b/examples/src/main/r/ml/kstest.R new file mode 100644 index 0000000000000..12625f7d3e635 --- /dev/null +++ b/examples/src/main/r/ml/kstest.R @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/kstest.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize SparkSession
+sparkR.session(appName = "SparkR-ML-kstest-example")
+
+# $example on$
+# Load training data
+data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
+df <- createDataFrame(data)
+training <- df
+test <- df
+
+# Conduct the two-sided Kolmogorov-Smirnov (KS) test with spark.kstest
+model <- spark.kstest(df, "test", "norm")
+
+# Model summary
+summary(model)
+# $example off$
diff --git a/examples/src/main/r/ml/lda.R b/examples/src/main/r/ml/lda.R
new file mode 100644
index 0000000000000..7b187d155a4cb
--- /dev/null
+++ b/examples/src/main/r/ml/lda.R
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/lda.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize SparkSession
+sparkR.session(appName = "SparkR-ML-lda-example")
+
+# $example on$
+# Load training data
+df <- read.df("data/mllib/sample_lda_libsvm_data.txt", source = "libsvm")
+training <- df
+test <- df
+
+# Fit a latent Dirichlet allocation (LDA) model with spark.lda
+model <- spark.lda(training, k = 10, maxIter = 10)
+
+# Model summary
+summary(model)
+
+# Posterior probabilities
+posterior <- spark.posterior(model, test)
+showDF(posterior)
+
+# The log perplexity of the LDA model
+logPerplexity <- spark.perplexity(model, test)
+print(paste0("The upper bound on perplexity: ", logPerplexity))
+# $example off$
diff --git a/examples/src/main/r/ml/logit.R b/examples/src/main/r/ml/logit.R
new file mode 100644
index 0000000000000..a2ac882ed022c
--- /dev/null
+++ b/examples/src/main/r/ml/logit.R
@@ -0,0 +1,63 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/logit.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize SparkSession
+sparkR.session(appName = "SparkR-ML-logit-example")
+
+# Binomial logistic regression
+
+# $example on:binomial$
+# Load training data
+df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
+training <- df
+test <- df
+
+# Fit a binomial logistic regression model with spark.logit
+model <- spark.logit(training, label ~ features, maxIter = 10, regParam = 0.3, elasticNetParam = 0.8)
+
+# Model summary
+summary(model)
+
+# Prediction
+predictions <- predict(model, test)
+showDF(predictions)
+# $example off:binomial$
+
+# Multinomial logistic regression
+
+# $example on:multinomial$
+# Load training data
+df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
+training <- df
+test <- df
+
+# Fit a multinomial logistic regression model with spark.logit
+model <- spark.logit(training, label ~ features, maxIter = 10, regParam = 0.3, elasticNetParam = 0.8)
+
+# Model summary
+summary(model)
+
+# Prediction
+predictions <- predict(model, test)
+showDF(predictions)
+# $example off:multinomial$
diff --git a/examples/src/main/r/ml/ml.R b/examples/src/main/r/ml/ml.R
new file mode 100644
index 0000000000000..d601590c22a89
--- /dev/null
+++ b/examples/src/main/r/ml/ml.R
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/ml.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-example") + +############################ model read/write ############################################## +# $example on:read_write$ +irisDF <- suppressWarnings(createDataFrame(iris)) +# Fit a generalized linear model of family "gaussian" with spark.glm +gaussianDF <- irisDF +gaussianTestDF <- irisDF +gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") + +# Save and then load a fitted MLlib model +modelPath <- tempfile(pattern = "ml", fileext = ".tmp") +write.ml(gaussianGLM, modelPath) +gaussianGLM2 <- read.ml(modelPath) + +# Check model summary +summary(gaussianGLM2) + +# Check model prediction +gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF) +showDF(gaussianPredictions) + +unlink(modelPath) +# $example off:read_write$ + +############################ fit models with spark.lapply ##################################### +# Perform distributed training of multiple models with spark.lapply +costs <- exp(seq(from = log(1), to = log(1000), length.out = 5)) +train <- function(cost) { + stopifnot(requireNamespace("e1071", quietly = TRUE)) + model <- e1071::svm(Species ~ ., data = iris, cost = cost) + summary(model) +} + +model.summaries <- spark.lapply(costs, train) + +# Print the summary of each model +print(model.summaries) + +# Stop the SparkSession now +sparkR.session.stop() diff --git a/examples/src/main/r/ml/mlp.R b/examples/src/main/r/ml/mlp.R new file mode 100644 index 0000000000000..d28fc069bd118 --- /dev/null +++ b/examples/src/main/r/ml/mlp.R @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/mlp.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-mlp-example") + +# $example on$ +# Load training data +df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") +training <- df +test <- df + +# specify layers for the neural network: +# input layer of size 4 (features), two intermediate of size 5 and 4 +# and output of size 3 (classes) +layers = c(4, 5, 4, 3) + +# Fit a multi-layer perceptron neural network model with spark.mlp +model <- spark.mlp(training, label ~ features, maxIter = 100, + layers = layers, blockSize = 128, seed = 1234) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off$ diff --git a/examples/src/main/r/ml/naiveBayes.R b/examples/src/main/r/ml/naiveBayes.R new file mode 100644 index 0000000000000..9c416599b4d78 --- /dev/null +++ b/examples/src/main/r/ml/naiveBayes.R @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/naiveBayes.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-naiveBayes-example") + +# $example on$ +# Fit a Bernoulli naive Bayes model with spark.naiveBayes +titanic <- as.data.frame(Titanic) +titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) +nbDF <- titanicDF +nbTestDF <- titanicDF +nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age) + +# Model summary +summary(nbModel) + +# Prediction +nbPredictions <- predict(nbModel, nbTestDF) +showDF(nbPredictions) +# $example off$ diff --git a/examples/src/main/r/ml/randomForest.R b/examples/src/main/r/ml/randomForest.R new file mode 100644 index 0000000000000..d1b96b62a0e3b --- /dev/null +++ b/examples/src/main/r/ml/randomForest.R @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/randomForest.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-randomForest-example") + +# Random forest classification model + +# $example on:classification$ +# Load training data +df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a random forest classification model with spark.randomForest +model <- spark.randomForest(training, label ~ features, "classification", numTrees = 10) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off:classification$ + +# Random forest regression model + +# $example on:regression$ +# Load training data +df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a random forest regression model with spark.randomForest +model <- spark.randomForest(training, label ~ features, "regression", numTrees = 10) + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +showDF(predictions) +# $example off:regression$ diff --git a/examples/src/main/r/ml/survreg.R b/examples/src/main/r/ml/survreg.R new file mode 100644 index 0000000000000..f728b8b5d8c06 --- /dev/null +++ b/examples/src/main/r/ml/survreg.R @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/survreg.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-survreg-example") + +# $example on$ +# Use the ovarian dataset available in R survival package +library(survival) + +# Fit an accelerated failure time (AFT) survival regression model with spark.survreg +ovarianDF <- suppressWarnings(createDataFrame(ovarian)) +aftDF <- ovarianDF +aftTestDF <- ovarianDF +aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx) + +# Model summary +summary(aftModel) + +# Prediction +aftPredictions <- predict(aftModel, aftTestDF) +showDF(aftPredictions) +# $example off$
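
For a quick, end-to-end view of the workflow that the updated `sparkr.md` section describes (fit a model with an R formula, inspect it with `summary`, score new data with `predict`, and persist it with `write.ml`/`read.ml`), the following minimal sketch is distilled from the `glm.R` and `ml.R` examples added above. It assumes SparkR is installed and a local Spark session can be started; the application name is arbitrary.

```r
# Minimal SparkR ML workflow sketch, assembled from the glm.R and ml.R examples above
library(SparkR)
sparkR.session(appName = "SparkR-ML-workflow-sketch")  # hypothetical app name

# Create a SparkDataFrame from a local R data.frame
irisDF <- suppressWarnings(createDataFrame(iris))

# Fit a model using an R formula; SparkR supports '~', '.', ':', '+', and '-'
gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")

# Inspect the fitted model and score data
summary(gaussianGLM)
showDF(predict(gaussianGLM, irisDF))

# Save the fitted model, then load it back and reuse it
modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
write.ml(gaussianGLM, modelPath)
gaussianGLM2 <- read.ml(modelPath)
summary(gaussianGLM2)

unlink(modelPath)
sparkR.session.stop()
```

The same fit/`summary`/`predict`/`write.ml` pattern applies to the other `spark.*` functions listed in `sparkr.md` (for example `spark.kmeans`, `spark.logit`, `spark.randomForest`); see the corresponding example files added above.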