From fabf1749995103841e6a3975892572f376ee48d0 Mon Sep 17 00:00:00 2001 From: Martin Jaggi Date: Sat, 8 Feb 2014 11:39:13 -0800 Subject: [PATCH 001/127] Merge pull request #552 from martinjaggi/master. Closes #552. tex formulas in the documentation using mathjax. and spliting the MLlib documentation by techniques see jira https://spark-project.atlassian.net/browse/MLLIB-19 and https://github.com/shivaram/spark/compare/mathjax Author: Martin Jaggi == Merge branch commits == commit 0364bfabbfc347f917216057a20c39b631842481 Author: Martin Jaggi Date: Fri Feb 7 03:19:38 2014 +0100 minor polishing, as suggested by @pwendell commit dcd2142c164b2f602bf472bb152ad55bae82d31a Author: Martin Jaggi Date: Thu Feb 6 18:04:26 2014 +0100 enabling inline latex formulas with $.$ same mathjax configuration as used in math.stackexchange.com sample usage in the linear algebra (SVD) documentation commit bbafafd2b497a5acaa03a140bb9de1fbb7d67ffa Author: Martin Jaggi Date: Thu Feb 6 17:31:29 2014 +0100 split MLlib documentation by techniques and linked from the main mllib-guide.md site commit d1c5212b93c67436543c2d8ddbbf610fdf0a26eb Author: Martin Jaggi Date: Thu Feb 6 16:59:43 2014 +0100 enable mathjax formula in the .md documentation files code by @shivaram commit d73948db0d9bc36296054e79fec5b1a657b4eab4 Author: Martin Jaggi Date: Thu Feb 6 16:57:23 2014 +0100 minor update on how to compile the documentation --- docs/README.md | 4 +- docs/_layouts/global.html | 13 + docs/css/main.css | 8 + docs/mllib-classification-regression.md | 206 ++++++++++ docs/mllib-clustering.md | 106 +++++ docs/mllib-collaborative-filtering.md | 130 +++++++ docs/mllib-guide.md | 490 +----------------------- docs/mllib-linear-algebra.md | 61 +++ docs/mllib-optimization.md | 40 ++ 9 files changed, 586 insertions(+), 472 deletions(-) create mode 100644 docs/mllib-classification-regression.md create mode 100644 docs/mllib-clustering.md create mode 100644 docs/mllib-collaborative-filtering.md create mode 100644 docs/mllib-linear-algebra.md create mode 100644 docs/mllib-optimization.md diff --git a/docs/README.md b/docs/README.md index dfcf7535538f0..cc09d6e88f41e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,9 +10,9 @@ We include the Spark documentation as part of the source (as opposed to using a In this directory you will find textfiles formatted using Markdown, with an ".md" suffix. You can read those text files directly if you want. Start with index.md. -To make things quite a bit prettier and make the links easier to follow, generate the html version of the documentation based on the src directory by running `jekyll` in the docs directory. Use the command `SKIP_SCALADOC=1 jekyll` to skip building and copying over the scaladoc which can be timely. To use the `jekyll` command, you will need to have Jekyll installed, the easiest way to do this is via a Ruby Gem, see the [jekyll installation instructions](https://github.com/mojombo/jekyll/wiki/install). This will create a directory called _site containing index.html as well as the rest of the compiled files. Read more about Jekyll at https://github.com/mojombo/jekyll/wiki. +To make things quite a bit prettier and make the links easier to follow, generate the html version of the documentation based on the src directory by running `jekyll build` in the docs directory. Use the command `SKIP_SCALADOC=1 jekyll build` to skip building and copying over the scaladoc which can be timely. 
To use the `jekyll` command, you will need to have Jekyll installed; the easiest way to do this is via a Ruby Gem (see the [jekyll installation instructions](http://jekyllrb.com/docs/installation)). This will create a directory called _site containing index.html as well as the rest of the compiled files. Read more about Jekyll at https://github.com/mojombo/jekyll/wiki. -In addition to generating the site as html from the markdown files, jekyll can serve up the site via a webserver. To build and run a webserver use the command `jekyll --server` which (currently) runs the webserver on port 4000, then visit the site at http://localhost:4000. +In addition to generating the site as html from the markdown files, jekyll can serve up the site via a webserver. To build and run a local webserver, use the command `jekyll serve` (or the faster variant `SKIP_SCALADOC=1 jekyll serve`), which runs the webserver on port 4000; then visit the site at http://localhost:4000. ## Pygments diff --git a/docs/_layouts/global.html index 33525953ac4f6..b65686c0b1bb4 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -195,4 +195,17 @@


+ + + diff --git a/docs/css/main.css b/docs/css/main.css index 8566400f071c9..f6fe7d5f07da1 100755 --- a/docs/css/main.css +++ b/docs/css/main.css @@ -138,3 +138,11 @@ ul.nav li.dropdown ul.dropdown-menu li.dropdown-submenu ul.dropdown-menu { .nav-tabs > li > a, .nav-tabs > li > a:hover { color: #333; } + +/** + * MathJax (embedded latex formulas) + */ +.MathJax .mo { color: inherit } +.MathJax .mi { color: inherit } +.MathJax .mf { color: inherit } +.MathJax .mh { color: inherit } diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md new file mode 100644 index 0000000000000..edb93389079d2 --- /dev/null +++ b/docs/mllib-classification-regression.md @@ -0,0 +1,206 @@ +--- +layout: global +title: MLlib - Classification and Regression +--- + +* Table of contents +{:toc} + + +# Binary Classification + +Binary classification is a supervised learning problem in which we want to +classify entities into one of two distinct categories or labels, e.g., +predicting whether or not emails are spam. This problem involves executing a +learning *Algorithm* on a set of *labeled* examples, i.e., a set of entities +represented via (numerical) features along with underlying category labels. +The algorithm returns a trained *Model* that can predict the label for new +entities for which the underlying label is unknown. + +MLlib currently supports two standard model families for binary classification, +namely [Linear Support Vector Machines +(SVMs)](http://en.wikipedia.org/wiki/Support_vector_machine) and [Logistic +Regression](http://en.wikipedia.org/wiki/Logistic_regression), along with [L1 +and L2 regularized](http://en.wikipedia.org/wiki/Regularization_(mathematics)) +variants of each model family. The training algorithms all leverage an +underlying gradient descent primitive (described +[below](#gradient-descent-primitive)), and take as input a regularization +parameter (*regParam*) along with various parameters associated with gradient +descent (*stepSize*, *numIterations*, *miniBatchFraction*). + +Available algorithms for binary classification: + +* [SVMWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.SVMWithSGD) +* [LogisticRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) + +# Linear Regression + +Linear regression is another classical supervised learning setting. In this +problem, each entity is associated with a real-valued label (as opposed to a +binary label as in binary classification), and we want to predict labels as +closely as possible given numerical features representing entities. MLlib +supports linear regression as well as L1 +([lasso](http://en.wikipedia.org/wiki/Lasso_(statistics)#Lasso_method)) and L2 +([ridge](http://en.wikipedia.org/wiki/Ridge_regression)) regularized variants. +The regression algorithms in MLlib also leverage the underlying gradient +descent primitive (described [below](#gradient-descent-primitive)), and have +the same parameters as the binary classification algorithms described above. 
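For orientation, the models above can all be viewed as minimizing a regularized empirical loss over a weight vector `$w$`; this is the standard textbook formulation, stated here only as a sketch rather than as the exact objective implemented by each MLlib algorithm:

`\[
\min_{w} \; \frac{1}{n} \sum_{i=1}^{n} L(w; x_i, y_i) + \lambda \, R(w)
\]`

Here `$L$` is the loss associated with the model (hinge loss for SVMs, logistic loss for logistic regression, squared loss for the regression methods), `$R(w)$` is either the L1 norm `$\|w\|_1$` or the squared L2 norm `$\|w\|_2^2$`, and `$\lambda$` plays roughly the role of the *regParam* parameter mentioned above.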
+ +Available algorithms for linear regression: + +* [LinearRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) +* [RidgeRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) +* [LassoWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LassoWithSGD) + +Behind the scenes, all above methods use the SGD implementation from the +gradient descent primitive in MLlib, see the +optimization part: + +* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent) + + +# Usage in Scala + +Following code snippets can be executed in `spark-shell`. + +## Binary Classification + +The following code snippet illustrates how to load a sample dataset, execute a +training algorithm on this training data using a static method in the algorithm +object, and make predictions with the resulting model to compute the training +error. + +{% highlight scala %} +import org.apache.spark.SparkContext +import org.apache.spark.mllib.classification.SVMWithSGD +import org.apache.spark.mllib.regression.LabeledPoint + +// Load and parse the data file +val data = sc.textFile("mllib/data/sample_svm_data.txt") +val parsedData = data.map { line => + val parts = line.split(' ') + LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray) +} + +// Run training algorithm to build the model +val numIterations = 20 +val model = SVMWithSGD.train(parsedData, numIterations) + +// Evaluate model on training examples and compute training error +val labelAndPreds = parsedData.map { point => + val prediction = model.predict(point.features) + (point.label, prediction) +} +val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count +println("Training Error = " + trainErr) +{% endhighlight %} + + +The `SVMWithSGD.train()` method by default performs L2 regularization with the +regularization parameter set to 1.0. If we want to configure this algorithm, we +can customize `SVMWithSGD` further by creating a new object directly and +calling setter methods. All other MLlib algorithms support customization in +this way as well. For example, the following code produces an L1 regularized +variant of SVMs with regularization parameter set to 0.1, and runs the training +algorithm for 200 iterations. + +{% highlight scala %} +import org.apache.spark.mllib.optimization.L1Updater + +val svmAlg = new SVMWithSGD() +svmAlg.optimizer.setNumIterations(200) + .setRegParam(0.1) + .setUpdater(new L1Updater) +val modelL1 = svmAlg.run(parsedData) +{% endhighlight %} + +## Linear Regression +The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint. The +example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. 
We +compute the Mean Squared Error at the end to evaluate +[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit) + +{% highlight scala %} +import org.apache.spark.mllib.regression.LinearRegressionWithSGD +import org.apache.spark.mllib.regression.LabeledPoint + +// Load and parse the data +val data = sc.textFile("mllib/data/ridge-data/lpsa.data") +val parsedData = data.map { line => + val parts = line.split(',') + LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray) +} + +// Building the model +val numIterations = 20 +val model = LinearRegressionWithSGD.train(parsedData, numIterations) + +// Evaluate model on training examples and compute training error +val valuesAndPreds = parsedData.map { point => + val prediction = model.predict(point.features) + (point.label, prediction) +} +val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count +println("training Mean Squared Error = " + MSE) +{% endhighlight %} + + +Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training +[Mean Squared Errors](http://en.wikipedia.org/wiki/Mean_squared_error). + + +# Usage in Java + +All of MLlib's methods use Java-friendly types, so you can import and call them there the same +way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the +Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by +calling `.rdd()` on your `JavaRDD` object. + +# Usage in Python +Following examples can be tested in the PySpark shell. + +## Binary Classification +The following example shows how to load a sample dataset, build Logistic Regression model, +and make predictions with the resulting model to compute the training error. + +{% highlight python %} +from pyspark.mllib.classification import LogisticRegressionWithSGD +from numpy import array + +# Load and parse the data +data = sc.textFile("mllib/data/sample_svm_data.txt") +parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) +model = LogisticRegressionWithSGD.train(parsedData) + +# Build the model +labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)), + model.predict(point.take(range(1, point.size))))) + +# Evaluating the model on training data +trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) +print("Training Error = " + str(trainErr)) +{% endhighlight %} + +## Linear Regression +The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint. The +example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. 
We +compute the Mean Squared Error at the end to evaluate +[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit) + +{% highlight python %} +from pyspark.mllib.regression import LinearRegressionWithSGD +from numpy import array + +# Load and parse the data +data = sc.textFile("mllib/data/ridge-data/lpsa.data") +parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')])) + +# Build the model +model = LinearRegressionWithSGD.train(parsedData) + +# Evaluate the model on training data +valuesAndPreds = parsedData.map(lambda point: (point.item(0), + model.predict(point.take(range(1, point.size))))) +MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count() +print("Mean Squared Error = " + str(MSE)) +{% endhighlight %} diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md new file mode 100644 index 0000000000000..65ed75b82ea5b --- /dev/null +++ b/docs/mllib-clustering.md @@ -0,0 +1,106 @@ +--- +layout: global +title: MLlib - Clustering +--- + +* Table of contents +{:toc} + + +# Clustering + +Clustering is an unsupervised learning problem whereby we aim to group subsets +of entities with one another based on some notion of similarity. Clustering is +often used for exploratory analysis and/or as a component of a hierarchical +supervised learning pipeline (in which distinct classifiers or regression +models are trained for each cluster). MLlib supports +[k-means](http://en.wikipedia.org/wiki/K-means_clustering) clustering, one of +the most commonly used clustering algorithms that clusters the data points into +predfined number of clusters. The MLlib implementation includes a parallelized +variant of the [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) method +called [kmeans||](http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf). +The implementation in MLlib has the following parameters: + +* *k* is the number of desired clusters. +* *maxIterations* is the maximum number of iterations to run. +* *initializationMode* specifies either random initialization or +initialization via k-means\|\|. +* *runs* is the number of times to run the k-means algorithm (k-means is not +guaranteed to find a globally optimal solution, and when run multiple times on +a given dataset, the algorithm returns the best clustering result). +* *initializiationSteps* determines the number of steps in the k-means\|\| algorithm. +* *epsilon* determines the distance threshold within which we consider k-means to have converged. + +Available algorithms for clustering: + +* [KMeans](api/mllib/index.html#org.apache.spark.mllib.clustering.KMeans) + + + +# Usage in Scala + +Following code snippets can be executed in `spark-shell`. + +In the following example after loading and parsing data, we use the KMeans object to cluster the data +into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within +Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the +optimal *k* is usually one where there is an "elbow" in the WSSSE graph. 
+ +{% highlight scala %} +import org.apache.spark.mllib.clustering.KMeans + +// Load and parse the data +val data = sc.textFile("kmeans_data.txt") +val parsedData = data.map( _.split(' ').map(_.toDouble)) + +// Cluster the data into two classes using KMeans +val numIterations = 20 +val numClusters = 2 +val clusters = KMeans.train(parsedData, numClusters, numIterations) + +// Evaluate clustering by computing Within Set Sum of Squared Errors +val WSSSE = clusters.computeCost(parsedData) +println("Within Set Sum of Squared Errors = " + WSSSE) +{% endhighlight %} + + +# Usage in Java + +All of MLlib's methods use Java-friendly types, so you can import and call them there the same +way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the +Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by +calling `.rdd()` on your `JavaRDD` object. + +# Usage in Python +Following examples can be tested in the PySpark shell. + +In the following example after loading and parsing data, we use the KMeans object to cluster the data +into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within +Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the +optimal *k* is usually one where there is an "elbow" in the WSSSE graph. + +{% highlight python %} +from pyspark.mllib.clustering import KMeans +from numpy import array +from math import sqrt + +# Load and parse the data +data = sc.textFile("kmeans_data.txt") +parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) + +# Build the model (cluster the data) +clusters = KMeans.train(parsedData, 2, maxIterations=10, + runs=30, initialization_mode="random") + +# Evaluate clustering by computing Within Set Sum of Squared Errors +def error(point): + center = clusters.centers[clusters.predict(point)] + return sqrt(sum([x**2 for x in (point - center)])) + +WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) +print("Within Set Sum of Squared Error = " + str(WSSSE)) +{% endhighlight %} + +Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training Mean Squared +Errors. + diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md new file mode 100644 index 0000000000000..aa22f67b303ce --- /dev/null +++ b/docs/mllib-collaborative-filtering.md @@ -0,0 +1,130 @@ +--- +layout: global +title: MLlib - Collaborative Filtering +--- + +* Table of contents +{:toc} + +# Collaborative Filtering + +[Collaborative filtering](http://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) +is commonly used for recommender systems. These techniques aim to fill in the +missing entries of a user-item association matrix. MLlib currently supports +model-based collaborative filtering, in which users and products are described +by a small set of latent factors that can be used to predict missing entries. +In particular, we implement the [alternating least squares +(ALS)](http://www2.research.att.com/~volinsky/papers/ieeecomputer.pdf) +algorithm to learn these latent factors. The implementation in MLlib has the +following parameters: + +* *numBlocks* is the number of blacks used to parallelize computation (set to -1 to auto-configure). +* *rank* is the number of latent factors in our model. +* *iterations* is the number of iterations to run. +* *lambda* specifies the regularization parameter in ALS. 
+* *implicitPrefs* specifies whether to use the *explicit feedback* ALS variant or one adapted for *implicit feedback* data +* *alpha* is a parameter applicable to the implicit feedback variant of ALS that governs the *baseline* confidence in preference observations + +## Explicit vs Implicit Feedback + +The standard approach to matrix factorization based collaborative filtering treats +the entries in the user-item matrix as *explicit* preferences given by the user to the item. + +It is common in many real-world use cases to only have access to *implicit feedback* +(e.g. views, clicks, purchases, likes, shares etc.). The approach used in MLlib to deal with +such data is taken from +[Collaborative Filtering for Implicit Feedback Datasets](http://www2.research.att.com/~yifanhu/PUB/cf.pdf). +Essentially instead of trying to model the matrix of ratings directly, this approach treats the data as +a combination of binary preferences and *confidence values*. The ratings are then related +to the level of confidence in observed user preferences, rather than explicit ratings given to items. +The model then tries to find latent factors that can be used to predict the expected preference of a user +for an item. + +Available algorithms for collaborative filtering: + +* [ALS](api/mllib/index.html#org.apache.spark.mllib.recommendation.ALS) + + +# Usage in Scala + +Following code snippets can be executed in `spark-shell`. + +In the following example we load rating data. Each row consists of a user, a product and a rating. +We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation +model by measuring the Mean Squared Error of rating prediction. + +{% highlight scala %} +import org.apache.spark.mllib.recommendation.ALS +import org.apache.spark.mllib.recommendation.Rating + +// Load and parse the data +val data = sc.textFile("mllib/data/als/test.data") +val ratings = data.map(_.split(',') match { + case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble) +}) + +// Build the recommendation model using ALS +val numIterations = 20 +val model = ALS.train(ratings, 1, 20, 0.01) + +// Evaluate the model on rating data +val usersProducts = ratings.map{ case Rating(user, product, rate) => (user, product)} +val predictions = model.predict(usersProducts).map{ + case Rating(user, product, rate) => ((user, product), rate) +} +val ratesAndPreds = ratings.map{ + case Rating(user, product, rate) => ((user, product), rate) +}.join(predictions) +val MSE = ratesAndPreds.map{ + case ((user, product), (r1, r2)) => math.pow((r1- r2), 2) +}.reduce(_ + _)/ratesAndPreds.count +println("Mean Squared Error = " + MSE) +{% endhighlight %} + +If the rating matrix is derived from other source of information (i.e., it is inferred from +other signals), you can use the trainImplicit method to get better results. + +{% highlight scala %} +val model = ALS.trainImplicit(ratings, 1, 20, 0.01) +{% endhighlight %} + +# Usage in Java + +All of MLlib's methods use Java-friendly types, so you can import and call them there the same +way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the +Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by +calling `.rdd()` on your `JavaRDD` object. + +# Usage in Python +Following examples can be tested in the PySpark shell. + +In the following example we load rating data. Each row consists of a user, a product and a rating. 
+We use the default ALS.train() method which assumes ratings are explicit. We evaluate the +recommendation by measuring the Mean Squared Error of rating prediction. + +{% highlight python %} +from pyspark.mllib.recommendation import ALS +from numpy import array + +# Load and parse the data +data = sc.textFile("mllib/data/als/test.data") +ratings = data.map(lambda line: array([float(x) for x in line.split(',')])) + +# Build the recommendation model using Alternating Least Squares +model = ALS.train(ratings, 1, 20) + +# Evaluate the model on training data +testdata = ratings.map(lambda p: (int(p[0]), int(p[1]))) +predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) +ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) +MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count() +print("Mean Squared Error = " + str(MSE)) +{% endhighlight %} + +If the rating matrix is derived from other source of information (i.e., it is inferred from other +signals), you can use the trainImplicit method to get better results. + +{% highlight python %} +# Build the recommendation model using Alternating Least Squares based on implicit ratings +model = ALS.trainImplicit(ratings, 1, 20) +{% endhighlight %} diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 0e34da4ec4749..76308ec9c0821 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -3,16 +3,32 @@ layout: global title: Machine Learning Library (MLlib) --- -* Table of contents -{:toc} MLlib is a Spark implementation of some common machine learning (ML) functionality, as well associated tests and data generators. MLlib currently supports four common types of machine learning problem settings, namely, binary classification, regression, clustering and collaborative filtering, as well as an underlying gradient descent optimization primitive. -This guide will outline the functionality supported in MLlib and also provides -an example of invoking MLlib. + +# Available Methods +The following links provide a detailed explanation of the methods and usage examples for each of them: + +* Classification and Regression + * Binary Classification + * SVM (L1 and L2 regularized) + * Logistic Regression (L1 and L2 regularized) + * Linear Regression + * Least Squares + * Lasso + * Ridge Regression +* Clustering + * k-Means +* Collaborative Filtering + * Matrix Factorization using Alternating Least Squares +* Optimization + * Gradient Descent and Stochastic Gradient Descent +* Linear Algebra + * Singular Value Decomposition # Dependencies MLlib uses the [jblas](https://github.com/mikiobraun/jblas) linear algebra library, which itself @@ -24,469 +40,3 @@ detect these libraries automatically. To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer and Python 2.7. -# Binary Classification - -Binary classification is a supervised learning problem in which we want to -classify entities into one of two distinct categories or labels, e.g., -predicting whether or not emails are spam. This problem involves executing a -learning *Algorithm* on a set of *labeled* examples, i.e., a set of entities -represented via (numerical) features along with underlying category labels. -The algorithm returns a trained *Model* that can predict the label for new -entities for which the underlying label is unknown. 
- -MLlib currently supports two standard model families for binary classification, -namely [Linear Support Vector Machines -(SVMs)](http://en.wikipedia.org/wiki/Support_vector_machine) and [Logistic -Regression](http://en.wikipedia.org/wiki/Logistic_regression), along with [L1 -and L2 regularized](http://en.wikipedia.org/wiki/Regularization_(mathematics)) -variants of each model family. The training algorithms all leverage an -underlying gradient descent primitive (described -[below](#gradient-descent-primitive)), and take as input a regularization -parameter (*regParam*) along with various parameters associated with gradient -descent (*stepSize*, *numIterations*, *miniBatchFraction*). - -Available algorithms for binary classification: - -* [SVMWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.SVMWithSGD) -* [LogisticRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithSGD) - -# Linear Regression - -Linear regression is another classical supervised learning setting. In this -problem, each entity is associated with a real-valued label (as opposed to a -binary label as in binary classification), and we want to predict labels as -closely as possible given numerical features representing entities. MLlib -supports linear regression as well as L1 -([lasso](http://en.wikipedia.org/wiki/Lasso_(statistics)#Lasso_method)) and L2 -([ridge](http://en.wikipedia.org/wiki/Ridge_regression)) regularized variants. -The regression algorithms in MLlib also leverage the underlying gradient -descent primitive (described [below](#gradient-descent-primitive)), and have -the same parameters as the binary classification algorithms described above. - -Available algorithms for linear regression: - -* [LinearRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LinearRegressionWithSGD) -* [RidgeRegressionWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) -* [LassoWithSGD](api/mllib/index.html#org.apache.spark.mllib.regression.LassoWithSGD) - -# Clustering - -Clustering is an unsupervised learning problem whereby we aim to group subsets -of entities with one another based on some notion of similarity. Clustering is -often used for exploratory analysis and/or as a component of a hierarchical -supervised learning pipeline (in which distinct classifiers or regression -models are trained for each cluster). MLlib supports -[k-means](http://en.wikipedia.org/wiki/K-means_clustering) clustering, one of -the most commonly used clustering algorithms that clusters the data points into -predfined number of clusters. The MLlib implementation includes a parallelized -variant of the [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) method -called [kmeans||](http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf). -The implementation in MLlib has the following parameters: - -* *k* is the number of desired clusters. -* *maxIterations* is the maximum number of iterations to run. -* *initializationMode* specifies either random initialization or -initialization via k-means\|\|. -* *runs* is the number of times to run the k-means algorithm (k-means is not -guaranteed to find a globally optimal solution, and when run multiple times on -a given dataset, the algorithm returns the best clustering result). -* *initializiationSteps* determines the number of steps in the k-means\|\| algorithm. -* *epsilon* determines the distance threshold within which we consider k-means to have converged. 
- -Available algorithms for clustering: - -* [KMeans](api/mllib/index.html#org.apache.spark.mllib.clustering.KMeans) - -# Collaborative Filtering - -[Collaborative filtering](http://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) -is commonly used for recommender systems. These techniques aim to fill in the -missing entries of a user-item association matrix. MLlib currently supports -model-based collaborative filtering, in which users and products are described -by a small set of latent factors that can be used to predict missing entries. -In particular, we implement the [alternating least squares -(ALS)](http://www2.research.att.com/~volinsky/papers/ieeecomputer.pdf) -algorithm to learn these latent factors. The implementation in MLlib has the -following parameters: - -* *numBlocks* is the number of blacks used to parallelize computation (set to -1 to auto-configure). -* *rank* is the number of latent factors in our model. -* *iterations* is the number of iterations to run. -* *lambda* specifies the regularization parameter in ALS. -* *implicitPrefs* specifies whether to use the *explicit feedback* ALS variant or one adapted for *implicit feedback* data -* *alpha* is a parameter applicable to the implicit feedback variant of ALS that governs the *baseline* confidence in preference observations - -## Explicit vs Implicit Feedback - -The standard approach to matrix factorization based collaborative filtering treats -the entries in the user-item matrix as *explicit* preferences given by the user to the item. - -It is common in many real-world use cases to only have access to *implicit feedback* -(e.g. views, clicks, purchases, likes, shares etc.). The approach used in MLlib to deal with -such data is taken from -[Collaborative Filtering for Implicit Feedback Datasets](http://www2.research.att.com/~yifanhu/PUB/cf.pdf). -Essentially instead of trying to model the matrix of ratings directly, this approach treats the data as -a combination of binary preferences and *confidence values*. The ratings are then related -to the level of confidence in observed user preferences, rather than explicit ratings given to items. -The model then tries to find latent factors that can be used to predict the expected preference of a user -for an item. - -Available algorithms for collaborative filtering: - -* [ALS](api/mllib/index.html#org.apache.spark.mllib.recommendation.ALS) - -# Gradient Descent Primitive - -[Gradient descent](http://en.wikipedia.org/wiki/Gradient_descent) (along with -stochastic variants thereof) are first-order optimization methods that are -well-suited for large-scale and distributed computation. Gradient descent -methods aim to find a local minimum of a function by iteratively taking steps -in the direction of the negative gradient of the function at the current point, -i.e., the current parameter value. Gradient descent is included as a low-level -primitive in MLlib, upon which various ML algorithms are developed, and has the -following parameters: - -* *gradient* is a class that computes the stochastic gradient of the function -being optimized, i.e., with respect to a single training example, at the -current parameter value. MLlib includes gradient classes for common loss -functions, e.g., hinge, logistic, least-squares. The gradient class takes as -input a training example, its label, and the current parameter value. -* *updater* is a class that updates weights in each iteration of gradient -descent. 
MLlib includes updaters for cases without regularization, as well as -L1 and L2 regularizers. -* *stepSize* is a scalar value denoting the initial step size for gradient -descent. All updaters in MLlib use a step size at the t-th step equal to -stepSize / sqrt(t). -* *numIterations* is the number of iterations to run. -* *regParam* is the regularization parameter when using L1 or L2 regularization. -* *miniBatchFraction* is the fraction of the data used to compute the gradient -at each iteration. - -Available algorithms for gradient descent: - -* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent) - -# Using MLLib in Scala - -Following code snippets can be executed in `spark-shell`. - -## Binary Classification - -The following code snippet illustrates how to load a sample dataset, execute a -training algorithm on this training data using a static method in the algorithm -object, and make predictions with the resulting model to compute the training -error. - -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.classification.SVMWithSGD -import org.apache.spark.mllib.regression.LabeledPoint - -// Load and parse the data file -val data = sc.textFile("mllib/data/sample_svm_data.txt") -val parsedData = data.map { line => - val parts = line.split(' ') - LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray) -} - -// Run training algorithm to build the model -val numIterations = 20 -val model = SVMWithSGD.train(parsedData, numIterations) - -// Evaluate model on training examples and compute training error -val labelAndPreds = parsedData.map { point => - val prediction = model.predict(point.features) - (point.label, prediction) -} -val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count -println("Training Error = " + trainErr) -{% endhighlight %} - - -The `SVMWithSGD.train()` method by default performs L2 regularization with the -regularization parameter set to 1.0. If we want to configure this algorithm, we -can customize `SVMWithSGD` further by creating a new object directly and -calling setter methods. All other MLlib algorithms support customization in -this way as well. For example, the following code produces an L1 regularized -variant of SVMs with regularization parameter set to 0.1, and runs the training -algorithm for 200 iterations. - -{% highlight scala %} -import org.apache.spark.mllib.optimization.L1Updater - -val svmAlg = new SVMWithSGD() -svmAlg.optimizer.setNumIterations(200) - .setRegParam(0.1) - .setUpdater(new L1Updater) -val modelL1 = svmAlg.run(parsedData) -{% endhighlight %} - -## Linear Regression -The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint. The -example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. 
We -compute the Mean Squared Error at the end to evaluate -[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit) - -{% highlight scala %} -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.regression.LabeledPoint - -// Load and parse the data -val data = sc.textFile("mllib/data/ridge-data/lpsa.data") -val parsedData = data.map { line => - val parts = line.split(',') - LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray) -} - -// Building the model -val numIterations = 20 -val model = LinearRegressionWithSGD.train(parsedData, numIterations) - -// Evaluate model on training examples and compute training error -val valuesAndPreds = parsedData.map { point => - val prediction = model.predict(point.features) - (point.label, prediction) -} -val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count -println("training Mean Squared Error = " + MSE) -{% endhighlight %} - - -Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training -[Mean Squared Errors](http://en.wikipedia.org/wiki/Mean_squared_error). - -## Clustering -In the following example after loading and parsing data, we use the KMeans object to cluster the data -into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within -Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the -optimal *k* is usually one where there is an "elbow" in the WSSSE graph. - -{% highlight scala %} -import org.apache.spark.mllib.clustering.KMeans - -// Load and parse the data -val data = sc.textFile("kmeans_data.txt") -val parsedData = data.map( _.split(' ').map(_.toDouble)) - -// Cluster the data into two classes using KMeans -val numIterations = 20 -val numClusters = 2 -val clusters = KMeans.train(parsedData, numClusters, numIterations) - -// Evaluate clustering by computing Within Set Sum of Squared Errors -val WSSSE = clusters.computeCost(parsedData) -println("Within Set Sum of Squared Errors = " + WSSSE) -{% endhighlight %} - - -## Collaborative Filtering -In the following example we load rating data. Each row consists of a user, a product and a rating. -We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation -model by measuring the Mean Squared Error of rating prediction. 
- -{% highlight scala %} -import org.apache.spark.mllib.recommendation.ALS -import org.apache.spark.mllib.recommendation.Rating - -// Load and parse the data -val data = sc.textFile("mllib/data/als/test.data") -val ratings = data.map(_.split(',') match { - case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble) -}) - -// Build the recommendation model using ALS -val numIterations = 20 -val model = ALS.train(ratings, 1, 20, 0.01) - -// Evaluate the model on rating data -val usersProducts = ratings.map{ case Rating(user, product, rate) => (user, product)} -val predictions = model.predict(usersProducts).map{ - case Rating(user, product, rate) => ((user, product), rate) -} -val ratesAndPreds = ratings.map{ - case Rating(user, product, rate) => ((user, product), rate) -}.join(predictions) -val MSE = ratesAndPreds.map{ - case ((user, product), (r1, r2)) => math.pow((r1- r2), 2) -}.reduce(_ + _)/ratesAndPreds.count -println("Mean Squared Error = " + MSE) -{% endhighlight %} - -If the rating matrix is derived from other source of information (i.e., it is inferred from -other signals), you can use the trainImplicit method to get better results. - -{% highlight scala %} -val model = ALS.trainImplicit(ratings, 1, 20, 0.01) -{% endhighlight %} - -# Using MLLib in Java - -All of MLlib's methods use Java-friendly types, so you can import and call them there the same -way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the -Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by -calling `.rdd()` on your `JavaRDD` object. - -# Using MLLib in Python -Following examples can be tested in the PySpark shell. - -## Binary Classification -The following example shows how to load a sample dataset, build Logistic Regression model, -and make predictions with the resulting model to compute the training error. - -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithSGD -from numpy import array - -# Load and parse the data -data = sc.textFile("mllib/data/sample_svm_data.txt") -parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) -model = LogisticRegressionWithSGD.train(parsedData) - -# Build the model -labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)), - model.predict(point.take(range(1, point.size))))) - -# Evaluating the model on training data -trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count()) -print("Training Error = " + str(trainErr)) -{% endhighlight %} - -## Linear Regression -The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint. The -example then uses LinearRegressionWithSGD to build a simple linear model to predict label values. 
We -compute the Mean Squared Error at the end to evaluate -[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit) - -{% highlight python %} -from pyspark.mllib.regression import LinearRegressionWithSGD -from numpy import array - -# Load and parse the data -data = sc.textFile("mllib/data/ridge-data/lpsa.data") -parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')])) - -# Build the model -model = LinearRegressionWithSGD.train(parsedData) - -# Evaluate the model on training data -valuesAndPreds = parsedData.map(lambda point: (point.item(0), - model.predict(point.take(range(1, point.size))))) -MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count() -print("Mean Squared Error = " + str(MSE)) -{% endhighlight %} - - -## Clustering -In the following example after loading and parsing data, we use the KMeans object to cluster the data -into two clusters. The number of desired clusters is passed to the algorithm. We then compute Within -Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasing *k*. In fact the -optimal *k* is usually one where there is an "elbow" in the WSSSE graph. - -{% highlight python %} -from pyspark.mllib.clustering import KMeans -from numpy import array -from math import sqrt - -# Load and parse the data -data = sc.textFile("kmeans_data.txt") -parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) - -# Build the model (cluster the data) -clusters = KMeans.train(parsedData, 2, maxIterations=10, - runs=30, initialization_mode="random") - -# Evaluate clustering by computing Within Set Sum of Squared Errors -def error(point): - center = clusters.centers[clusters.predict(point)] - return sqrt(sum([x**2 for x in (point - center)])) - -WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) -print("Within Set Sum of Squared Error = " + str(WSSSE)) -{% endhighlight %} - -Similarly you can use RidgeRegressionWithSGD and LassoWithSGD and compare training Mean Squared -Errors. - -## Collaborative Filtering -In the following example we load rating data. Each row consists of a user, a product and a rating. -We use the default ALS.train() method which assumes ratings are explicit. We evaluate the -recommendation by measuring the Mean Squared Error of rating prediction. - -{% highlight python %} -from pyspark.mllib.recommendation import ALS -from numpy import array - -# Load and parse the data -data = sc.textFile("mllib/data/als/test.data") -ratings = data.map(lambda line: array([float(x) for x in line.split(',')])) - -# Build the recommendation model using Alternating Least Squares -model = ALS.train(ratings, 1, 20) - -# Evaluate the model on training data -testdata = ratings.map(lambda p: (int(p[0]), int(p[1]))) -predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) -ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) -MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count() -print("Mean Squared Error = " + str(MSE)) -{% endhighlight %} - -If the rating matrix is derived from other source of information (i.e., it is inferred from other -signals), you can use the trainImplicit method to get better results. 
- -{% highlight python %} -# Build the recommendation model using Alternating Least Squares based on implicit ratings -model = ALS.trainImplicit(ratings, 1, 20) -{% endhighlight %} - - -# Singular Value Decomposition -Singular Value Decomposition for Tall and Skinny matrices. -Given an *m x n* matrix *A*, we can compute matrices *U, S, V* such that - -*A = U * S * V^T* - -There is no restriction on m, but we require n^2 doubles to -fit in memory locally on one machine. -Further, n should be less than m. - -The decomposition is computed by first computing *A^TA = V S^2 V^T*, -computing SVD locally on that (since n x n is small), -from which we recover S and V. -Then we compute U via easy matrix multiplication -as *U = A * V * S^-1* - -Only singular vectors associated with largest k singular values -are recovered. If there are k -such values, then the dimensions of the return will be: - -* *S* is *k x k* and diagonal, holding the singular values on diagonal. -* *U* is *m x k* and satisfies U^T*U = eye(k). -* *V* is *n x k* and satisfies V^TV = eye(k). - -All input and output is expected in sparse matrix format, 0-indexed -as tuples of the form ((i,j),value) all in -SparseMatrix RDDs. Below is example usage. - -{% highlight scala %} - -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg.SVD -import org.apache.spark.mllib.linalg.SparseMatrix -import org.apache.spark.mllib.linalg.MatrixEntry - -// Load and parse the data file -val data = sc.textFile("mllib/data/als/test.data").map { line => - val parts = line.split(',') - MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble) -} -val m = 4 -val n = 4 -val k = 1 - -// recover largest singular vector -val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k) -val = decomposed.S.data - -println("singular values = " + s.toArray.mkString) -{% endhighlight %} \ No newline at end of file diff --git a/docs/mllib-linear-algebra.md b/docs/mllib-linear-algebra.md new file mode 100644 index 0000000000000..cc203d833d344 --- /dev/null +++ b/docs/mllib-linear-algebra.md @@ -0,0 +1,61 @@ +--- +layout: global +title: MLlib - Linear Algebra +--- + +* Table of contents +{:toc} + + +# Singular Value Decomposition +Singular Value `Decomposition` for Tall and Skinny matrices. +Given an `$m \times n$` matrix `$A$`, we can compute matrices `$U,S,V$` such that + +`\[ + A = U \cdot S \cdot V^T + \]` + +There is no restriction on m, but we require n^2 doubles to +fit in memory locally on one machine. +Further, n should be less than m. + +The decomposition is computed by first computing `$A^TA = V S^2 V^T$`, +computing SVD locally on that (since `$n \times n$` is small), +from which we recover `$S$` and `$V$`. +Then we compute U via easy matrix multiplication +as `$U = A \cdot V \cdot S^{-1}$`. + +Only singular vectors associated with largest k singular values +are recovered. If there are k +such values, then the dimensions of the return will be: + +* `$S$` is `$k \times k$` and diagonal, holding the singular values on diagonal. +* `$U$` is `$m \times k$` and satisfies `$U^T U = \mathop{eye}(k)$`. +* `$V$` is `$n \times k$` and satisfies `$V^T V = \mathop{eye}(k)$`. + +All input and output is expected in sparse matrix format, 0-indexed +as tuples of the form ((i,j),value) all in +SparseMatrix RDDs. Below is example usage. 
+ +{% highlight scala %} + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.SVD +import org.apache.spark.mllib.linalg.SparseMatrix +import org.apache.spark.mllib.linalg.MatrixEntry + +// Load and parse the data file +val data = sc.textFile("mllib/data/als/test.data").map { line => + val parts = line.split(',') + MatrixEntry(parts(0).toInt, parts(1).toInt, parts(2).toDouble) +} +val m = 4 +val n = 4 +val k = 1 + +// recover largest singular vector +val decomposed = SVD.sparseSVD(SparseMatrix(data, m, n), k) +val = decomposed.S.data + +println("singular values = " + s.toArray.mkString) +{% endhighlight %} diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md new file mode 100644 index 0000000000000..428284ef29fd7 --- /dev/null +++ b/docs/mllib-optimization.md @@ -0,0 +1,40 @@ +--- +layout: global +title: MLlib - Optimization +--- + +* Table of contents +{:toc} + + +# Gradient Descent Primitive + +[Gradient descent](http://en.wikipedia.org/wiki/Gradient_descent) (along with +stochastic variants thereof) are first-order optimization methods that are +well-suited for large-scale and distributed computation. Gradient descent +methods aim to find a local minimum of a function by iteratively taking steps +in the direction of the negative gradient of the function at the current point, +i.e., the current parameter value. Gradient descent is included as a low-level +primitive in MLlib, upon which various ML algorithms are developed, and has the +following parameters: + +* *gradient* is a class that computes the stochastic gradient of the function +being optimized, i.e., with respect to a single training example, at the +current parameter value. MLlib includes gradient classes for common loss +functions, e.g., hinge, logistic, least-squares. The gradient class takes as +input a training example, its label, and the current parameter value. +* *updater* is a class that updates weights in each iteration of gradient +descent. MLlib includes updaters for cases without regularization, as well as +L1 and L2 regularizers. +* *stepSize* is a scalar value denoting the initial step size for gradient +descent. All updaters in MLlib use a step size at the t-th step equal to +stepSize / sqrt(t). +* *numIterations* is the number of iterations to run. +* *regParam* is the regularization parameter when using L1 or L2 regularization. +* *miniBatchFraction* is the fraction of the data used to compute the gradient +at each iteration. + +Available algorithms for gradient descent: + +* [GradientDescent](api/mllib/index.html#org.apache.spark.mllib.optimization.GradientDescent) + From 78050805bc691a00788f6e51f23dd785ca25b227 Mon Sep 17 00:00:00 2001 From: Jey Kottalam Date: Sat, 8 Feb 2014 12:24:08 -0800 Subject: [PATCH 002/127] Merge pull request #454 from jey/atomic-sbt-download. Closes #454. Make sbt download an atomic operation Modifies the `sbt/sbt` script to gracefully recover when a previous invocation died in the middle of downloading the SBT jar. Author: Jey Kottalam == Merge branch commits == commit 6c600eb434a2f3e7d70b67831aeebde9b5c0f43b Author: Jey Kottalam Date: Fri Jan 17 10:43:54 2014 -0800 Make sbt download an atomic operation --- sbt/sbt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sbt/sbt b/sbt/sbt index 62ead8a69dbf6..8472dce589bcc 100755 --- a/sbt/sbt +++ b/sbt/sbt @@ -29,10 +29,11 @@ JAR=sbt/sbt-launch-${SBT_VERSION}.jar if [ ! 
-f ${JAR} ]; then # Download printf "Attempting to fetch sbt\n" + JAR_DL=${JAR}.part if hash curl 2>/dev/null; then - curl --progress-bar ${URL1} > ${JAR} || curl --progress-bar ${URL2} > ${JAR} + (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} elif hash wget 2>/dev/null; then - wget --progress=bar ${URL1} -O ${JAR} || wget --progress=bar ${URL2} -O ${JAR} + (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" exit -1 From f0ce736fadbcb7642b6148ad740f4508cd7dcd4d Mon Sep 17 00:00:00 2001 From: Qiuzhuang Lian Date: Sat, 8 Feb 2014 12:59:48 -0800 Subject: [PATCH 003/127] Merge pull request #561 from Qiuzhuang/master. Closes #561. Kill drivers in postStop() for Worker. JIRA SPARK-1068:https://spark-project.atlassian.net/browse/SPARK-1068 Author: Qiuzhuang Lian == Merge branch commits == commit 9c19ce63637eee9369edd235979288d3d9fc9105 Author: Qiuzhuang Lian Date: Sat Feb 8 16:07:39 2014 +0800 Kill drivers in postStop() for Worker. JIRA SPARK-1068:https://spark-project.atlassian.net/browse/SPARK-1068 --- core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala old mode 100644 new mode 100755 index fbf2e0f30fde9..f4ee0e2343849 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -325,6 +325,7 @@ private[spark] class Worker( override def postStop() { executors.values.foreach(_.kill()) + drivers.values.foreach(_.kill()) webUi.stop() metricsSystem.stop() } From c2341c92bb206938fd9b18e2a714e5c6de55b06d Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Sat, 8 Feb 2014 16:00:43 -0800 Subject: [PATCH 004/127] Merge pull request #542 from markhamstra/versionBump. Closes #542. Version number to 1.0.0-SNAPSHOT Since 0.9.0-incubating is done and out the door, we shouldn't be building 0.9.0-incubating-SNAPSHOT anymore. 
@pwendell Author: Mark Hamstra == Merge branch commits == commit 1b00a8a7c1a7f251b4bb3774b84b9e64758eaa71 Author: Mark Hamstra Date: Wed Feb 5 09:30:32 2014 -0800 Version number to 1.0.0-SNAPSHOT --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- project/SparkBuild.scala | 2 +- python/pyspark/shell.py | 2 +- repl/pom.xml | 2 +- .../src/main/scala/org/apache/spark/repl/SparkILoopInit.scala | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/alpha/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index dacae92d83a8e..82396040251d3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index cb8e79f22535b..6155ab5f20155 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 9e5a450d57a47..dbcde0f434f94 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 60dfb2a9dee85..98784866ce7d2 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -3,8 +3,8 @@ markdown: kramdown # These allow the documentation to be updated with nerw releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 0.9.0-incubating-SNAPSHOT -SPARK_VERSION_SHORT: 0.9.0 +SPARK_VERSION: 1.0.0-incubating-SNAPSHOT +SPARK_VERSION_SHORT: 1.0.0 SCALA_VERSION: "2.10" MESOS_VERSION: 0.13.0 SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net diff --git a/examples/pom.xml b/examples/pom.xml index 7855706389709..874bbd803fe5f 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 443910a03a94e..cdb9bef8e2dc6 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 23b2fead657e6..630bbed5eb006 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 31b4fa87de772..eff3e7809b786 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 216e6c1d8ff44..cde495cac4f97 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index c240d595742cf..04f9fbfd5bf01 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index baa240aff20c3..4823ed1d4eaec 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index dda3900afebdf..9a61d7c3e46c0 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 1ac8f0fa079e0..f94685652ad61 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT pom Spark Project Parent POM http://spark.incubator.apache.org/ diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e33f230188fc7..11a937e011718 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -120,7 +120,7 @@ object SparkBuild extends Build { def sharedSettings = Defaults.defaultSettings ++ Seq( organization := "org.apache.spark", - version := "0.9.0-incubating-SNAPSHOT", + version := "1.0.0-incubating-SNAPSHOT", scalaVersion := "2.10.3", scalacOptions := Seq("-Xmax-classfile-name", "120", "-unchecked", "-deprecation", "-target:" + SCALAC_JVM_VERSION), diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 1602227a273e7..3d779faf1fa44 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -35,7 +35,7 @@ ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT + /__ / .__/\_,_/_/ /_/\_\ version 1.0.0-SNAPSHOT /_/ """ print "Using 
Python version %s (%s, %s)" % ( diff --git a/repl/pom.xml b/repl/pom.xml index 2dfe7ac900b83..143b009f3c2a3 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index 21b1ba305d110..efe45240e9b2e 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -24,7 +24,7 @@ trait SparkILoopInit { ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /___/ .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT + /___/ .__/\_,_/_/ /_/\_\ version 1.0.0-SNAPSHOT /_/ """) import Properties._ diff --git a/streaming/pom.xml b/streaming/pom.xml index 459756912dbe5..d31ee60e4ea15 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 28f5ef14b1a35..a27f0db6e5628 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/alpha/pom.xml b/yarn/alpha/pom.xml index 8291e9e7a36ce..b026128980cb8 100644 --- a/yarn/alpha/pom.xml +++ b/yarn/alpha/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index aea8b0cddefa2..e7eba36ba351b 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index 62fe3e274250f..7c312206d16d3 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 0.9.0-incubating-SNAPSHOT + 1.0.0-incubating-SNAPSHOT ../pom.xml From f892da8716d614467fddcc3a1b2b589979414219 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 8 Feb 2014 23:13:34 -0800 Subject: [PATCH 005/127] Merge pull request #565 from pwendell/dev-scripts. Closes #565. SPARK-1066: Add developer scripts to repository. These are some developer scripts I've been maintaining in a separate public repo. This patch adds them to the Spark repository so they can evolve here and are clearly accessible to all committers. I may do some small additional clean-up in this PR, but wanted to put them here in case others want to review. There are a few types of scripts here: 1. A tool to merge pull requests. 2. A script for packaging releases. 3. A script for auditing release candidates. Author: Patrick Wendell == Merge branch commits == commit 5d5d331d01f6fd59c2eb830f652955119b012173 Author: Patrick Wendell Date: Sat Feb 8 22:11:47 2014 -0800 SPARK-1066: Add developer scripts to repository. 
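For orientation, a rough sketch of how these scripts are meant to be driven, pieced together from the usage notes in their own headers; the paths and environment values below are placeholders rather than part of this patch:

    # Audit a release candidate, after editing the "Fill in release details here" block at the top of the script
    python dev/audit-release/audit_release.py

    # Cut a release; the script expects to be run from a totally empty scratch directory
    mkdir /tmp/spark-release && cd /tmp/spark-release && ~/spark/dev/create-release/create-release.sh

    # Merge a pull request; the tool prompts for the PR number and reads its configuration from the environment
    SPARK_HOME=~/spark PR_REMOTE_NAME=apache-github PUSH_REMOTE_NAME=apache ./dev/merge_spark_pr.py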
--- dev/README.md | 5 + dev/audit-release/.gitignore | 2 + dev/audit-release/audit_release.py | 227 ++++++++++++++++++ dev/audit-release/blank_maven_build/pom.xml | 47 ++++ dev/audit-release/blank_sbt_build/build.sbt | 29 +++ dev/audit-release/maven_app_core/input.txt | 8 + dev/audit-release/maven_app_core/pom.xml | 56 +++++ .../src/main/java/SimpleApp.java | 41 ++++ dev/audit-release/sbt_app_core/build.sbt | 29 +++ dev/audit-release/sbt_app_core/input.txt | 8 + .../src/main/scala/SparkApp.scala | 36 +++ dev/audit-release/sbt_app_graphx/build.sbt | 29 +++ .../src/main/scala/GraphxApp.scala | 47 ++++ dev/audit-release/sbt_app_streaming/build.sbt | 29 +++ .../src/main/scala/StreamingApp.scala | 62 +++++ dev/create-release/create-release.sh | 132 ++++++++++ dev/merge_spark_pr.py | 197 +++++++++++++++ 17 files changed, 984 insertions(+) create mode 100644 dev/README.md create mode 100644 dev/audit-release/.gitignore create mode 100755 dev/audit-release/audit_release.py create mode 100644 dev/audit-release/blank_maven_build/pom.xml create mode 100644 dev/audit-release/blank_sbt_build/build.sbt create mode 100644 dev/audit-release/maven_app_core/input.txt create mode 100644 dev/audit-release/maven_app_core/pom.xml create mode 100644 dev/audit-release/maven_app_core/src/main/java/SimpleApp.java create mode 100644 dev/audit-release/sbt_app_core/build.sbt create mode 100644 dev/audit-release/sbt_app_core/input.txt create mode 100644 dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala create mode 100644 dev/audit-release/sbt_app_graphx/build.sbt create mode 100644 dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala create mode 100644 dev/audit-release/sbt_app_streaming/build.sbt create mode 100644 dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala create mode 100755 dev/create-release/create-release.sh create mode 100755 dev/merge_spark_pr.py diff --git a/dev/README.md b/dev/README.md new file mode 100644 index 0000000000000..2b0f3d8ee8924 --- /dev/null +++ b/dev/README.md @@ -0,0 +1,5 @@ +# Spark Developer Scripts +This directory contains scripts useful to developers when packaging, +testing, or committing to Spark. + +Many of these scripts require Apache credentials to work correctly. diff --git a/dev/audit-release/.gitignore b/dev/audit-release/.gitignore new file mode 100644 index 0000000000000..7e057a92b3c46 --- /dev/null +++ b/dev/audit-release/.gitignore @@ -0,0 +1,2 @@ +project/ +spark_audit* diff --git a/dev/audit-release/audit_release.py b/dev/audit-release/audit_release.py new file mode 100755 index 0000000000000..4408658f5e33f --- /dev/null +++ b/dev/audit-release/audit_release.py @@ -0,0 +1,227 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Audits binary and maven artifacts for a Spark release. +# Requires GPG and Maven. +# usage: +# python audit_release.py + +import os +import re +import shutil +import subprocess +import sys +import time +import urllib2 + +## Fill in release details here: +RELEASE_URL = "http://people.apache.org/~pwendell/spark-0.9.0-incubating-rc5/" +RELEASE_KEY = "9E4FE3AF" +RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1006/" +RELEASE_VERSION = "0.9.0-incubating" +SCALA_VERSION = "2.10.3" +SCALA_BINARY_VERSION = "2.10" +## + +LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") +LOG_FILE = open(LOG_FILE_NAME, 'w') +WORK_DIR = "/tmp/audit_%s" % int(time.time()) +MAVEN_CMD = "mvn" +GPG_CMD = "gpg" + +print "Starting tests, log output in %s. Test results printed below:" % LOG_FILE_NAME + +# Track failures +failures = [] + +def clean_work_files(): + print "OK to delete scratch directory '%s'? (y/N): " % WORK_DIR + response = raw_input() + if response == "y": + shutil.rmtree(WORK_DIR) + print "Should I delete the log output file '%s'? (y/N): " % LOG_FILE_NAME + response = raw_input() + if response == "y": + os.unlink(LOG_FILE_NAME) + +def run_cmd(cmd, exit_on_failure=True): + print >> LOG_FILE, "Running command: %s" % cmd + ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) + if ret != 0 and exit_on_failure: + print "Command failed: %s" % cmd + clean_work_files() + sys.exit(-1) + return ret + +def run_cmd_with_output(cmd): + print >> sys.stderr, "Running command: %s" % cmd + return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) + +def test(bool, str): + if bool: + return passed(str) + failed(str) + +def passed(str): + print "[PASSED] %s" % str + +def failed(str): + failures.append(str) + print "[**FAILED**] %s" % str + +def get_url(url): + return urllib2.urlopen(url).read() + +original_dir = os.getcwd() + +# For each of these modules, we'll test an 'empty' application in sbt and +# maven that links against them. This will catch issues with messed up +# dependencies within those projects. +modules = ["spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", + "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", + "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq"] +modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) + +# Check for directories that might interfere with tests +local_ivy_spark = "~/.ivy2/local/org.apache.spark" +cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" +local_maven_kafka = "~/.m2/repository/org/apache/kafka" +local_maven_kafka = "~/.m2/repository/org/apache/spark" +def ensure_path_not_present(x): + if os.path.exists(os.path.expanduser(x)): + print "Please remove %s, it can interfere with testing published artifacts." 
% x + sys.exit(-1) +map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) + +# SBT build tests +os.chdir("blank_sbt_build") +os.environ["SPARK_VERSION"] = RELEASE_VERSION +os.environ["SCALA_VERSION"] = SCALA_VERSION +os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY +for module in modules: + os.environ["SPARK_MODULE"] = module + ret = run_cmd("sbt clean update", exit_on_failure=False) + test(ret == 0, "sbt build against '%s' module" % module) +os.chdir(original_dir) + +# SBT application tests +for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming"]: + os.chdir(app) + ret = run_cmd("sbt clean run", exit_on_failure=False) + test(ret == 0, "sbt application (%s)" % app) + os.chdir(original_dir) + +# Maven build tests +os.chdir("blank_maven_build") +for module in modules: + cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + '-Dspark.module="%s" clean compile' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) + ret = run_cmd(cmd, exit_on_failure=False) + test(ret == 0, "maven build against '%s' module" % module) +os.chdir(original_dir) + +os.chdir("maven_app_core") +mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + '-Dscala.binary.version="%s" clean compile ' + 'exec:java -Dexec.mainClass="SimpleApp"' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) +ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) +test(ret == 0, "maven application (core)") +os.chdir(original_dir) + +# Binary artifact tests +if os.path.exists(WORK_DIR): + print "Working directory '%s' already exists" % WORK_DIR + sys.exit(-1) +os.mkdir(WORK_DIR) +os.chdir(WORK_DIR) + +index_page = get_url(RELEASE_URL) +artifact_regex = r = re.compile("") +artifacts = r.findall(index_page) + +for artifact in artifacts: + print "==== Verifying download integrity for artifact: %s ====" % artifact + + artifact_url = "%s/%s" % (RELEASE_URL, artifact) + run_cmd("wget %s" % artifact_url) + + key_file = "%s.asc" % artifact + run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) + + run_cmd("wget %s%s" % (artifact_url, ".sha")) + + # Verify signature + run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) + run_cmd("%s %s" % (GPG_CMD, key_file)) + passed("Artifact signature verified.") + + # Verify md5 + my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() + release_md5 = get_url("%s.md5" % artifact_url).strip() + test(my_md5 == release_md5, "Artifact MD5 verified.") + + # Verify sha + my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() + release_sha = get_url("%s.sha" % artifact_url).strip() + test(my_sha == release_sha, "Artifact SHA verified.") + + # Verify Apache required files + dir_name = artifact.replace(".tgz", "") + run_cmd("tar xvzf %s" % artifact) + base_files = os.listdir(dir_name) + test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") + test("NOTICE" in base_files, "Tarball contains NOTICE file") + test("LICENSE" in base_files, "Tarball contains LICENSE file") + + os.chdir(os.path.join(WORK_DIR, dir_name)) + readme = "".join(open("README.md").readlines()) + disclaimer_part = "is an effort undergoing incubation" + test(disclaimer_part in readme, "README file contains disclaimer") + os.chdir(WORK_DIR) + +for artifact in artifacts: + print "==== Verifying build and tests for artifact: %s ====" % artifact + os.chdir(os.path.join(WORK_DIR, dir_name)) + + 
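+  # The full build and test run below needs extra heap and PermGen, so bump MAVEN_OPTS first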
os.environ["MAVEN_OPTS"] = "-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g" + # Verify build + print "==> Running build" + run_cmd("sbt assembly") + passed("sbt build successful") + run_cmd("%s package -DskipTests" % MAVEN_CMD) + passed("Maven build successful") + + # Verify tests + print "==> Performing unit tests" + run_cmd("%s test" % MAVEN_CMD) + passed("Tests successful") + os.chdir(WORK_DIR) + +clean_work_files() + +if len(failures) == 0: + print "ALL TESTS PASSED" +else: + print "SOME TESTS DID NOT PASS" + for f in failures: + print f + +os.chdir(original_dir) diff --git a/dev/audit-release/blank_maven_build/pom.xml b/dev/audit-release/blank_maven_build/pom.xml new file mode 100644 index 0000000000000..047659e4a8b7c --- /dev/null +++ b/dev/audit-release/blank_maven_build/pom.xml @@ -0,0 +1,47 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Spark Release Auditor + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Akka repository + http://repo.akka.io/releases + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + ${spark.module} + ${spark.version} + + + diff --git a/dev/audit-release/blank_sbt_build/build.sbt b/dev/audit-release/blank_sbt_build/build.sbt new file mode 100644 index 0000000000000..1cf52743f27f4 --- /dev/null +++ b/dev/audit-release/blank_sbt_build/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Spark Release Auditor" + +version := "1.0" + +scalaVersion := "2.9.3" + +libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Akka Repository" at "http://repo.akka.io/releases/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/maven_app_core/input.txt b/dev/audit-release/maven_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/audit-release/maven_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/audit-release/maven_app_core/pom.xml b/dev/audit-release/maven_app_core/pom.xml new file mode 100644 index 0000000000000..0b837c01751fe --- /dev/null +++ b/dev/audit-release/maven_app_core/pom.xml @@ -0,0 +1,56 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Simple Project + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Akka repository + http://repo.akka.io/releases + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + + + + maven-compiler-plugin + 2.3.2 + + + + diff --git a/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java new file mode 100644 index 0000000000000..6b65dda39b1a2 --- /dev/null +++ b/dev/audit-release/maven_app_core/src/main/java/SimpleApp.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; + +public class SimpleApp { + public static void main(String[] args) { + String logFile = "input.txt"; + JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); + JavaRDD logData = sc.textFile(logFile).cache(); + + long numAs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("a"); } + }).count(); + + long numBs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("b"); } + }).count(); + + if (numAs != 2 || numBs != 2) { + System.out.println("Failed to parse log files with Spark"); + System.exit(-1); + } + System.out.println("Test succeeded"); + } +} diff --git a/dev/audit-release/sbt_app_core/build.sbt b/dev/audit-release/sbt_app_core/build.sbt new file mode 100644 index 0000000000000..97a8cc3a4e095 --- /dev/null +++ b/dev/audit-release/sbt_app_core/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Akka Repository" at "http://repo.akka.io/releases/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_core/input.txt b/dev/audit-release/sbt_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/audit-release/sbt_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..d49de8b73a856 --- /dev/null +++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main.scala + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val logFile = "input.txt" + val sc = new SparkContext("local", "Simple App") + val logData = sc.textFile(logFile, 2).cache() + val numAs = logData.filter(line => line.contains("a")).count() + val numBs = logData.filter(line => line.contains("b")).count() + if (numAs != 2 || numBs != 2) { + println("Failed to parse log files with Spark") + System.exit(-1) + } + println("Test succeeded") + } +} diff --git a/dev/audit-release/sbt_app_graphx/build.sbt b/dev/audit-release/sbt_app_graphx/build.sbt new file mode 100644 index 0000000000000..66f2db357d49b --- /dev/null +++ b/dev/audit-release/sbt_app_graphx/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Akka Repository" at "http://repo.akka.io/releases/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala new file mode 100644 index 0000000000000..da08e014ebd94 --- /dev/null +++ b/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main.scala + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD + +object GraphXApp { + def main(args: Array[String]) { + val sc = new SparkContext("local", "Simple GraphX App") + val users: RDD[(VertexId, (String, String))] = + sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), + (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), + (4L, ("peter", "student")))) + val relationships: RDD[Edge[String]] = + sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), + Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), + Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) + val defaultUser = ("John Doe", "Missing") + val graph = Graph(users, relationships, defaultUser) + // Notice that there is a user 0 (for which we have no information) connected to users + // 4 (peter) and 5 (franklin). + val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect + if (!triplets.exists(_ == ("peter", "John Doe"))) { + println("Failed to run GraphX") + System.exit(-1) + } + println("Test succeeded") + } +} diff --git a/dev/audit-release/sbt_app_streaming/build.sbt b/dev/audit-release/sbt_app_streaming/build.sbt new file mode 100644 index 0000000000000..492e5e7c8d763 --- /dev/null +++ b/dev/audit-release/sbt_app_streaming/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Akka Repository" at "http://repo.akka.io/releases/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala new file mode 100644 index 0000000000000..3d0722d2ac45e --- /dev/null +++ b/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming._ + +object SparkStreamingExample { + + def main(args: Array[String]) { + val conf = new SparkConf(true) + .setMaster("local[2]") + .setAppName("Streaming test") + val ssc = new StreamingContext(conf, Seconds(1)) + val seen = ListBuffer[RDD[Int]]() + + val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) + val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) + val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) + + val queue = Queue(rdd1, rdd2, rdd3) + val stream = ssc.queueStream(queue) + + stream.foreachRDD(rdd => seen += rdd) + ssc.start() + Thread.sleep(5000) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) + test(rddCounts.length == 3, "Did not collect three RDD's from stream") + test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") + + println("Test succeeded") + + ssc.stop() + } +} diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh new file mode 100755 index 0000000000000..7cebace5069f8 --- /dev/null +++ b/dev/create-release/create-release.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Quick-and-dirty automation of making maven and binary releases. Not robust at all. +# Publishes releases to Maven and packages/copies binary release artifacts. +# Expects to be run in a totally empty directory. 
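+# Configure the release (git branch, version, RC name, and credentials) by editing the variables below.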
+# +# Would be nice to add: +# - Send output to stderr and have useful logging in stdout +# - Have this use sbt rather than Maven release plug in + +GIT_USERNAME=pwendell +GIT_PASSWORD=XXX +GPG_PASSPHRASE=XXX +GIT_BRANCH=branch-0.9 +RELEASE_VERSION=0.9.0-incubating +RC_NAME=rc2 +USER_NAME=pwendell + +set -e + +GIT_TAG=v$RELEASE_VERSION + +# Artifact publishing + +git clone https://git-wip-us.apache.org/repos/asf/incubator-spark.git -b $GIT_BRANCH +cd incubator-spark +export MAVEN_OPTS="-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g" + +mvn -Pyarn release:clean + +mvn -DskipTests \ + -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \ + -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \ + -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ + -Pyarn \ + -Dtag=$GIT_TAG -DautoVersionSubmodules=true \ + --batch-mode release:prepare + +mvn -DskipTests \ + -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \ + -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \ + -Pyarn \ + release:perform + +rm -rf incubator-spark + +# Source and binary tarballs +git clone https://git-wip-us.apache.org/repos/asf/incubator-spark.git +cd incubator-spark +git checkout --force $GIT_TAG +release_hash=`git rev-parse HEAD` + +rm .gitignore +rm -rf .git +cd .. + +cp -r incubator-spark spark-$RELEASE_VERSION +tar cvzf spark-$RELEASE_VERSION.tgz spark-$RELEASE_VERSION +echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --armour --output spark-$RELEASE_VERSION.tgz.asc \ + --detach-sig spark-$RELEASE_VERSION.tgz +echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md MD5 spark-$RELEASE_VERSION.tgz > \ + spark-$RELEASE_VERSION.tgz.md5 +echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md SHA512 spark-$RELEASE_VERSION.tgz > \ + spark-$RELEASE_VERSION.tgz.sha +rm -rf spark-$RELEASE_VERSION + +make_binary_release() { + NAME=$1 + MAVEN_FLAGS=$2 + + cp -r incubator-spark spark-$RELEASE_VERSION-bin-$NAME + cd spark-$RELEASE_VERSION-bin-$NAME + export MAVEN_OPTS="-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g" + mvn $MAVEN_FLAGS -DskipTests clean package + find . -name test-classes -type d | xargs rm -rf + find . -name classes -type d | xargs rm -rf + cd .. 
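+  # Package this binary distribution, then sign it and write MD5/SHA512 checksums next to the tarball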
+ tar cvzf spark-$RELEASE_VERSION-bin-$NAME.tgz spark-$RELEASE_VERSION-bin-$NAME + echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --armour \ + --output spark-$RELEASE_VERSION-bin-$NAME.tgz.asc \ + --detach-sig spark-$RELEASE_VERSION-bin-$NAME.tgz + echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md \ + MD5 spark-$RELEASE_VERSION-bin-$NAME.tgz > \ + spark-$RELEASE_VERSION-bin-$NAME.tgz.md5 + echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --print-md \ + SHA512 spark-$RELEASE_VERSION-bin-$NAME.tgz > \ + spark-$RELEASE_VERSION-bin-$NAME.tgz.sha + rm -rf spark-$RELEASE_VERSION-bin-$NAME +} + +make_binary_release "hadoop1" "-Dhadoop.version=1.0.4" +make_binary_release "cdh4" "-Dhadoop.version=2.0.0-mr1-cdh4.2.0" +make_binary_release "hadoop2" "-Pyarn -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0" + +# Copy data +echo "Copying release tarballs" +ssh $USER_NAME@people.apache.org \ + mkdir /home/$USER_NAME/public_html/spark-$RELEASE_VERSION-$RC_NAME +rc_folder=spark-$RELEASE_VERSION-$RC_NAME +scp spark* \ + $USER_NAME@people.apache.org:/home/$USER_NAME/public_html/$rc_folder/ + +# Docs +cd incubator-spark +cd docs +jekyll build +echo "Copying release documentation" +rc_docs_folder=${rc_folder}-docs +rsync -r _site/* $USER_NAME@people.apache.org /home/$USER_NAME/public_html/$rc_docs_folder + +echo "Release $RELEASE_VERSION completed:" +echo "Git tag:\t $GIT_TAG" +echo "Release commit:\t $release_hash" +echo "Binary location:\t http://people.apache.org/~$USER_NAME/$rc_folder" +echo "Doc location:\t http://people.apache.org/~$USER_NAME/$rc_docs_folder" diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py new file mode 100755 index 0000000000000..40a02cba82820 --- /dev/null +++ b/dev/merge_spark_pr.py @@ -0,0 +1,197 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Utility for creating well-formed pull request merges and pushing them to Apache. +# usage: ./apache-pr-merge.py (see config env vars below) +# +# This utility assumes you already have local a Spark git folder and that you +# have added remotes corresponding to both (i) the github apache Spark +# mirror and (ii) the apache git repo. 
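+# An illustrative remote setup this tool assumes (the remote names match the
+# PR_REMOTE_NAME / PUSH_REMOTE_NAME defaults below; the GitHub mirror URL is
+# inferred from GIT_API_BASE and may differ in your environment):
+#   git remote add apache-github https://github.com/apache/incubator-spark.git
+#   git remote add apache https://git-wip-us.apache.org/repos/asf/incubator-spark.git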
+ +import json +import os +import subprocess +import sys +import tempfile +import urllib2 + +# Location of your Spark git development area +SPARK_HOME = os.environ.get("SPARK_HOME", "/home/patrick/Documents/spark") +# Remote name which points to the Gihub site +PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") +# Remote name which points to Apache git +PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") + +GIT_API_BASE = "https://api.github.com/repos/apache/incubator-spark" +# Prefix added to temporary branches +BRANCH_PREFIX = "PR_TOOL" + +os.chdir(SPARK_HOME) + +def get_json(url): + try: + return json.load(urllib2.urlopen(url)) + except urllib2.HTTPError as e: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + +def fail(msg): + print msg + clean_up() + sys.exit(-1) + +def run_cmd(cmd): + if isinstance(cmd, list): + return subprocess.check_output(cmd) + else: + return subprocess.check_output(cmd.split(" ")) + +def continue_maybe(prompt): + result = raw_input("\n%s (y/n): " % prompt) + if result.lower() != "y": + fail("Okay, exiting") + +original_head = run_cmd("git rev-parse HEAD")[:8] + +def clean_up(): + print "Restoring head pointer to %s" % original_head + run_cmd("git checkout %s" % original_head) + + branches = run_cmd("git branch").replace(" ", "").split("\n") + + for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): + print "Deleting local branch %s" % branch + run_cmd("git branch -D %s" % branch) + +# merge the requested PR and return the merge hash +def merge_pr(pr_num, target_ref): + pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) + target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) + run_cmd("git checkout %s" % target_branch_name) + + run_cmd(['git', 'merge', pr_branch_name, '--squash']) + + commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%an <%ae>']).split("\n") + distinct_authors = sorted(set(commit_authors), key=lambda x: commit_authors.count(x), reverse=True) + primary_author = distinct_authors[0] + commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name]).split("\n\n") + + merge_message = "Merge pull request #%s from %s. Closes #%s.\n\n%s\n\n%s" % ( + pr_num, pr_repo_desc, pr_num, title, body) + merge_message_parts = merge_message.split("\n\n") + merge_message_flags = [] + + for p in merge_message_parts: + merge_message_flags = merge_message_flags + ["-m", p] + authors = "\n".join(["Author: %s" % a for a in distinct_authors]) + merge_message_flags = merge_message_flags + ["-m", authors] + merge_message_flags = merge_message_flags + ["-m", "== Merge branch commits =="] + for c in commits: + merge_message_flags = merge_message_flags + ["-m", c] + + run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) + + continue_maybe("Merge complete (local ref %s). Push to %s?" % ( + target_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] + clean_up() + print("Pull request #%s merged!" 
% pr_num) + print("Merge hash: %s" % merge_hash) + return merge_hash + + +def maybe_cherry_pick(pr_num, merge_hash, default_branch): + continue_maybe("Would you like to pick %s into another branch?" % merge_hash) + pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + if pick_ref == "": + pick_ref = default_branch + + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) + + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) + run_cmd("git checkout %s" % pick_branch_name) + run_cmd("git cherry-pick -sx %s" % merge_hash) + + continue_maybe("Pick complete (local ref %s). Push to %s?" % ( + pick_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] + clean_up() + + print("Pull request #%s picked into %s!" % (pr_num, pick_ref)) + print("Pick hash: %s" % pick_hash) + +branches = get_json("%s/branches" % GIT_API_BASE) +branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) +# Assumes branch names can be sorted lexicographically +latest_branch = sorted(branch_names, reverse=True)[0] + +pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ") +pr = get_json("%s/pulls/%s" % (GIT_API_BASE, pr_num)) + +url = pr["url"] +title = pr["title"] +body = pr["body"] +target_ref = pr["base"]["ref"] +user_login = pr["user"]["login"] +base_ref = pr["head"]["ref"] +pr_repo_desc = "%s/%s" % (user_login, base_ref) + +if pr["merged"] == True: + print "Pull request %s has already been merged, assuming you want to backport" % pr_num + merge_commit_desc = run_cmd(['git', 'log', '--merges', '--first-parent', + '--grep=pull request #%s' % pr_num, '--oneline']).split("\n")[0] + if merge_commit_desc == "": + fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) + + merge_hash = merge_commit_desc[:7] + message = merge_commit_desc[8:] + + print "Found: %s" % message + maybe_cherry_pick(pr_num, merge_hash, latest_branch) + sys.exit(0) + +if bool(pr["mergeable"]) == False: + fail("Pull request %s is not mergeable in its current form" % pr_num) + +print ("\n=== Pull Request #%s ===" % pr_num) +print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( + title, pr_repo_desc, target_ref, url)) +continue_maybe("Proceed with merging pull request #%s?" % pr_num) + +merge_hash = merge_pr(pr_num, target_ref) + +while True: + maybe_cherry_pick(pr_num, merge_hash, latest_branch) From b6d40b782327188a25ded5b22790552121e5271f Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 8 Feb 2014 23:35:31 -0800 Subject: [PATCH 006/127] Merge pull request #560 from pwendell/logging. Closes #560. [WIP] SPARK-1067: Default log4j initialization causes errors for those not using log4j To fix this - we add a check when initializing log4j. 
Author: Patrick Wendell == Merge branch commits == commit ffdce513877f64b6eed6d36138c3e0003d392889 Author: Patrick Wendell Date: Fri Feb 7 15:22:29 2014 -0800 Logging fix --- core/src/main/scala/org/apache/spark/Logging.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala index b749e5414dab6..7423082e34f47 100644 --- a/core/src/main/scala/org/apache/spark/Logging.scala +++ b/core/src/main/scala/org/apache/spark/Logging.scala @@ -19,6 +19,7 @@ package org.apache.spark import org.apache.log4j.{LogManager, PropertyConfigurator} import org.slf4j.{Logger, LoggerFactory} +import org.slf4j.impl.StaticLoggerBinder /** * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows @@ -101,9 +102,11 @@ trait Logging { } private def initializeLogging() { - // If Log4j doesn't seem initialized, load a default properties file + // If Log4j is being used, but is not initialized, load a default properties file + val binder = StaticLoggerBinder.getSingleton + val usingLog4j = binder.getLoggerFactoryClassStr.endsWith("Log4jLoggerFactory") val log4jInitialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements - if (!log4jInitialized) { + if (!log4jInitialized && usingLog4j) { val defaultLogProps = "org/apache/spark/log4j-defaults.properties" val classLoader = this.getClass.getClassLoader Option(classLoader.getResource(defaultLogProps)) match { From 2ef37c93664d74de6d7f6144834883a4a4ef79b7 Mon Sep 17 00:00:00 2001 From: jyotiska Date: Sat, 8 Feb 2014 23:36:48 -0800 Subject: [PATCH 007/127] Merge pull request #562 from jyotiska/master. Closes #562. Added example Python code for sort I added an example Python code for sort. Right now, PySpark has limited examples for new people willing to use the project. This example code sorts integers stored in a file. I was able to sort 5 million, 10 million and 25 million integers with this code. Author: jyotiska == Merge branch commits == commit 8ad8faf6c8e02ae1cd68565d98524edf165f54df Author: jyotiska Date: Sun Feb 9 11:00:41 2014 +0530 Added comments in code on collect() method commit 6f98f1e313f4472a7c2207d36c4f0fbcebc95a8c Author: jyotiska Date: Sat Feb 8 13:12:37 2014 +0530 Updated python example code sort.py commit 945e39a5d68daa7e5bab0d96cbd35d7c4b04eafb Author: jyotiska Date: Sat Feb 8 12:59:09 2014 +0530 Added example python code for sort --- python/examples/sort.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100755 python/examples/sort.py diff --git a/python/examples/sort.py b/python/examples/sort.py new file mode 100755 index 0000000000000..5de20a6d98f43 --- /dev/null +++ b/python/examples/sort.py @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys + +from pyspark import SparkContext + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print >> sys.stderr, "Usage: sort " + exit(-1) + sc = SparkContext(sys.argv[1], "PythonSort") + lines = sc.textFile(sys.argv[2], 1) + sortedCount = lines.flatMap(lambda x: x.split(' ')) \ + .map(lambda x: (int(x), 1)) \ + .sortByKey(lambda x: x) + # This is just a demo on how to bring all the sorted data back to a single node. + # In reality, we wouldn't want to collect all the data to the driver node. + output = sortedCount.collect() + for (num, unitcount) in output: + print num From b6dba10ae59215b5c4e40f7632563f592f138c87 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Feb 2014 23:39:17 -0800 Subject: [PATCH 008/127] Merge pull request #556 from CodingCat/JettyUtil. Closes #556. [SPARK-1060] startJettyServer should explicitly use IP information https://spark-project.atlassian.net/browse/SPARK-1060 In the current implementation, the webserver in Master/Worker is started with val (srv, bPort) = JettyUtils.startJettyServer("0.0.0.0", port, handlers) inside startJettyServer: val server = new Server(currentPort) //here, the Server will take "0.0.0.0" as the hostname, i.e. will always bind to the IP address of the first NIC this can cause wrong IP binding, e.g. if the host has two NICs, N1 and N2, the user specify the SPARK_LOCAL_IP as the N2's IP address, however, when starting the web server, for the reason stated above, it will always bind to the N1's address Author: CodingCat == Merge branch commits == commit 6c6d9a8ccc9ec4590678a3b34cb03df19092029d Author: CodingCat Date: Thu Feb 6 14:53:34 2014 -0500 startJettyServer should explicitly use IP information --- .../org/apache/spark/deploy/master/ui/MasterWebUI.scala | 2 +- .../org/apache/spark/deploy/worker/ui/WorkerWebUI.scala | 2 +- core/src/main/scala/org/apache/spark/ui/JettyUtils.scala | 9 ++++++--- core/src/main/scala/org/apache/spark/ui/SparkUI.scala | 2 +- core/src/test/scala/org/apache/spark/ui/UISuite.scala | 6 +++--- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index ead35662fc75a..05c4df891ed75 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -45,7 +45,7 @@ class MasterWebUI(val master: Master, requestedPort: Int) extends Logging { def start() { try { - val (srv, bPort) = JettyUtils.startJettyServer("0.0.0.0", port, handlers) + val (srv, bPort) = JettyUtils.startJettyServer(host, port, handlers) server = Some(srv) boundPort = Some(bPort) logInfo("Started Master web UI at http://%s:%d".format(host, boundPort.get)) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala index 8daa47b2b2435..c23b75d757456 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala @@ -56,7 +56,7 @@ class WorkerWebUI(val worker: Worker, val workDir: File, requestedPort: Option[I def start() { try { - val (srv, bPort) = JettyUtils.startJettyServer("0.0.0.0", port, handlers) + val (srv, bPort) = JettyUtils.startJettyServer(host, port, handlers) server = 
Some(srv) boundPort = Some(bPort) logInfo("Started Worker web UI at http://%s:%d".format(host, bPort)) diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 7211dbc7c6681..b3deb41e761c8 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -30,12 +30,14 @@ import org.eclipse.jetty.server.handler.{ResourceHandler, HandlerList, ContextHa import org.eclipse.jetty.util.thread.QueuedThreadPool import org.apache.spark.Logging +import java.net.InetSocketAddress /** Utilities for launching a web server using Jetty's HTTP Server class */ private[spark] object JettyUtils extends Logging { // Base type for a function that returns something based on an HTTP request. Allows for // implicit conversion from many types of functions to jetty Handlers. + type Responder[T] = HttpServletRequest => T // Conversions from various types of Responder's to jetty Handlers @@ -92,12 +94,13 @@ private[spark] object JettyUtils extends Logging { } /** - * Attempts to start a Jetty server at the supplied ip:port which uses the supplied handlers. + * Attempts to start a Jetty server at the supplied hostName:port which uses the supplied handlers. * * If the desired port number is contented, continues incrementing ports until a free port is * found. Returns the chosen port and the jetty Server object. */ - def startJettyServer(ip: String, port: Int, handlers: Seq[(String, Handler)]): (Server, Int) = { + def startJettyServer(hostName: String, port: Int, handlers: Seq[(String, Handler)]): (Server, Int) = { + val handlersToRegister = handlers.map { case(path, handler) => val contextHandler = new ContextHandler(path) contextHandler.setHandler(handler) @@ -109,7 +112,7 @@ private[spark] object JettyUtils extends Logging { @tailrec def connect(currentPort: Int): (Server, Int) = { - val server = new Server(currentPort) + val server = new Server(new InetSocketAddress(hostName, currentPort)) val pool = new QueuedThreadPool pool.setDaemon(true) server.setThreadPool(pool) diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index 50dfdbdf5ae9b..0196f43d7431b 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -54,7 +54,7 @@ private[spark] class SparkUI(sc: SparkContext) extends Logging { /** Bind the HTTP server which backs this web interface */ def bind() { try { - val (srv, usedPort) = JettyUtils.startJettyServer("0.0.0.0", port, allHandlers) + val (srv, usedPort) = JettyUtils.startJettyServer(host, port, allHandlers) logInfo("Started Spark Web UI at http://%s:%d".format(host, usedPort)) server = Some(srv) boundPort = Some(usedPort) diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 3764f4d1a0c9b..c17bbfe7d35ba 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -32,15 +32,15 @@ class UISuite extends FunSuite { case Failure(e) => // Either case server port is busy hence setup for test complete } - val (jettyServer1, boundPort1) = JettyUtils.startJettyServer("localhost", startPort, Seq()) - val (jettyServer2, boundPort2) = JettyUtils.startJettyServer("localhost", startPort, Seq()) + val (jettyServer1, boundPort1) = JettyUtils.startJettyServer("0.0.0.0", startPort, Seq()) + 
val (jettyServer2, boundPort2) = JettyUtils.startJettyServer("0.0.0.0", startPort, Seq()) // Allow some wiggle room in case ports on the machine are under contention assert(boundPort1 > startPort && boundPort1 < startPort + 10) assert(boundPort2 > boundPort1 && boundPort2 < boundPort1 + 10) } test("jetty binds to port 0 correctly") { - val (jettyServer, boundPort) = JettyUtils.startJettyServer("localhost", 0, Seq()) + val (jettyServer, boundPort) = JettyUtils.startJettyServer("0.0.0.0", 0, Seq()) assert(jettyServer.getState === "STARTED") assert(boundPort != 0) Try {new ServerSocket(boundPort)} match { From b69f8b2a01669851c656739b6886efe4cddef31a Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sun, 9 Feb 2014 10:09:19 -0800 Subject: [PATCH 009/127] Merge pull request #557 from ScrapCodes/style. Closes #557. SPARK-1058, Fix Style Errors and Add Scala Style to Spark Build. Author: Patrick Wendell Author: Prashant Sharma == Merge branch commits == commit 1a8bd1c059b842cb95cc246aaea74a79fec684f4 Author: Prashant Sharma Date: Sun Feb 9 17:39:07 2014 +0530 scala style fixes commit f91709887a8e0b608c5c2b282db19b8a44d53a43 Author: Patrick Wendell Date: Fri Jan 24 11:22:53 2014 -0800 Adding scalastyle snapshot --- .../scala/org/apache/spark/bagel/Bagel.scala | 55 ++++---- .../scala/org/apache/spark/CacheManager.scala | 4 +- .../apache/spark/FetchFailedException.scala | 3 +- .../org/apache/spark/MapOutputTracker.scala | 2 +- .../scala/org/apache/spark/SparkContext.scala | 26 ++-- .../scala/org/apache/spark/SparkEnv.scala | 7 +- .../apache/spark/api/java/JavaDoubleRDD.scala | 4 +- .../apache/spark/api/java/JavaPairRDD.scala | 40 +++--- .../apache/spark/api/java/JavaRDDLike.scala | 18 ++- .../spark/api/java/JavaSparkContext.scala | 8 +- .../spark/api/python/PythonPartitioner.scala | 3 +- .../apache/spark/api/python/PythonRDD.scala | 13 +- .../spark/broadcast/TorrentBroadcast.scala | 3 +- .../apache/spark/deploy/ClientArguments.scala | 20 +-- .../spark/deploy/FaultToleranceTest.scala | 29 ++-- .../spark/deploy/LocalSparkCluster.scala | 3 +- .../spark/deploy/client/AppClient.scala | 3 +- .../deploy/client/AppClientListener.scala | 3 +- .../apache/spark/deploy/master/Master.scala | 18 ++- .../spark/deploy/master/ui/IndexPage.scala | 36 ++--- .../spark/deploy/worker/CommandUtils.scala | 3 +- .../spark/deploy/worker/DriverWrapper.scala | 2 +- .../spark/deploy/worker/WorkerWatcher.scala | 2 +- .../spark/deploy/worker/ui/IndexPage.scala | 12 +- .../spark/deploy/worker/ui/WorkerWebUI.scala | 2 +- .../org/apache/spark/executor/Executor.scala | 5 +- .../spark/executor/ExecutorExitCode.scala | 5 +- .../spark/executor/ExecutorSource.scala | 3 +- .../apache/spark/executor/TaskMetrics.scala | 3 +- .../apache/spark/metrics/MetricsSystem.scala | 3 +- .../org/apache/spark/network/Connection.scala | 10 +- .../spark/network/ConnectionManager.scala | 68 ++++++---- .../spark/network/ConnectionManagerTest.scala | 23 ++-- .../org/apache/spark/network/SenderTest.scala | 19 +-- .../org/apache/spark/rdd/CheckpointRDD.scala | 3 +- .../org/apache/spark/rdd/CoalescedRDD.scala | 9 +- .../apache/spark/rdd/DoubleRDDFunctions.scala | 4 +- .../apache/spark/rdd/PairRDDFunctions.scala | 2 +- .../spark/rdd/ParallelCollectionRDD.scala | 3 +- .../spark/rdd/PartitionerAwareUnionRDD.scala | 4 +- .../apache/spark/rdd/RDDCheckpointData.scala | 10 +- .../spark/rdd/SequenceFileRDDFunctions.scala | 3 +- .../apache/spark/scheduler/DAGScheduler.scala | 3 +- .../spark/scheduler/InputFormatInfo.scala | 32 +++-- 
.../apache/spark/scheduler/JobLogger.scala | 10 +- .../apache/spark/scheduler/JobResult.scala | 3 +- .../apache/spark/scheduler/ResultTask.scala | 6 +- .../spark/scheduler/SchedulableBuilder.scala | 2 +- .../spark/scheduler/SparkListener.scala | 30 +++-- .../org/apache/spark/scheduler/Stage.scala | 3 +- .../apache/spark/scheduler/StageInfo.scala | 3 +- .../org/apache/spark/scheduler/TaskInfo.scala | 11 +- .../apache/spark/scheduler/TaskResult.scala | 3 +- .../spark/scheduler/TaskScheduler.scala | 3 +- .../spark/scheduler/TaskSetManager.scala | 2 +- .../CoarseGrainedSchedulerBackend.scala | 3 +- .../cluster/SparkDeploySchedulerBackend.scala | 7 +- .../mesos/CoarseMesosSchedulerBackend.scala | 3 +- .../spark/serializer/KryoSerializer.scala | 3 +- .../apache/spark/serializer/Serializer.scala | 7 +- .../spark/storage/BlockFetcherIterator.scala | 6 +- .../apache/spark/storage/BlockManager.scala | 16 ++- .../storage/BlockManagerMasterActor.scala | 8 +- .../spark/storage/BlockManagerSource.scala | 6 +- .../spark/storage/BlockMessageArray.scala | 6 +- .../apache/spark/storage/StorageUtils.scala | 19 +-- .../scala/org/apache/spark/ui/UIUtils.scala | 15 ++- .../apache/spark/ui/UIWorkloadGenerator.scala | 3 +- .../apache/spark/ui/exec/ExecutorsUI.scala | 6 +- .../org/apache/spark/ui/jobs/IndexPage.scala | 3 +- .../org/apache/spark/ui/jobs/PoolTable.scala | 5 +- .../org/apache/spark/ui/jobs/StagePage.scala | 18 ++- .../org/apache/spark/ui/jobs/StageTable.scala | 3 +- .../apache/spark/util/ClosureCleaner.scala | 11 +- .../spark/util/CompletionIterator.scala | 5 +- .../org/apache/spark/util/Distribution.scala | 8 +- .../apache/spark/util/MetadataCleaner.scala | 3 +- .../spark/util/SerializableHyperLogLog.scala | 3 +- .../org/apache/spark/util/SizeEstimator.scala | 20 +-- .../org/apache/spark/util/StatCounter.scala | 15 ++- .../scala/org/apache/spark/util/Utils.scala | 6 +- .../scala/org/apache/spark/util/Vector.scala | 21 ++- .../apache/spark/util/collection/BitSet.scala | 2 +- .../collection/ExternalAppendOnlyMap.scala | 2 +- .../spark/util/collection/OpenHashSet.scala | 2 +- .../examples/StatefulNetworkWordCount.scala | 13 +- .../examples/TwitterAlgebirdHLL.scala | 19 ++- .../clickstream/PageViewGenerator.scala | 4 +- .../examples/clickstream/PageViewStream.scala | 3 +- .../streaming/kafka/KafkaInputDStream.scala | 14 +- .../spark/streaming/kafka/KafkaUtils.scala | 3 +- .../streaming/zeromq/ZeroMQReceiver.scala | 4 +- .../spark/streaming/zeromq/ZeroMQUtils.scala | 25 ++-- .../org/apache/spark/graphx/EdgeRDD.scala | 7 +- .../scala/org/apache/spark/graphx/Graph.scala | 19 +-- .../spark/graphx/PartitionStrategy.scala | 14 +- .../org/apache/spark/graphx/VertexRDD.scala | 23 ++-- .../apache/spark/graphx/lib/Analytics.scala | 2 +- .../spark/graphx/util/GraphGenerators.scala | 31 ++--- .../mllib/api/python/PythonMLLibAPI.scala | 3 +- .../org/apache/spark/mllib/linalg/SVD.scala | 8 +- .../mllib/optimization/GradientDescent.scala | 2 +- .../spark/mllib/recommendation/ALS.scala | 20 +-- project/SparkBuild.scala | 4 +- project/build.properties | 2 +- project/plugins.sbt | 11 +- project/project/SparkPluginBuild.scala | 2 +- .../spark/repl/ExecutorClassLoader.scala | 15 ++- .../apache/spark/repl/SparkExprTyper.scala | 2 + .../org/apache/spark/repl/SparkILoop.scala | 2 + .../apache/spark/repl/SparkILoopInit.scala | 2 + .../org/apache/spark/repl/SparkIMain.scala | 2 + .../org/apache/spark/repl/SparkImports.scala | 2 + .../spark/repl/SparkJLineCompletion.scala | 2 + .../apache/spark/repl/SparkJLineReader.scala | 2 + 
.../spark/repl/SparkMemberHandlers.scala | 2 + scalastyle-config.xml | 126 ++++++++++++++++++ .../streaming/util/MasterFailureTest.scala | 17 ++- .../tools/JavaAPICompletenessChecker.scala | 6 +- 119 files changed, 795 insertions(+), 460 deletions(-) create mode 100644 scalastyle-config.xml diff --git a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala index 44e26bbb9e094..281216612fc19 100644 --- a/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala @@ -28,21 +28,22 @@ object Bagel extends Logging { /** * Runs a Bagel program. * @param sc [[org.apache.spark.SparkContext]] to use for the program. - * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the Key will be - * the vertex id. - * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often this will be an - * empty array, i.e. sc.parallelize(Array[K, Message]()). - * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a given vertex into one - * message before sending (which often involves network I/O). - * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices after each superstep, - * and provides the result to each vertex in the next superstep. + * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the + * Key will be the vertex id. + * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often + * this will be an empty array, i.e. sc.parallelize(Array[K, Message]()). + * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a + * given vertex into one message before sending (which often involves network I/O). + * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices + * after each superstep and provides the result to each vertex in the next + * superstep. * @param partitioner [[org.apache.spark.Partitioner]] partitions values by key * @param numPartitions number of partitions across which to split the graph. * Default is the default parallelism of the SparkContext - * @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of intermediate RDDs in each superstep. - * Defaults to caching in memory. - * @param compute function that takes a Vertex, optional set of (possibly combined) messages to the Vertex, - * optional Aggregator and the current superstep, + * @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of + * intermediate RDDs in each superstep. Defaults to caching in memory. 
+ * @param compute function that takes a Vertex, optional set of (possibly combined) messages to + * the Vertex, optional Aggregator and the current superstep, * and returns a set of (Vertex, outgoing Messages) pairs * @tparam K key * @tparam V vertex type @@ -71,7 +72,7 @@ object Bagel extends Logging { var msgs = messages var noActivity = false do { - logInfo("Starting superstep "+superstep+".") + logInfo("Starting superstep " + superstep + ".") val startTime = System.currentTimeMillis val aggregated = agg(verts, aggregator) @@ -97,7 +98,8 @@ object Bagel extends Logging { verts } - /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default storage level */ + /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default + * storage level */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( sc: SparkContext, vertices: RDD[(K, V)], @@ -106,8 +108,8 @@ object Bagel extends Logging { partitioner: Partitioner, numPartitions: Int )( - compute: (V, Option[C], Int) => (V, Array[M]) - ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) + compute: (V, Option[C], Int) => (V, Array[M])): RDD[(K, V)] = run(sc, vertices, messages, + combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( @@ -127,8 +129,8 @@ object Bagel extends Logging { } /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]] - * and default storage level + * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default + * [[org.apache.spark.HashPartitioner]] and default storage level */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( sc: SparkContext, @@ -138,9 +140,13 @@ object Bagel extends Logging { numPartitions: Int )( compute: (V, Option[C], Int) => (V, Array[M]) - ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) + ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, + DEFAULT_STORAGE_LEVEL)(compute) - /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default [[org.apache.spark.HashPartitioner]]*/ + /** + * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the + * default [[org.apache.spark.HashPartitioner]] + */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( sc: SparkContext, vertices: RDD[(K, V)], @@ -158,7 +164,8 @@ object Bagel extends Logging { } /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]], + * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], + * default [[org.apache.spark.HashPartitioner]], * [[org.apache.spark.bagel.DefaultCombiner]] and the default storage level */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( @@ -171,7 +178,8 @@ object Bagel extends Logging { ): RDD[(K, V)] = run(sc, vertices, messages, numPartitions, DEFAULT_STORAGE_LEVEL)(compute) /** - * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], the default [[org.apache.spark.HashPartitioner]] + * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], + * the default [[org.apache.spark.HashPartitioner]] * and 
[[org.apache.spark.bagel.DefaultCombiner]] */ def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( @@ -227,8 +235,9 @@ object Bagel extends Logging { }) numMsgs += newMsgs.size - if (newVert.active) + if (newVert.active) { numActiveVerts += 1 + } Some((newVert, newMsgs)) }.persist(storageLevel) diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala index 8e5dd8a85020d..15a0d24fd954e 100644 --- a/core/src/main/scala/org/apache/spark/CacheManager.scala +++ b/core/src/main/scala/org/apache/spark/CacheManager.scala @@ -31,8 +31,8 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging { private val loading = new HashSet[RDDBlockId]() /** Gets or computes an RDD split. Used by RDD.iterator() when an RDD is cached. */ - def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, storageLevel: StorageLevel) - : Iterator[T] = { + def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, + storageLevel: StorageLevel): Iterator[T] = { val key = RDDBlockId(rdd.id, split.index) logDebug("Looking for partition " + key) blockManager.get(key) match { diff --git a/core/src/main/scala/org/apache/spark/FetchFailedException.scala b/core/src/main/scala/org/apache/spark/FetchFailedException.scala index d242047502fd3..8eaa26bdb1b5b 100644 --- a/core/src/main/scala/org/apache/spark/FetchFailedException.scala +++ b/core/src/main/scala/org/apache/spark/FetchFailedException.scala @@ -25,7 +25,8 @@ private[spark] class FetchFailedException( cause: Throwable) extends Exception { - def this (bmAddress: BlockManagerId, shuffleId: Int, mapId: Int, reduceId: Int, cause: Throwable) = + def this (bmAddress: BlockManagerId, shuffleId: Int, mapId: Int, reduceId: Int, + cause: Throwable) = this(FetchFailed(bmAddress, shuffleId, mapId, reduceId), "Fetch failed: %s %d %d %d".format(bmAddress, shuffleId, mapId, reduceId), cause) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 30d182b008930..8d6db0fca23f7 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -29,7 +29,7 @@ import akka.pattern.ask import org.apache.spark.scheduler.MapStatus import org.apache.spark.storage.BlockManagerId -import org.apache.spark.util.{AkkaUtils, MetadataCleaner, MetadataCleanerType, TimeStampedHashMap, Utils} +import org.apache.spark.util.{AkkaUtils, MetadataCleaner, MetadataCleanerType, TimeStampedHashMap} private[spark] sealed trait MapOutputTrackerMessage private[spark] case class GetMapOutputStatuses(shuffleId: Int) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 566472e597958..25f7a5ed1c250 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -63,9 +63,9 @@ import org.apache.spark.util.{Utils, TimeStampedHashMap, MetadataCleaner, Metada */ class SparkContext( config: SparkConf, - // This is used only by YARN for now, but should be relevant to other cluster types (Mesos, etc) - // too. This is typically generated from InputFormatInfo.computePreferredLocations. It contains - // a map from hostname to a list of input format splits on the host. + // This is used only by YARN for now, but should be relevant to other cluster types (Mesos, + // etc) too. 
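
As an illustrative aside on the Bagel.run scaladoc rewrapped in the hunk above: a minimal, hypothetical invocation of the no-aggregator, default-combiner overload might look like the sketch below. CountVertex, Hop and the local SparkContext setup are invented for illustration, and the compute signature assumes the default combiner hands each vertex an Option[Array[Hop]] of inbound messages.

import org.apache.spark.SparkContext
import org.apache.spark.bagel.{Bagel, Message, Vertex}

// Hypothetical vertex and message types; `active` and `targetId` are the members
// the Bagel Vertex/Message traits are expected to supply.
case class CountVertex(id: String, hops: Int, active: Boolean) extends Vertex
case class Hop(targetId: String) extends Message[String]

object BagelRunSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "BagelRunSketch")
    val vertices = sc.parallelize(Seq("a", "b", "c").map(id => (id, CountVertex(id, 0, true))))
    // Start from an empty message RDD, as the scaladoc above suggests.
    val messages = sc.parallelize(Array[(String, Hop)]())

    // compute runs once per vertex per superstep with the (combined) inbound messages.
    val compute = (v: CountVertex, msgs: Option[Array[Hop]], superstep: Int) => {
      val updated = v.copy(hops = v.hops + msgs.map(_.length).getOrElse(0),
                           active = superstep < 3)
      (updated, Array[Hop]())  // this sketch sends no further messages
    }

    val result = Bagel.run(sc, vertices, messages, 2)(compute)
    result.collect().foreach(println)
    sc.stop()
  }
}
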
This is typically generated from InputFormatInfo.computePreferredLocations. It + // contains a map from hostname to a list of input format splits on the host. val preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map()) extends Logging { @@ -552,10 +552,11 @@ class SparkContext( /** * Load an RDD saved as a SequenceFile containing serialized objects, with NullWritable keys and - * BytesWritable values that contain a serialized partition. This is still an experimental storage - * format and may not be supported exactly as is in future Spark releases. It will also be pretty - * slow if you use the default serializer (Java serialization), though the nice thing about it is - * that there's very little effort required to save arbitrary objects. + * BytesWritable values that contain a serialized partition. This is still an experimental + * storage format and may not be supported exactly as is in future Spark releases. It will also + * be pretty slow if you use the default serializer (Java serialization), + * though the nice thing about it is that there's very little effort required to save arbitrary + * objects. */ def objectFile[T: ClassTag]( path: String, @@ -1043,7 +1044,7 @@ object SparkContext { implicit object LongAccumulatorParam extends AccumulatorParam[Long] { def addInPlace(t1: Long, t2: Long) = t1 + t2 - def zero(initialValue: Long) = 0l + def zero(initialValue: Long) = 0L } implicit object FloatAccumulatorParam extends AccumulatorParam[Float] { @@ -1109,7 +1110,8 @@ object SparkContext { implicit def floatWritableConverter() = simpleWritableConverter[Float, FloatWritable](_.get) - implicit def booleanWritableConverter() = simpleWritableConverter[Boolean, BooleanWritable](_.get) + implicit def booleanWritableConverter() = + simpleWritableConverter[Boolean, BooleanWritable](_.get) implicit def bytesWritableConverter() = { simpleWritableConverter[Array[Byte], BytesWritable](_.getBytes) @@ -1258,7 +1260,8 @@ object SparkContext { case "yarn-client" => val scheduler = try { - val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") + val clazz = + Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") val cons = clazz.getConstructor(classOf[SparkContext]) cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl] @@ -1269,7 +1272,8 @@ object SparkContext { } val backend = try { - val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend") + val clazz = + Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend") val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext]) cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend] } catch { diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index ed788560e79f1..6ae020f6a21b1 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -96,7 +96,7 @@ object SparkEnv extends Logging { @volatile private var lastSetSparkEnv : SparkEnv = _ def set(e: SparkEnv) { - lastSetSparkEnv = e + lastSetSparkEnv = e env.set(e) } @@ -112,7 +112,7 @@ object SparkEnv extends Logging { * Returns the ThreadLocal SparkEnv. 
*/ def getThreadLocal: SparkEnv = { - env.get() + env.get() } private[spark] def create( @@ -168,7 +168,8 @@ object SparkEnv extends Logging { val blockManagerMaster = new BlockManagerMaster(registerOrLookup( "BlockManagerMaster", new BlockManagerMasterActor(isLocal, conf)), conf) - val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, serializer, conf) + val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, + serializer, conf) val connectionManager = blockManager.connectionManager diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala index b0dedc6f4eb13..33737e1960ade 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala @@ -148,8 +148,8 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav def sum(): Double = srdd.sum() /** - * Return a [[org.apache.spark.util.StatCounter]] object that captures the mean, variance and count - * of the RDD's elements in one operation. + * Return a [[org.apache.spark.util.StatCounter]] object that captures the mean, variance and + * count of the RDD's elements in one operation. */ def stats(): StatCounter = srdd.stats() diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index f430a33db1e4a..5b1bf9476e4d5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -88,7 +88,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K /** * Return a new RDD containing the distinct elements in this RDD. */ - def distinct(numPartitions: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.distinct(numPartitions)) + def distinct(numPartitions: Int): JavaPairRDD[K, V] = + new JavaPairRDD[K, V](rdd.distinct(numPartitions)) /** * Return a new RDD containing only the elements that satisfy a predicate. @@ -210,25 +211,25 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K rdd.countByKeyApprox(timeout, confidence).map(mapAsJavaMap) /** - * Merge the values for each key using an associative function and a neutral "zero value" which may - * be added to the result an arbitrary number of times, and must not change the result (e.g., Nil for - * list concatenation, 0 for addition, or 1 for multiplication.). + * Merge the values for each key using an associative function and a neutral "zero value" which + * may be added to the result an arbitrary number of times, and must not change the result + * (e.g ., Nil for list concatenation, 0 for addition, or 1 for multiplication.). */ - def foldByKey(zeroValue: V, partitioner: Partitioner, func: JFunction2[V, V, V]): JavaPairRDD[K, V] = - fromRDD(rdd.foldByKey(zeroValue, partitioner)(func)) + def foldByKey(zeroValue: V, partitioner: Partitioner, func: JFunction2[V, V, V]) + : JavaPairRDD[K, V] = fromRDD(rdd.foldByKey(zeroValue, partitioner)(func)) /** - * Merge the values for each key using an associative function and a neutral "zero value" which may - * be added to the result an arbitrary number of times, and must not change the result (e.g., Nil for - * list concatenation, 0 for addition, or 1 for multiplication.). 
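
To make the neutral-zero-value requirement in the foldByKey scaladoc above concrete, here is a small self-contained sketch (the sales data is made up): 0 is neutral for addition, so folding it in any number of times leaves the per-key totals unchanged.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._  // brings foldByKey and friends into scope on pair RDDs

object FoldByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "FoldByKeySketch")
    val sales = sc.parallelize(Seq(("apples", 3), ("apples", 4), ("pears", 2)))

    // The zero value may be folded in an arbitrary number of times (at least once per
    // partition), so it must not change the result: 0 is neutral for +.
    val totals = sales.foldByKey(0)(_ + _)
    totals.collect().foreach(println)  // (apples,7), (pears,2)
    sc.stop()
  }
}
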
+ * Merge the values for each key using an associative function and a neutral "zero value" which + * may be added to the result an arbitrary number of times, and must not change the result + * (e.g ., Nil for list concatenation, 0 for addition, or 1 for multiplication.). */ def foldByKey(zeroValue: V, numPartitions: Int, func: JFunction2[V, V, V]): JavaPairRDD[K, V] = fromRDD(rdd.foldByKey(zeroValue, numPartitions)(func)) /** - * Merge the values for each key using an associative function and a neutral "zero value" which may - * be added to the result an arbitrary number of times, and must not change the result (e.g., Nil for - * list concatenation, 0 for addition, or 1 for multiplication.). + * Merge the values for each key using an associative function and a neutral "zero value" + * which may be added to the result an arbitrary number of times, and must not change the result + * (e.g., Nil for list concatenation, 0 for addition, or 1 for multiplication.). */ def foldByKey(zeroValue: V, func: JFunction2[V, V, V]): JavaPairRDD[K, V] = fromRDD(rdd.foldByKey(zeroValue)(func)) @@ -375,7 +376,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output * into `numPartitions` partitions. */ - def leftOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (V, Optional[W])] = { + def leftOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int) + : JavaPairRDD[K, (V, Optional[W])] = { val joinResult = rdd.leftOuterJoin(other, numPartitions) fromRDD(joinResult.mapValues{case (v, w) => (v, JavaUtils.optionToOptional(w))}) } @@ -397,7 +399,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K * pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting * RDD into the given number of partitions. */ - def rightOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (Optional[V], W)] = { + def rightOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int) + : JavaPairRDD[K, (Optional[V], W)] = { val joinResult = rdd.rightOuterJoin(other, numPartitions) fromRDD(joinResult.mapValues{case (v, w) => (JavaUtils.optionToOptional(v), w)}) } @@ -439,8 +442,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K * For each key k in `this` or `other1` or `other2`, return a resulting RDD that contains a * tuple with the list of values for that key in `this`, `other1` and `other2`. */ - def cogroup[W1, W2](other1: JavaPairRDD[K, W1], other2: JavaPairRDD[K, W2], partitioner: Partitioner) - : JavaPairRDD[K, (JList[V], JList[W1], JList[W2])] = + def cogroup[W1, W2](other1: JavaPairRDD[K, W1], other2: JavaPairRDD[K, W2], + partitioner: Partitioner): JavaPairRDD[K, (JList[V], JList[W1], JList[W2])] = fromRDD(cogroupResult2ToJava(rdd.cogroup(other1, other2, partitioner))) /** @@ -462,8 +465,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kClassTag: ClassTag[K * For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the * list of values for that key in `this` as well as `other`. 
*/ - def cogroup[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (JList[V], JList[W])] - = fromRDD(cogroupResultToJava(rdd.cogroup(other, numPartitions))) + def cogroup[W](other: JavaPairRDD[K, W], numPartitions: Int) + : JavaPairRDD[K, (JList[V], JList[W])] = + fromRDD(cogroupResultToJava(rdd.cogroup(other, numPartitions))) /** * For each key k in `this` or `other1` or `other2`, return a resulting RDD that contains a diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 4db7339e6716b..fcb9729c10a6d 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -76,7 +76,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { f: JFunction2[Int, java.util.Iterator[T], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))), - preservesPartitioning)) + preservesPartitioning)) /** * Return a new RDD by applying a function to all elements of this RDD. @@ -134,7 +134,8 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Return a new RDD by applying a function to each partition of this RDD. */ - def mapPartitions[U](f: FlatMapFunction[java.util.Iterator[T], U], preservesPartitioning: Boolean): JavaRDD[U] = { + def mapPartitions[U](f: FlatMapFunction[java.util.Iterator[T], U], + preservesPartitioning: Boolean): JavaRDD[U] = { def fn = (x: Iterator[T]) => asScalaIterator(f.apply(asJavaIterator(x)).iterator()) JavaRDD.fromRDD(rdd.mapPartitions(fn, preservesPartitioning)(f.elementType()))(f.elementType()) } @@ -160,16 +161,18 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Return a new RDD by applying a function to each partition of this RDD. */ - def mapPartitions(f: DoubleFlatMapFunction[java.util.Iterator[T]], preservesPartitioning: Boolean): JavaDoubleRDD = { + def mapPartitions(f: DoubleFlatMapFunction[java.util.Iterator[T]], + preservesPartitioning: Boolean): JavaDoubleRDD = { def fn = (x: Iterator[T]) => asScalaIterator(f.apply(asJavaIterator(x)).iterator()) - new JavaDoubleRDD(rdd.mapPartitions(fn, preservesPartitioning).map((x: java.lang.Double) => x.doubleValue())) + new JavaDoubleRDD(rdd.mapPartitions(fn, preservesPartitioning) + .map((x: java.lang.Double) => x.doubleValue())) } /** * Return a new RDD by applying a function to each partition of this RDD. */ - def mapPartitions[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2], preservesPartitioning: Boolean): - JavaPairRDD[K2, V2] = { + def mapPartitions[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2], + preservesPartitioning: Boolean): JavaPairRDD[K2, V2] = { def fn = (x: Iterator[T]) => asScalaIterator(f.apply(asJavaIterator(x)).iterator()) JavaPairRDD.fromRDD(rdd.mapPartitions(fn, preservesPartitioning))(f.keyType(), f.valueType()) } @@ -294,7 +297,8 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { } /** - * Reduces the elements of this RDD using the specified commutative and associative binary operator. + * Reduces the elements of this RDD using the specified commutative and associative binary + * operator. 
*/ def reduce(f: JFunction2[T, T, T]): T = rdd.reduce(f) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 5a426b983519c..22dc9c9e2ecfe 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -362,15 +362,15 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork doubleAccumulator(initialValue) /** - * Create an [[org.apache.spark.Accumulator]] variable of a given type, which tasks can "add" values - * to using the `add` method. Only the master can access the accumulator's `value`. + * Create an [[org.apache.spark.Accumulator]] variable of a given type, which tasks can "add" + * values to using the `add` method. Only the master can access the accumulator's `value`. */ def accumulator[T](initialValue: T, accumulatorParam: AccumulatorParam[T]): Accumulator[T] = sc.accumulator(initialValue)(accumulatorParam) /** - * Create an [[org.apache.spark.Accumulable]] shared variable of the given type, to which tasks can - * "add" values with `add`. Only the master can access the accumuable's `value`. + * Create an [[org.apache.spark.Accumulable]] shared variable of the given type, to which tasks + * can "add" values with `add`. Only the master can access the accumuable's `value`. */ def accumulable[T, R](initialValue: T, param: AccumulableParam[T, R]): Accumulable[T, R] = sc.accumulable(initialValue)(param) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala index 2be4e323bec98..35eca62ecd586 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala @@ -23,7 +23,8 @@ import org.apache.spark.Partitioner import org.apache.spark.util.Utils /** - * A [[org.apache.spark.Partitioner]] that performs handling of long-valued keys, for use by the Python API. + * A [[org.apache.spark.Partitioner]] that performs handling of long-valued keys, for use by the + * Python API. * * Stores the unique id() of the Python-side partitioning function so that it is incorporated into * equality comparisons. Correctness requires that the id is a unique identifier for the diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 9cbd26b607b24..33667a998ed41 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -91,8 +91,9 @@ private[spark] class PythonRDD[T: ClassTag]( // Kill the Python worker process: worker.shutdownOutput() case e: IOException => - // This can happen for legitimate reasons if the Python code stops returning data before we are done - // passing elements through, e.g., for take(). Just log a message to say it happened. + // This can happen for legitimate reasons if the Python code stops returning data + // before we are done passing elements through, e.g., for take(). Just log a message + // to say it happened. 
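
As a brief aside on the accumulator/accumulable scaladoc rewrapped above: the contract is that tasks only add to the accumulator while the driver ("master") reads its value. A minimal sketch under that assumption (the blank-line counting is invented for illustration):

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._  // implicit AccumulatorParam instances

object AccumulatorSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "AccumulatorSketch")

    // Tasks may only call `+=` (add); only the driver may read `value`.
    val blankLines = sc.accumulator(0)
    sc.parallelize(Seq("spark", "", "bagel", "")).foreach { line =>
      if (line.isEmpty) blankLines += 1
    }
    println("blank lines: " + blankLines.value)
    sc.stop()
  }
}
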
logInfo("stdin writer to Python finished early") logDebug("stdin writer to Python finished early", e) } @@ -132,7 +133,8 @@ private[spark] class PythonRDD[T: ClassTag]( val init = initTime - bootTime val finish = finishTime - initTime val total = finishTime - startTime - logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, init, finish)) + logInfo("Times: total = %s, boot = %s, init = %s, finish = %s".format(total, boot, + init, finish)) read case SpecialLengths.PYTHON_EXCEPTION_THROWN => // Signals that an exception has been thrown in python @@ -184,7 +186,7 @@ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends override def compute(split: Partition, context: TaskContext) = prev.iterator(split, context).grouped(2).map { case Seq(a, b) => (Utils.deserializeLongValue(a), b) - case x => throw new SparkException("PairwiseRDD: unexpected value: " + x) + case x => throw new SparkException("PairwiseRDD: unexpected value: " + x) } val asJavaPairRDD : JavaPairRDD[Long, Array[Byte]] = JavaPairRDD.fromRDD(this) } @@ -274,7 +276,8 @@ private[spark] object PythonRDD { } -private class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] { +private +class BytesToString extends org.apache.spark.api.java.function.Function[Array[Byte], String] { override def call(arr: Array[Byte]) : String = new String(arr, "UTF-8") } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index d351dfc1f56a2..ec997255d59a1 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -187,8 +187,9 @@ extends Logging { val bais = new ByteArrayInputStream(byteArray) var blockNum = (byteArray.length / BLOCK_SIZE) - if (byteArray.length % BLOCK_SIZE != 0) + if (byteArray.length % BLOCK_SIZE != 0) { blockNum += 1 + } var retVal = new Array[TorrentBlock](blockNum) var blockID = 0 diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala index db67c6d1bb55c..3db970ca73b92 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala @@ -101,16 +101,16 @@ private[spark] class ClientArguments(args: Array[String]) { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. 
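
The TorrentBroadcast hunk above wraps the remainder check in braces when computing how many fixed-size blocks a serialized byte array splits into; as an aside, a tiny self-contained sketch of that blockification arithmetic (BLOCK_SIZE, blockify and the sample payload are invented for illustration):

object BlockifySketch {
  val BLOCK_SIZE = 4  // bytes per block; a real broadcast block size is much larger

  def blockify(payload: Array[Byte]): Array[Array[Byte]] = {
    // Ceiling division: one extra block when the payload is not a multiple of BLOCK_SIZE.
    var blockNum = payload.length / BLOCK_SIZE
    if (payload.length % BLOCK_SIZE != 0) {
      blockNum += 1
    }
    (0 until blockNum).map { i =>
      payload.slice(i * BLOCK_SIZE, math.min((i + 1) * BLOCK_SIZE, payload.length))
    }.toArray
  }

  def main(args: Array[String]) {
    val blocks = blockify(Array.tabulate[Byte](10)(_.toByte))
    println(blocks.map(_.length).mkString(","))  // 4,4,2
  }
}
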
val usage = - s""" - |Usage: DriverClient [options] launch [driver options] - |Usage: DriverClient kill - | - |Options: - | -c CORES, --cores CORES Number of cores to request (default: $defaultCores) - | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $defaultMemory) - | -s, --supervise Whether to restart the driver on failure - | -v, --verbose Print more debugging output - """.stripMargin + s""" + |Usage: DriverClient [options] launch [driver options] + |Usage: DriverClient kill + | + |Options: + | -c CORES, --cores CORES Number of cores to request (default: $defaultCores) + | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $defaultMemory) + | -s, --supervise Whether to restart the driver on failure + | -v, --verbose Print more debugging output + """.stripMargin System.err.println(usage) System.exit(exitCode) } diff --git a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala index 4dfb19ed8adb6..7de7c4864ee54 100644 --- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala @@ -1,20 +1,18 @@ /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ package org.apache.spark.deploy @@ -306,7 +304,8 @@ private[spark] object FaultToleranceTest extends App with Logging { } } - logInfo("Ran %s tests, %s passed and %s failed".format(numPassed+numFailed, numPassed, numFailed)) + logInfo("Ran %s tests, %s passed and %s failed".format(numPassed + numFailed, numPassed, + numFailed)) } private[spark] class TestMasterInfo(val ip: String, val dockerId: DockerId, val logFile: File) diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala index ffc0cb09032fb..488843a32c167 100644 --- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala +++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala @@ -33,7 +33,8 @@ import scala.collection.mutable.ArrayBuffer * fault recovery without spinning up a lot of processes. */ private[spark] -class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: Int) extends Logging { +class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: Int) + extends Logging { private val localHostname = Utils.localHostName() private val masterActorSystems = ArrayBuffer[ActorSystem]() diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 1415e2f3d1886..8901806de9262 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -132,7 +132,8 @@ private[spark] class AppClient( case ExecutorAdded(id: Int, workerId: String, hostPort: String, cores: Int, memory: Int) => val fullId = appId + "/" + id - logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, hostPort, cores)) + logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, hostPort, + cores)) listener.executorAdded(fullId, workerId, hostPort, cores, memory) case ExecutorUpdated(id, state, message, exitStatus) => diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClientListener.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClientListener.scala index 55d4ef1b31aaa..2f2cbd182c967 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClientListener.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClientListener.scala @@ -33,7 +33,8 @@ private[spark] trait AppClientListener { /** Dead means that we couldn't find any Masters to connect to, and have given up. 
*/ def dead(): Unit - def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int): Unit + def executorAdded( + fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int): Unit def executorRemoved(fullId: String, message: String, exitStatus: Option[Int]): Unit } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 2ef167ffc00f0..82bf655212fcc 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -149,10 +149,11 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act override def receive = { case ElectedLeader => { val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData() - state = if (storedApps.isEmpty && storedDrivers.isEmpty && storedWorkers.isEmpty) + state = if (storedApps.isEmpty && storedDrivers.isEmpty && storedWorkers.isEmpty) { RecoveryState.ALIVE - else + } else { RecoveryState.RECOVERING + } logInfo("I have been elected leader! New state: " + state) if (state == RecoveryState.RECOVERING) { beginRecovery(storedApps, storedDrivers, storedWorkers) @@ -165,7 +166,8 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act System.exit(0) } - case RegisterWorker(id, workerHost, workerPort, cores, memory, workerWebUiPort, publicAddress) => { + case RegisterWorker(id, workerHost, workerPort, cores, memory, workerWebUiPort, publicAddress) + => { logInfo("Registering worker %s:%d with %d cores, %s RAM".format( host, workerPort, cores, Utils.megabytesToString(memory))) if (state == RecoveryState.STANDBY) { @@ -181,9 +183,10 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act schedule() } else { val workerAddress = worker.actor.path.address - logWarning("Worker registration failed. Attempted to re-register worker at same address: " + - workerAddress) - sender ! RegisterWorkerFailed("Attempted to re-register worker at same address: " + workerAddress) + logWarning("Worker registration failed. Attempted to re-register worker at same " + + "address: " + workerAddress) + sender ! RegisterWorkerFailed("Attempted to re-register worker at same address: " + + workerAddress) } } } @@ -641,8 +644,9 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act worker.id, WORKER_TIMEOUT/1000)) removeWorker(worker) } else { - if (worker.lastHeartbeat < currentTime - ((REAPER_ITERATIONS + 1) * WORKER_TIMEOUT)) + if (worker.lastHeartbeat < currentTime - ((REAPER_ITERATIONS + 1) * WORKER_TIMEOUT)) { workers -= worker // we've seen this DEAD worker in the UI, etc. 
for long enough; cull it + } } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/IndexPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/IndexPage.scala index a9af8df5525d6..64ecf22399e39 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/IndexPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/IndexPage.scala @@ -57,7 +57,8 @@ private[spark] class IndexPage(parent: MasterWebUI) { val completedApps = state.completedApps.sortBy(_.endTime).reverse val completedAppsTable = UIUtils.listingTable(appHeaders, appRow, completedApps) - val driverHeaders = Seq("ID", "Submitted Time", "Worker", "State", "Cores", "Memory", "Main Class") + val driverHeaders = Seq("ID", "Submitted Time", "Worker", "State", "Cores", "Memory", + "Main Class") val activeDrivers = state.activeDrivers.sortBy(_.startTime).reverse val activeDriversTable = UIUtils.listingTable(driverHeaders, driverRow, activeDrivers) val completedDrivers = state.completedDrivers.sortBy(_.startTime).reverse @@ -103,13 +104,14 @@ private[spark] class IndexPage(parent: MasterWebUI) {
- {if (hasDrivers) -
-
-

Running Drivers

- {activeDriversTable} -
-
+ {if (hasDrivers) { +
+
+

Running Drivers

+ {activeDriversTable} +
+
+ } }
@@ -121,13 +123,14 @@ private[spark] class IndexPage(parent: MasterWebUI) {
- {if (hasDrivers) -
-
-

Completed Drivers

- {completedDriversTable} -
-
+ {if (hasDrivers) { +
+
+

Completed Drivers

+ {completedDriversTable} +
+
+ } }
; @@ -175,7 +178,8 @@ private[spark] class IndexPage(parent: MasterWebUI) { {driver.id} {driver.submitDate} - {driver.worker.map(w =>
{w.id.toString}).getOrElse("None")} + {driver.worker.map(w => {w.id.toString}).getOrElse("None")} + {driver.state} {driver.desc.cores} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala index 460883ec7ae24..f411eb9cec89f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala @@ -49,7 +49,8 @@ object CommandUtils extends Logging { val libraryOpts = getEnv("SPARK_LIBRARY_PATH", command) .map(p => List("-Djava.library.path=" + p)) .getOrElse(Nil) - val workerLocalOpts = Option(getenv("SPARK_JAVA_OPTS")).map(Utils.splitCommandString).getOrElse(Nil) + val workerLocalOpts = Option(getenv("SPARK_JAVA_OPTS")) + .map(Utils.splitCommandString).getOrElse(Nil) val userOpts = getEnv("SPARK_JAVA_OPTS", command).map(Utils.splitCommandString).getOrElse(Nil) val memoryOpts = Seq(s"-Xms${memory}M", s"-Xmx${memory}M") diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index 6f6c101547c3c..a26e47950a0ec 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -45,4 +45,4 @@ object DriverWrapper { System.exit(-1) } } -} \ No newline at end of file +} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala index 1dc39c450ea16..530c147000904 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerWatcher.scala @@ -69,4 +69,4 @@ private[spark] class WorkerWatcher(workerUrl: String) extends Actor case e => logWarning(s"Received unexpected actor system event: $e") } -} \ No newline at end of file +} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/IndexPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/IndexPage.scala index 925c6fb1832d7..3089acffb8d98 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/IndexPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/IndexPage.scala @@ -84,7 +84,7 @@ private[spark] class IndexPage(parent: WorkerWebUI) { {runningExecutorTable} - + // scalastyle:off
{if (hasDrivers)
@@ -113,7 +113,7 @@ private[spark] class IndexPage(parent: WorkerWebUI) {
}
; - + // scalastyle:on UIUtils.basicSparkPage(content, "Spark Worker at %s:%s".format( workerState.host, workerState.port)) } @@ -133,10 +133,10 @@ private[spark] class IndexPage(parent: WorkerWebUI) { - stdout - stderr + stdout + stderr diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala index c23b75d757456..86688e44242a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala @@ -187,7 +187,7 @@ class WorkerWebUI(val worker: Worker, val workDir: File, requestedPort: Option[I val logPageLength = math.min(byteLength, maxBytes) - val endByte = math.min(startByte+logPageLength, logLength) + val endByte = math.min(startByte + logPageLength, logLength) (startByte, endByte) } diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index f7efd74e1b043..989d666f15600 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -205,7 +205,7 @@ private[spark] class Executor( } attemptedTask = Some(task) - logDebug("Task " + taskId +"'s epoch is " + task.epoch) + logDebug("Task " + taskId + "'s epoch is " + task.epoch) env.mapOutputTracker.updateEpoch(task.epoch) // Run the actual task and measure its runtime. @@ -233,7 +233,8 @@ private[spark] class Executor( val accumUpdates = Accumulators.values - val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.getOrElse(null)) + val directResult = new DirectTaskResult(valueBytes, accumUpdates, + task.metrics.getOrElse(null)) val serializedDirectResult = ser.serialize(directResult) logInfo("Serialized size of result for " + taskId + " is " + serializedDirectResult.limit) val serializedResult = { diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala index e5c9bbbe2874e..210f3dbeebaca 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala @@ -50,10 +50,11 @@ object ExecutorExitCode { "Failed to create local directory (bad spark.local.dir?)" case _ => "Unknown executor exit code (" + exitCode + ")" + ( - if (exitCode > 128) + if (exitCode > 128) { " (died from signal " + (exitCode - 128) + "?)" - else + } else { "" + } ) } } diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala index 97176e4f5b727..c2e973e1738d4 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala @@ -55,7 +55,8 @@ class ExecutorSource(val executor: Executor, executorId: String) extends Source override def getValue: Int = executor.threadPool.getPoolSize() }) - // Gauge got executor thread pool's largest number of threads that have ever simultaneously been in th pool + // Gauge got executor thread pool's largest number of threads that have ever simultaneously + // been in th pool metricRegistry.register(MetricRegistry.name("threadpool", "maxPool_size"), new Gauge[Int] { override def getValue: Int = executor.threadPool.getMaximumPoolSize() }) diff --git 
a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 0c8f4662a5f3a..455339943f42d 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -64,7 +64,8 @@ class TaskMetrics extends Serializable { var shuffleReadMetrics: Option[ShuffleReadMetrics] = None /** - * If this task writes to shuffle output, metrics on the written shuffle data will be collected here + * If this task writes to shuffle output, metrics on the written shuffle data will be collected + * here */ var shuffleWriteMetrics: Option[ShuffleWriteMetrics] = None } diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 9930537b34db0..de233e416a9dc 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -56,7 +56,8 @@ import org.apache.spark.metrics.source.Source * wild card "*" can be used to replace instance name, which means all the instances will have * this property. * - * [sink|source] means this property belongs to source or sink. This field can only be source or sink. + * [sink|source] means this property belongs to source or sink. This field can only be + * source or sink. * * [name] specify the name of sink or source, it is custom defined. * diff --git a/core/src/main/scala/org/apache/spark/network/Connection.scala b/core/src/main/scala/org/apache/spark/network/Connection.scala index cba8477ed5723..ae2007e41b77f 100644 --- a/core/src/main/scala/org/apache/spark/network/Connection.scala +++ b/core/src/main/scala/org/apache/spark/network/Connection.scala @@ -211,7 +211,6 @@ class SendingConnection(val address: InetSocketAddress, selector_ : Selector, } return chunk } else { - /*logInfo("Finished sending [" + message + "] to [" + getRemoteConnectionManagerId() + "]")*/ message.finishTime = System.currentTimeMillis logDebug("Finished sending [" + message + "] to [" + getRemoteConnectionManagerId() + "] in " + message.timeTaken ) @@ -238,7 +237,7 @@ class SendingConnection(val address: InetSocketAddress, selector_ : Selector, message.startTime = System.currentTimeMillis } logTrace( - "Sending chunk from [" + message+ "] to [" + getRemoteConnectionManagerId() + "]") + "Sending chunk from [" + message + "] to [" + getRemoteConnectionManagerId() + "]") return chunk } else { message.finishTime = System.currentTimeMillis @@ -349,8 +348,8 @@ class SendingConnection(val address: InetSocketAddress, selector_ : Selector, outbox.getChunk() match { case Some(chunk) => { val buffers = chunk.buffers - // If we have 'seen' pending messages, then reset flag - since we handle that as normal - // registering of event (below) + // If we have 'seen' pending messages, then reset flag - since we handle that as + // normal registering of event (below) if (needForceReregister && buffers.exists(_.remaining() > 0)) resetForceReregister() currentBuffers ++= buffers } @@ -404,7 +403,8 @@ class SendingConnection(val address: InetSocketAddress, selector_ : Selector, } } catch { case e: Exception => - logError("Exception while reading SendingConnection to " + getRemoteConnectionManagerId(), e) + logError("Exception while reading SendingConnection to " + getRemoteConnectionManagerId(), + e) callOnExceptionCallback(e) close() } diff --git 
a/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala index e6e01783c8895..24d0a7deb57d0 100644 --- a/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala +++ b/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala @@ -65,7 +65,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi conf.getInt("spark.core.connection.io.threads.keepalive", 60), TimeUnit.SECONDS, new LinkedBlockingDeque[Runnable]()) - // Use a different, yet smaller, thread pool - infrequently used with very short lived tasks : which should be executed asap + // Use a different, yet smaller, thread pool - infrequently used with very short lived tasks : + // which should be executed asap private val handleConnectExecutor = new ThreadPoolExecutor( conf.getInt("spark.core.connection.connect.threads.min", 1), conf.getInt("spark.core.connection.connect.threads.max", 8), @@ -73,8 +74,10 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi new LinkedBlockingDeque[Runnable]()) private val serverChannel = ServerSocketChannel.open() - private val connectionsByKey = new HashMap[SelectionKey, Connection] with SynchronizedMap[SelectionKey, Connection] - private val connectionsById = new HashMap[ConnectionManagerId, SendingConnection] with SynchronizedMap[ConnectionManagerId, SendingConnection] + private val connectionsByKey = new HashMap[SelectionKey, Connection] + with SynchronizedMap[SelectionKey, Connection] + private val connectionsById = new HashMap[ConnectionManagerId, SendingConnection] + with SynchronizedMap[ConnectionManagerId, SendingConnection] private val messageStatuses = new HashMap[Int, MessageStatus] private val keyInterestChangeRequests = new SynchronizedQueue[(SelectionKey, Int)] private val registerRequests = new SynchronizedQueue[SendingConnection] @@ -173,7 +176,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi if (conn == null) return // prevent other events from being triggered - // Since we are still trying to connect, we do not need to do the additional steps in triggerWrite + // Since we are still trying to connect, we do not need to do the additional steps in + // triggerWrite conn.changeConnectionKeyInterest(0) handleConnectExecutor.execute(new Runnable { @@ -188,8 +192,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi } // fallback to previous behavior : we should not really come here since this method was - // triggered since channel became connectable : but at times, the first finishConnect need not - // succeed : hence the loop to retry a few 'times'. + // triggered since channel became connectable : but at times, the first finishConnect need + // not succeed : hence the loop to retry a few 'times'. 
conn.finishConnect(true) } } ) @@ -258,8 +262,9 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi if (opStrs.size > 0) opStrs.reduceLeft(_ + " | " + _) else " " } - logTrace("Changed key for connection to [" + connection.getRemoteConnectionManagerId() + - "] changed from [" + intToOpStr(lastOps) + "] to [" + intToOpStr(ops) + "]") + logTrace("Changed key for connection to [" + + connection.getRemoteConnectionManagerId() + "] changed from [" + + intToOpStr(lastOps) + "] to [" + intToOpStr(ops) + "]") } } } else { @@ -282,7 +287,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi try { selector.select() } catch { - // Explicitly only dealing with CancelledKeyException here since other exceptions should be dealt with differently. + // Explicitly only dealing with CancelledKeyException here since other exceptions + // should be dealt with differently. case e: CancelledKeyException => { // Some keys within the selectors list are invalid/closed. clear them. val allKeys = selector.keys().iterator() @@ -310,7 +316,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi } if (selectedKeysCount == 0) { - logDebug("Selector selected " + selectedKeysCount + " of " + selector.keys.size + " keys") + logDebug("Selector selected " + selectedKeysCount + " of " + selector.keys.size + + " keys") } if (selectorThread.isInterrupted) { logInfo("Selector thread was interrupted!") @@ -341,7 +348,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi throw new CancelledKeyException() } } catch { - // weird, but we saw this happening - even though key.isValid was true, key.isAcceptable would throw CancelledKeyException. + // weird, but we saw this happening - even though key.isValid was true, + // key.isAcceptable would throw CancelledKeyException. case e: CancelledKeyException => { logInfo("key already cancelled ? 
" + key, e) triggerForceCloseByException(key, e) @@ -437,9 +445,10 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi assert (sendingConnectionManagerId == remoteConnectionManagerId) messageStatuses.synchronized { - for (s <- messageStatuses.values if s.connectionManagerId == sendingConnectionManagerId) { - logInfo("Notifying " + s) - s.synchronized { + for (s <- messageStatuses.values if + s.connectionManagerId == sendingConnectionManagerId) { + logInfo("Notifying " + s) + s.synchronized { s.attempted = true s.acked = false s.markDone() @@ -458,7 +467,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi } def handleConnectionError(connection: Connection, e: Exception) { - logInfo("Handling connection error on connection to " + connection.getRemoteConnectionManagerId()) + logInfo("Handling connection error on connection to " + + connection.getRemoteConnectionManagerId()) removeConnection(connection) } @@ -495,7 +505,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi status } case None => { - throw new Exception("Could not find reference for received ack message " + message.id) + throw new Exception("Could not find reference for received ack message " + + message.id) null } } @@ -517,7 +528,8 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi if (ackMessage.isDefined) { if (!ackMessage.get.isInstanceOf[BufferMessage]) { - logDebug("Response to " + bufferMessage + " is not a buffer message, it is of type " + ackMessage.get.getClass()) + logDebug("Response to " + bufferMessage + " is not a buffer message, it is of type " + + ackMessage.get.getClass()) } else if (!ackMessage.get.asInstanceOf[BufferMessage].hasAckId) { logDebug("Response to " + bufferMessage + " does not have ack id set") ackMessage.get.asInstanceOf[BufferMessage].ackId = bufferMessage.id @@ -535,14 +547,16 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi private def sendMessage(connectionManagerId: ConnectionManagerId, message: Message) { def startNewConnection(): SendingConnection = { - val inetSocketAddress = new InetSocketAddress(connectionManagerId.host, connectionManagerId.port) + val inetSocketAddress = new InetSocketAddress(connectionManagerId.host, + connectionManagerId.port) val newConnection = new SendingConnection(inetSocketAddress, selector, connectionManagerId) registerRequests.enqueue(newConnection) newConnection } - // I removed the lookupKey stuff as part of merge ... should I re-add it ? We did not find it useful in our test-env ... - // If we do re-add it, we should consistently use it everywhere I guess ? + // I removed the lookupKey stuff as part of merge ... should I re-add it ? We did not find it + // useful in our test-env ... If we do re-add it, we should consistently use it everywhere I + // guess ? 
val connection = connectionsById.getOrElseUpdate(connectionManagerId, startNewConnection()) message.senderAddress = id.toSocketAddress() logDebug("Sending [" + message + "] to [" + connectionManagerId + "]") @@ -558,15 +572,17 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi def sendMessageReliably(connectionManagerId: ConnectionManagerId, message: Message) : Future[Option[Message]] = { val promise = Promise[Option[Message]] - val status = new MessageStatus(message, connectionManagerId, s => promise.success(s.ackMessage)) - messageStatuses.synchronized { + val status = new MessageStatus( + message, connectionManagerId, s => promise.success(s.ackMessage)) + messageStatuses.synchronized { messageStatuses += ((message.id, status)) } sendMessage(connectionManagerId, message) promise.future } - def sendMessageReliablySync(connectionManagerId: ConnectionManagerId, message: Message): Option[Message] = { + def sendMessageReliablySync(connectionManagerId: ConnectionManagerId, + message: Message): Option[Message] = { Await.result(sendMessageReliably(connectionManagerId, message), Duration.Inf) } @@ -656,7 +672,8 @@ private[spark] object ConnectionManager { val tput = mb * 1000.0 / ms println("--------------------------") println("Started at " + startTime + ", finished at " + finishTime) - println("Sent " + count + " messages of size " + size + " in " + ms + " ms (" + tput + " MB/s)") + println("Sent " + count + " messages of size " + size + " in " + ms + " ms " + + "(" + tput + " MB/s)") println("--------------------------") println() } @@ -667,7 +684,8 @@ private[spark] object ConnectionManager { println("--------------------------") val size = 10 * 1024 * 1024 val count = 10 - val buffers = Array.tabulate(count)(i => ByteBuffer.allocate(size * (i + 1)).put(Array.tabulate[Byte](size * (i + 1))(x => x.toByte))) + val buffers = Array.tabulate(count)(i => ByteBuffer.allocate(size * (i + 1)).put( + Array.tabulate[Byte](size * (i + 1))(x => x.toByte))) buffers.foreach(_.flip) val mb = buffers.map(_.remaining).reduceLeft(_ + _) / 1024.0 / 1024.0 diff --git a/core/src/main/scala/org/apache/spark/network/ConnectionManagerTest.scala b/core/src/main/scala/org/apache/spark/network/ConnectionManagerTest.scala index 4f5742d29b367..820045aa21813 100644 --- a/core/src/main/scala/org/apache/spark/network/ConnectionManagerTest.scala +++ b/core/src/main/scala/org/apache/spark/network/ConnectionManagerTest.scala @@ -30,14 +30,14 @@ import scala.concurrent.duration._ private[spark] object ConnectionManagerTest extends Logging{ def main(args: Array[String]) { - // - the master URL - // - a list slaves to run connectionTest on - //[num of tasks] - the number of parallel tasks to be initiated default is number of slave hosts - //[size of msg in MB (integer)] - the size of messages to be sent in each task, default is 10 - //[count] - how many times to run, default is 3 - //[await time in seconds] : await time (in seconds), default is 600 + // - the master URL - a list slaves to run connectionTest on + // [num of tasks] - the number of parallel tasks to be initiated default is number of slave + // hosts [size of msg in MB (integer)] - the size of messages to be sent in each task, + // default is 10 [count] - how many times to run, default is 3 [await time in seconds] : + // await time (in seconds), default is 600 if (args.length < 2) { - println("Usage: ConnectionManagerTest [num of tasks] [size of msg in MB (integer)] [count] [await time in seconds)] ") + println("Usage: 
ConnectionManagerTest [num of tasks] " + + "[size of msg in MB (integer)] [count] [await time in seconds)] ") System.exit(1) } @@ -56,7 +56,8 @@ private[spark] object ConnectionManagerTest extends Logging{ val size = ( if (args.length > 3) (args(3).toInt) else 10 ) * 1024 * 1024 val count = if (args.length > 4) args(4).toInt else 3 val awaitTime = (if (args.length > 5) args(5).toInt else 600 ).second - println("Running "+count+" rounds of test: " + "parallel tasks = " + tasknum + ", msg size = " + size/1024/1024 + " MB, awaitTime = " + awaitTime) + println("Running " + count + " rounds of test: " + "parallel tasks = " + tasknum + ", " + + "msg size = " + size/1024/1024 + " MB, awaitTime = " + awaitTime) val slaveConnManagerIds = sc.parallelize(0 until tasknum, tasknum).map( i => SparkEnv.get.connectionManager.id).collect() println("\nSlave ConnectionManagerIds") @@ -76,7 +77,8 @@ private[spark] object ConnectionManagerTest extends Logging{ buffer.flip val startTime = System.currentTimeMillis - val futures = slaveConnManagerIds.filter(_ != thisConnManagerId).map(slaveConnManagerId => { + val futures = slaveConnManagerIds.filter(_ != thisConnManagerId).map(slaveConnManagerId => + { val bufferMessage = Message.createBufferMessage(buffer.duplicate) logInfo("Sending [" + bufferMessage + "] to [" + slaveConnManagerId + "]") connManager.sendMessageReliably(slaveConnManagerId, bufferMessage) @@ -87,7 +89,8 @@ private[spark] object ConnectionManagerTest extends Logging{ val mb = size * results.size / 1024.0 / 1024.0 val ms = finishTime - startTime - val resultStr = thisConnManagerId + " Sent " + mb + " MB in " + ms + " ms at " + (mb / ms * 1000.0) + " MB/s" + val resultStr = thisConnManagerId + " Sent " + mb + " MB in " + ms + " ms at " + (mb / ms * + 1000.0) + " MB/s" logInfo(resultStr) resultStr }).collect() diff --git a/core/src/main/scala/org/apache/spark/network/SenderTest.scala b/core/src/main/scala/org/apache/spark/network/SenderTest.scala index dcbd183c88d09..9e03956ba0df9 100644 --- a/core/src/main/scala/org/apache/spark/network/SenderTest.scala +++ b/core/src/main/scala/org/apache/spark/network/SenderTest.scala @@ -52,17 +52,20 @@ private[spark] object SenderTest { val dataMessage = Message.createBufferMessage(buffer.duplicate) val startTime = System.currentTimeMillis /*println("Started timer at " + startTime)*/ - val responseStr = manager.sendMessageReliablySync(targetConnectionManagerId, dataMessage) match { - case Some(response) => - val buffer = response.asInstanceOf[BufferMessage].buffers(0) - new String(buffer.array) - case None => "none" - } + val responseStr = + manager.sendMessageReliablySync(targetConnectionManagerId, dataMessage) match { + case Some(response) => + val buffer = response.asInstanceOf[BufferMessage].buffers(0) + new String(buffer.array) + case None => "none" + } val finishTime = System.currentTimeMillis val mb = size / 1024.0 / 1024.0 val ms = finishTime - startTime - /*val resultStr = "Sent " + mb + " MB " + targetServer + " in " + ms + " ms at " + (mb / ms * 1000.0) + " MB/s"*/ - val resultStr = "Sent " + mb + " MB " + targetServer + " in " + ms + " ms (" + (mb / ms * 1000.0).toInt + "MB/s) | Response = " + responseStr + // val resultStr = "Sent " + mb + " MB " + targetServer + " in " + ms + " ms at " + (mb / ms + // * 1000.0) + " MB/s" + val resultStr = "Sent " + mb + " MB " + targetServer + " in " + ms + " ms (" + (mb / ms * + 1000.0).toInt + "MB/s) | Response = " + responseStr println(resultStr) }) } diff --git 
a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala index 30e578dd93e8d..8f9d1d5a84c36 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala @@ -60,7 +60,8 @@ class CheckpointRDD[T: ClassTag](sc: SparkContext, val checkpointPath: String) checkpointData.get.cpFile = Some(checkpointPath) override def getPreferredLocations(split: Partition): Seq[String] = { - val status = fs.getFileStatus(new Path(checkpointPath, CheckpointRDD.splitIdToFile(split.index))) + val status = fs.getFileStatus(new Path(checkpointPath, + CheckpointRDD.splitIdToFile(split.index))) val locations = fs.getFileBlockLocations(status, 0, status.getLen) locations.headOption.toList.flatMap(_.getHosts).filter(_ != "localhost") } diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index cefcc3d2d9420..42e1ef8375284 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -197,8 +197,9 @@ private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanc // return the next preferredLocation of some partition of the RDD def next(): (String, Partition) = { - if (it.hasNext) + if (it.hasNext) { it.next() + } else { it = resetIterator() // ran out of preferred locations, reset and rotate to the beginning it.next() @@ -290,8 +291,10 @@ private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanc val r1 = rnd.nextInt(groupArr.size) val r2 = rnd.nextInt(groupArr.size) val minPowerOfTwo = if (groupArr(r1).size < groupArr(r2).size) groupArr(r1) else groupArr(r2) - if (prefPart== None) // if no preferred locations, just use basic power of two - return minPowerOfTwo + if (prefPart == None) { + // if no preferred locations, just use basic power of two + return minPowerOfTwo + } val prefPartActual = prefPart.get diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index 688c310ee9caf..20713b4249b5e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -37,8 +37,8 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { } /** - * Return a [[org.apache.spark.util.StatCounter]] object that captures the mean, variance and count - * of the RDD's elements in one operation. + * Return a [[org.apache.spark.util.StatCounter]] object that captures the mean, variance and + * count of the RDD's elements in one operation. 
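A hedged usage sketch of the single-pass statistics described in this comment, assuming a local SparkContext: stats() builds one StatCounter per partition and merges them, so mean, variance and count come out of a single job.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._          // brings DoubleRDDFunctions in implicitly
import org.apache.spark.util.StatCounter

object StatsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("stats-sketch"))
    val data = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    val s: StatCounter = data.stats()           // mean, variance and count in one pass
    println("count=" + s.count + " mean=" + s.mean + " variance=" + s.variance)
    sc.stop()
  }
}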
*/ def stats(): StatCounter = { self.mapPartitions(nums => Iterator(StatCounter(nums))).reduce((a, b) => a.merge(b)) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 370061492da91..10d519e6971f1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -705,7 +705,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) } logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " + - valueClass.getSimpleName+ ")") + valueClass.getSimpleName + ")") val writer = new SparkHadoopWriter(conf) writer.preSetup() diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index 09d0a8189d25c..56c7777600a6a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -39,7 +39,8 @@ private[spark] class ParallelCollectionPartition[T: ClassTag]( override def hashCode(): Int = (41 * (41 + rddId) + slice).toInt override def equals(other: Any): Boolean = other match { - case that: ParallelCollectionPartition[_] => (this.rddId == that.rddId && this.slice == that.slice) + case that: ParallelCollectionPartition[_] => (this.rddId == that.rddId && + this.slice == that.slice) case _ => false } diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala index 4c625d062eb9b..f4364329a3a71 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala @@ -23,8 +23,8 @@ import org.apache.spark.{TaskContext, OneToOneDependency, SparkContext, Partitio /** - * Class representing partitions of PartitionerAwareUnionRDD, which maintains the list of corresponding partitions - * of parent RDDs. + * Class representing partitions of PartitionerAwareUnionRDD, which maintains the list of + * corresponding partitions of parent RDDs. */ private[spark] class PartitionerAwareUnionRDDPartition( diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala index bc688110f4736..73e8769c0981d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala @@ -35,10 +35,10 @@ private[spark] object CheckpointState extends Enumeration { } /** - * This class contains all the information related to RDD checkpointing. Each instance of this class - * is associated with a RDD. It manages process of checkpointing of the associated RDD, as well as, - * manages the post-checkpoint state by providing the updated partitions, iterator and preferred locations - * of the checkpointed RDD. + * This class contains all the information related to RDD checkpointing. Each instance of this + * class is associated with a RDD. It manages process of checkpointing of the associated RDD, + * as well as, manages the post-checkpoint state by providing the updated partitions, + * iterator and preferred locations of the checkpointed RDD. 
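A hedged usage sketch of the checkpointing flow that RDDCheckpointData manages internally, assuming a local SparkContext; the checkpoint directory below is an illustrative scratch path, not one taken from this patch.

import org.apache.spark.{SparkConf, SparkContext}

object CheckpointSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("checkpoint-sketch"))
    sc.setCheckpointDir("/tmp/spark-checkpoints")   // illustrative scratch path
    val rdd = sc.parallelize(1 to 1000).map(_ * 2)
    rdd.checkpoint()                                // only marks the RDD; nothing is written yet
    rdd.count()                                     // first action triggers the checkpoint write
    println(rdd.isCheckpointed)                     // true once the data has been materialized
    sc.stop()
  }
}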
*/ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T]) extends Logging with Serializable { @@ -97,7 +97,7 @@ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T]) val newRDD = new CheckpointRDD[T](rdd.context, path.toString) if (newRDD.partitions.size != rdd.partitions.size) { throw new SparkException( - "Checkpoint RDD " + newRDD + "("+ newRDD.partitions.size + ") has different " + + "Checkpoint RDD " + newRDD + "(" + newRDD.partitions.size + ") has different " + "number of partitions than original RDD " + rdd + "(" + rdd.partitions.size + ")") } diff --git a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala index 2d1bd5b4813c9..c9b4c768a98b4 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala @@ -71,7 +71,8 @@ class SequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable : ClassTag val convertKey = !classOf[Writable].isAssignableFrom(self.getKeyClass) val convertValue = !classOf[Writable].isAssignableFrom(self.getValueClass) - logInfo("Saving as sequence file of type (" + keyClass.getSimpleName + "," + valueClass.getSimpleName + ")" ) + logInfo("Saving as sequence file of type (" + keyClass.getSimpleName + "," + + valueClass.getSimpleName + ")" ) val format = classOf[SequenceFileOutputFormat[Writable, Writable]] val jobConf = new JobConf(self.context.hadoopConfiguration) if (!convertKey && !convertValue) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 21d16fabefaa5..80211541a6a63 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1082,8 +1082,9 @@ class DAGScheduler( case n: NarrowDependency[_] => for (inPart <- n.getParents(partition)) { val locs = getPreferredLocs(n.rdd, inPart) - if (locs != Nil) + if (locs != Nil) { return locs + } } case _ => } diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index cc10cc0849bc7..23447f1bbf852 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -33,7 +33,7 @@ import scala.collection.JavaConversions._ * Parses and holds information about inputFormat (and files) specified as a parameter. */ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Class[_], - val path: String) extends Logging { + val path: String) extends Logging { var mapreduceInputFormat: Boolean = false var mapredInputFormat: Boolean = false @@ -41,7 +41,8 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl validate() override def toString: String = { - "InputFormatInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", path : " + path + "InputFormatInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", " + + "path : " + path } override def hashCode(): Int = { @@ -50,8 +51,8 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl hashCode } - // Since we are not doing canonicalization of path, this can be wrong : like relative vs absolute path - // .. 
which is fine, this is best case effort to remove duplicates - right ? + // Since we are not doing canonicalization of path, this can be wrong : like relative vs + // absolute path .. which is fine, this is best case effort to remove duplicates - right ? override def equals(other: Any): Boolean = other match { case that: InputFormatInfo => { // not checking config - that should be fine, right ? @@ -65,22 +66,26 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl logDebug("validate InputFormatInfo : " + inputFormatClazz + ", path " + path) try { - if (classOf[org.apache.hadoop.mapreduce.InputFormat[_, _]].isAssignableFrom(inputFormatClazz)) { + if (classOf[org.apache.hadoop.mapreduce.InputFormat[_, _]].isAssignableFrom( + inputFormatClazz)) { logDebug("inputformat is from mapreduce package") mapreduceInputFormat = true } - else if (classOf[org.apache.hadoop.mapred.InputFormat[_, _]].isAssignableFrom(inputFormatClazz)) { + else if (classOf[org.apache.hadoop.mapred.InputFormat[_, _]].isAssignableFrom( + inputFormatClazz)) { logDebug("inputformat is from mapred package") mapredInputFormat = true } else { throw new IllegalArgumentException("Specified inputformat " + inputFormatClazz + - " is NOT a supported input format ? does not implement either of the supported hadoop api's") + " is NOT a supported input format ? does not implement either of the supported hadoop " + + "api's") } } catch { case e: ClassNotFoundException => { - throw new IllegalArgumentException("Specified inputformat " + inputFormatClazz + " cannot be found ?", e) + throw new IllegalArgumentException("Specified inputformat " + inputFormatClazz + + " cannot be found ?", e) } } } @@ -125,8 +130,8 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl } private def findPreferredLocations(): Set[SplitInfo] = { - logDebug("mapreduceInputFormat : " + mapreduceInputFormat + ", mapredInputFormat : " + mapredInputFormat + - ", inputFormatClazz : " + inputFormatClazz) + logDebug("mapreduceInputFormat : " + mapreduceInputFormat + ", mapredInputFormat : " + + mapredInputFormat + ", inputFormatClazz : " + inputFormatClazz) if (mapreduceInputFormat) { prefLocsFromMapreduceInputFormat() } @@ -150,8 +155,8 @@ object InputFormatInfo { c) Compute rack info for each host and update rack -> count map based on (b). d) Allocate nodes based on (c) e) On the allocation result, ensure that we dont allocate "too many" jobs on a single node - (even if data locality on that is very high) : this is to prevent fragility of job if a single - (or small set of) hosts go down. + (even if data locality on that is very high) : this is to prevent fragility of job if a + single (or small set of) hosts go down. go to (a) until required nodes are allocated. @@ -159,7 +164,8 @@ object InputFormatInfo { PS: I know the wording here is weird, hopefully it makes some sense ! 
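A hedged, rack-free sketch of the allocation loop outlined in steps (a)-(e) above: greedily pick the host holding the most not-yet-covered splits, while capping how often any single host may be chosen. The helper names and the per-host cap are assumptions for illustration, not code from this patch.

import scala.collection.mutable

object PreferredHostSketch {
  def allocate(splitsPerHost: Map[String, Int], nodesNeeded: Int, perHostCap: Int): Seq[String] = {
    val remaining = mutable.Map(splitsPerHost.toSeq: _*)
    val timesChosen = mutable.Map.empty[String, Int].withDefaultValue(0)
    val chosen = mutable.ArrayBuffer.empty[String]
    var exhausted = false
    while (!exhausted && chosen.size < nodesNeeded) {
      // only hosts that still have uncovered splits and have not hit the per-host cap
      val eligible = remaining.filter { case (h, c) => c > 0 && timesChosen(h) < perHostCap }
      if (eligible.isEmpty) {
        exhausted = true
      } else {
        val (host, count) = eligible.maxBy(_._2)
        chosen += host
        timesChosen(host) += 1
        remaining(host) = count - 1               // step (b): decrement and re-evaluate next round
      }
    }
    chosen
  }

  def main(args: Array[String]): Unit = {
    println(allocate(Map("h1" -> 5, "h2" -> 2, "h3" -> 1), nodesNeeded = 4, perHostCap = 2))
  }
}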
*/ - def computePreferredLocations(formats: Seq[InputFormatInfo]): HashMap[String, HashSet[SplitInfo]] = { + def computePreferredLocations(formats: Seq[InputFormatInfo]): HashMap[String, HashSet[SplitInfo]] + = { val nodeToSplit = new HashMap[String, HashSet[SplitInfo]] for (inputSplit <- formats) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala index f8fa5a9f7a590..b909b66a5de76 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala @@ -45,10 +45,11 @@ class JobLogger(val user: String, val logDirName: String) String.valueOf(System.currentTimeMillis())) private val logDir = - if (System.getenv("SPARK_LOG_DIR") != null) + if (System.getenv("SPARK_LOG_DIR") != null) { System.getenv("SPARK_LOG_DIR") - else + } else { "/tmp/spark-%s".format(user) + } private val jobIDToPrintWriter = new HashMap[Int, PrintWriter] private val stageIDToJobID = new HashMap[Int, Int] @@ -116,7 +117,7 @@ class JobLogger(val user: String, val logDirName: String) var writeInfo = info if (withTime) { val date = new Date(System.currentTimeMillis()) - writeInfo = DATE_FORMAT.format(date) + ": " +info + writeInfo = DATE_FORMAT.format(date) + ": " + info } jobIDToPrintWriter.get(jobID).foreach(_.println(writeInfo)) } @@ -235,7 +236,8 @@ class JobLogger(val user: String, val logDirName: String) * @param stage Root stage of the job * @param indent Indent number before info, default is 0 */ - protected def recordStageDepGraph(jobID: Int, stage: Stage, idSet: HashSet[Int], indent: Int = 0) { + protected def recordStageDepGraph(jobID: Int, stage: Stage, idSet: HashSet[Int], indent: Int = 0) + { val stageInfo = if (stage.isShuffleMap) { "STAGE_ID=" + stage.id + " MAP_STAGE SHUFFLE_ID=" + stage.shuffleDep.get.shuffleId } else { diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobResult.scala b/core/src/main/scala/org/apache/spark/scheduler/JobResult.scala index c381348a8d424..d94f6ad924260 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobResult.scala @@ -23,4 +23,5 @@ package org.apache.spark.scheduler private[spark] sealed trait JobResult private[spark] case object JobSucceeded extends JobResult -private[spark] case class JobFailed(exception: Exception, failedStage: Option[Stage]) extends JobResult +private[spark] case class JobFailed(exception: Exception, failedStage: Option[Stage]) + extends JobResult diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index 28f3ba53b8425..0544f81f1ce86 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -36,7 +36,8 @@ private[spark] object ResultTask { val metadataCleaner = new MetadataCleaner( MetadataCleanerType.RESULT_TASK, serializedInfoCache.clearOldValues, new SparkConf) - def serializeInfo(stageId: Int, rdd: RDD[_], func: (TaskContext, Iterator[_]) => _): Array[Byte] = { + def serializeInfo(stageId: Int, rdd: RDD[_], func: (TaskContext, Iterator[_]) => _) + : Array[Byte] = { synchronized { val old = serializedInfoCache.get(stageId).orNull if (old != null) { @@ -55,7 +56,8 @@ private[spark] object ResultTask { } } - def deserializeInfo(stageId: Int, bytes: Array[Byte]): (RDD[_], (TaskContext, Iterator[_]) => _) = { + def 
deserializeInfo(stageId: Int, bytes: Array[Byte]) + : (RDD[_], (TaskContext, Iterator[_]) => _) = { val loader = Thread.currentThread.getContextClassLoader val in = new GZIPInputStream(new ByteArrayInputStream(bytes)) val ser = SparkEnv.get.closureSerializer.newInstance() diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala index 3cf995ea74244..a546193d5b49a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala @@ -148,6 +148,6 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) } } parentPool.addSchedulable(manager) - logInfo("Added task set " + manager.name + " tasks to pool "+poolName) + logInfo("Added task set " + manager.name + " tasks to pool " + poolName) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala index d8e97c3b7c7b0..d25f0a63547e6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala @@ -37,8 +37,8 @@ case class SparkListenerTaskGettingResult( case class SparkListenerTaskEnd(task: Task[_], reason: TaskEndReason, taskInfo: TaskInfo, taskMetrics: TaskMetrics) extends SparkListenerEvents -case class SparkListenerJobStart(job: ActiveJob, stageIds: Array[Int], properties: Properties = null) - extends SparkListenerEvents +case class SparkListenerJobStart(job: ActiveJob, stageIds: Array[Int], + properties: Properties = null) extends SparkListenerEvents case class SparkListenerJobEnd(job: ActiveJob, jobResult: JobResult) extends SparkListenerEvents @@ -99,11 +99,14 @@ class StatsReportListener extends SparkListener with Logging { showMillisDistribution("task runtime:", (info, _) => Some(info.duration)) //shuffle write - showBytesDistribution("shuffle bytes written:",(_,metric) => metric.shuffleWriteMetrics.map{_.shuffleBytesWritten}) + showBytesDistribution("shuffle bytes written:", + (_,metric) => metric.shuffleWriteMetrics.map{_.shuffleBytesWritten}) //fetch & io - showMillisDistribution("fetch wait time:",(_, metric) => metric.shuffleReadMetrics.map{_.fetchWaitTime}) - showBytesDistribution("remote bytes read:", (_, metric) => metric.shuffleReadMetrics.map{_.remoteBytesRead}) + showMillisDistribution("fetch wait time:", + (_, metric) => metric.shuffleReadMetrics.map{_.fetchWaitTime}) + showBytesDistribution("remote bytes read:", + (_, metric) => metric.shuffleReadMetrics.map{_.remoteBytesRead}) showBytesDistribution("task result size:", (_, metric) => Some(metric.resultSize)) //runtime breakdown @@ -111,8 +114,10 @@ class StatsReportListener extends SparkListener with Logging { val runtimePcts = stageCompleted.stage.taskInfos.map{ case (info, metrics) => RuntimePercentage(info.duration, metrics) } - showDistribution("executor (non-fetch) time pct: ", Distribution(runtimePcts.map{_.executorPct * 100}), "%2.0f %%") - showDistribution("fetch wait time pct: ", Distribution(runtimePcts.flatMap{_.fetchPct.map{_ * 100}}), "%2.0f %%") + showDistribution("executor (non-fetch) time pct: ", + Distribution(runtimePcts.map{_.executorPct * 100}), "%2.0f %%") + showDistribution("fetch wait time pct: ", + Distribution(runtimePcts.flatMap{_.fetchPct.map{_ * 100}}), "%2.0f %%") showDistribution("other time pct: ", Distribution(runtimePcts.map{_.other * 
100}), "%2.0f %%") } @@ -147,7 +152,8 @@ private[spark] object StatsReportListener extends Logging { logInfo("\t" + quantiles.mkString("\t")) } - def showDistribution(heading: String, dOpt: Option[Distribution], formatNumber: Double => String) { + def showDistribution(heading: String, + dOpt: Option[Distribution], formatNumber: Double => String) { dOpt.foreach { d => showDistribution(heading, d, formatNumber)} } @@ -156,7 +162,8 @@ private[spark] object StatsReportListener extends Logging { showDistribution(heading, dOpt, f _) } - def showDistribution(heading:String, format: String, getMetric: (TaskInfo,TaskMetrics) => Option[Double]) + def showDistribution(heading:String, format: String, + getMetric: (TaskInfo,TaskMetrics) => Option[Double]) (implicit stage: SparkListenerStageCompleted) { showDistribution(heading, extractDoubleDistribution(stage, getMetric), format) } @@ -175,7 +182,8 @@ private[spark] object StatsReportListener extends Logging { } def showMillisDistribution(heading: String, dOpt: Option[Distribution]) { - showDistribution(heading, dOpt, (d => StatsReportListener.millisToString(d.toLong)): Double => String) + showDistribution(heading, dOpt, + (d => StatsReportListener.millisToString(d.toLong)): Double => String) } def showMillisDistribution(heading: String, getMetric: (TaskInfo, TaskMetrics) => Option[Long]) @@ -212,7 +220,7 @@ private object RuntimePercentage { val denom = totalTime.toDouble val fetchTime = metrics.shuffleReadMetrics.map{_.fetchWaitTime} val fetch = fetchTime.map{_ / denom} - val exec = (metrics.executorRunTime - fetchTime.getOrElse(0l)) / denom + val exec = (metrics.executorRunTime - fetchTime.getOrElse(0L)) / denom val other = 1.0 - (exec + fetch.getOrElse(0d)) RuntimePercentage(exec, fetch, other) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index 520c0b29e3536..a78b0186b9eab 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -63,8 +63,9 @@ private[spark] class Stage( def addOutputLoc(partition: Int, status: MapStatus) { val prevList = outputLocs(partition) outputLocs(partition) = status :: prevList - if (prevList == Nil) + if (prevList == Nil) { numAvailableOutputs += 1 + } } def removeOutputLoc(partition: Int, bmAddress: BlockManagerId) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index c4d1ad5733b4c..8f320e5c7a74b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -29,7 +29,8 @@ import org.apache.spark.executor.TaskMetrics */ class StageInfo( stage: Stage, - val taskInfos: mutable.Buffer[(TaskInfo, TaskMetrics)] = mutable.Buffer[(TaskInfo, TaskMetrics)]() + val taskInfos: mutable.Buffer[(TaskInfo, TaskMetrics)] = + mutable.Buffer[(TaskInfo, TaskMetrics)]() ) { val stageId = stage.id /** When this stage was submitted from the DAGScheduler to a TaskScheduler. 
*/ diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala index 3c22edd5248f4..91c27d7b8e9d7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala @@ -70,16 +70,17 @@ class TaskInfo( def running: Boolean = !finished def status: String = { - if (running) + if (running) { "RUNNING" - else if (gettingResult) + } else if (gettingResult) { "GET RESULT" - else if (failed) + } else if (failed) { "FAILED" - else if (successful) + } else if (successful) { "SUCCESS" - else + } else { "UNKNOWN" + } } def duration: Long = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala index 9d3e6158266b8..5724ec9d1b4d7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala @@ -35,7 +35,8 @@ case class IndirectTaskResult[T](blockId: BlockId) extends TaskResult[T] with Se /** A TaskResult that contains the task's return value and accumulator updates. */ private[spark] -class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long, Any], var metrics: TaskMetrics) +class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long, Any], + var metrics: TaskMetrics) extends TaskResult[T] with Externalizable { def this() = this(null.asInstanceOf[ByteBuffer], null, null) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index 17b6d97e90e0a..1cdfed1d7005e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -36,7 +36,8 @@ private[spark] trait TaskScheduler { def start(): Unit // Invoked after system has successfully initialized (typically in spark context). - // Yarn uses this to bootstrap allocation of resources based on preferred locations, wait for slave registerations, etc. + // Yarn uses this to bootstrap allocation of resources based on preferred locations, + // wait for slave registerations, etc. def postStartHook() { } // Disconnect from the cluster. diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 3f0ee7a6d48cb..21b2ff1682b78 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -80,7 +80,7 @@ private[spark] class TaskSetManager( var minShare = 0 var priority = taskSet.priority var stageId = taskSet.stageId - var name = "TaskSet_"+taskSet.stageId.toString + var name = "TaskSet_" + taskSet.stageId.toString var parent: Pool = null val runningTasksSet = new HashSet[Long] diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 0208388e86680..78204103a9bbd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -120,7 +120,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A sender ! 
true case DisassociatedEvent(_, address, _) => - addressToExecutorId.get(address).foreach(removeExecutor(_, "remote Akka client disassociated")) + addressToExecutorId.get(address).foreach(removeExecutor(_, + "remote Akka client disassociated")) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 33aac52051bfc..04f35cca08262 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -51,8 +51,8 @@ private[spark] class SparkDeploySchedulerBackend( val command = Command( "org.apache.spark.executor.CoarseGrainedExecutorBackend", args, sc.executorEnvs) val sparkHome = sc.getSparkHome() - val appDesc = new ApplicationDescription(appName, maxCores, sc.executorMemory, command, sparkHome, - "http://" + sc.ui.appUIAddress) + val appDesc = new ApplicationDescription(appName, maxCores, sc.executorMemory, command, + sparkHome, "http://" + sc.ui.appUIAddress) client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf) client.start() @@ -84,7 +84,8 @@ private[spark] class SparkDeploySchedulerBackend( } } - override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, memory: Int) { + override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, + memory: Int) { logInfo("Granted executor ID %s on hostPort %s with %d cores, %s RAM".format( fullId, hostPort, cores, Utils.megabytesToString(memory))) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index c27049bdb5208..4401f6df47421 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -136,7 +136,8 @@ private[spark] class CoarseMesosSchedulerBackend( // glob the directory "correctly". 
val basename = uri.split('/').last.split('.').head command.setValue( - "cd %s*; ./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend %s %s %s %d" + ("cd %s*; " + + "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend %s %s %s %d") .format(basename, driverUrl, offer.getSlaveId.getValue, offer.getHostname, numCores)) command.addUris(CommandInfo.URI.newBuilder().setValue(uri)) } diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index c14cd47556987..2d0b25538505c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -60,7 +60,8 @@ class KryoSerializer(conf: SparkConf) extends org.apache.spark.serializer.Serial try { for (regCls <- conf.getOption("spark.kryo.registrator")) { logDebug("Running user registrator: " + regCls) - val reg = Class.forName(regCls, true, classLoader).newInstance().asInstanceOf[KryoRegistrator] + val reg = Class.forName(regCls, true, classLoader).newInstance() + .asInstanceOf[KryoRegistrator] reg.registerClasses(kryo) } } catch { diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala index 9a5e3cb77e1d5..a38a2b59dbc23 100644 --- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala @@ -27,11 +27,12 @@ import org.apache.spark.util.{NextIterator, ByteBufferInputStream} /** * A serializer. Because some serialization libraries are not thread safe, this class is used to - * create [[org.apache.spark.serializer.SerializerInstance]] objects that do the actual serialization and are - * guaranteed to only be called from one thread at a time. + * create [[org.apache.spark.serializer.SerializerInstance]] objects that do the actual + * serialization and are guaranteed to only be called from one thread at a time. * * Implementations of this trait should have a zero-arg constructor or a constructor that accepts a - * [[org.apache.spark.SparkConf]] as parameter. If both constructors are defined, the latter takes precedence. + * [[org.apache.spark.SparkConf]] as parameter. If both constructors are defined, the latter takes + * precedence. 
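A hedged sketch of the spark.kryo.registrator mechanism referenced in the KryoSerializer hunk above: a user-supplied KryoRegistrator is loaded reflectively and given a chance to register classes. The Point class and MyRegistrator below are illustrative, not part of this patch.

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoRegistrator

case class Point(x: Double, y: Double)

class MyRegistrator extends KryoRegistrator {
  // Called by KryoSerializer after it instantiates this class via reflection.
  override def registerClasses(kryo: Kryo) {
    kryo.register(classOf[Point])
  }
}

object KryoConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", classOf[MyRegistrator].getName)
    println(conf.get("spark.kryo.registrator"))
  }
}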
*/ trait Serializer { def newInstance(): SerializerInstance diff --git a/core/src/main/scala/org/apache/spark/storage/BlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/BlockFetcherIterator.scala index 4fa2ab96d9725..aa62ab5aba1c2 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockFetcherIterator.scala @@ -76,9 +76,9 @@ object BlockFetcherIterator { import blockManager._ - private var _remoteBytesRead = 0l - private var _remoteFetchTime = 0l - private var _fetchWaitTime = 0l + private var _remoteBytesRead = 0L + private var _remoteFetchTime = 0L + private var _fetchWaitTime = 0L if (blocksByAddress == null) { throw new IllegalArgumentException("BlocksByAddress is null") diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index ed53558566edf..542deb98c1304 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -206,8 +206,9 @@ private[spark] class BlockManager( * message reflecting the current status, *not* the desired storage level in its block info. * For example, a block with MEMORY_AND_DISK set might have fallen out to be only on disk. * - * droppedMemorySize exists to account for when block is dropped from memory to disk (so it is still valid). - * This ensures that update in master will compensate for the increase in memory on slave. + * droppedMemorySize exists to account for when block is dropped from memory to disk (so it + * is still valid). This ensures that update in master will compensate for the increase in + * memory on slave. */ def reportBlockStatus(blockId: BlockId, info: BlockInfo, droppedMemorySize: Long = 0L) { val needReregister = !tryToReportBlockStatus(blockId, info, droppedMemorySize) @@ -224,7 +225,8 @@ private[spark] class BlockManager( * which will be true if the block was successfully recorded and false if * the slave needs to re-register. 
*/ - private def tryToReportBlockStatus(blockId: BlockId, info: BlockInfo, droppedMemorySize: Long = 0L): Boolean = { + private def tryToReportBlockStatus(blockId: BlockId, info: BlockInfo, + droppedMemorySize: Long = 0L): Boolean = { val (curLevel, inMemSize, onDiskSize, tellMaster) = info.synchronized { info.level match { case null => @@ -282,14 +284,15 @@ private[spark] class BlockManager( // As an optimization for map output fetches, if the block is for a shuffle, return it // without acquiring a lock; the disk store never deletes (recent) items so this should work if (blockId.isShuffle) { - return diskStore.getBytes(blockId) match { + diskStore.getBytes(blockId) match { case Some(bytes) => Some(bytes) case None => throw new Exception("Block " + blockId + " not found on disk, though it should be") } + } else { + doGetLocal(blockId, asValues = false).asInstanceOf[Option[ByteBuffer]] } - doGetLocal(blockId, asValues = false).asInstanceOf[Option[ByteBuffer]] } private def doGetLocal(blockId: BlockId, asValues: Boolean): Option[Any] = { @@ -701,7 +704,8 @@ private[spark] class BlockManager( diskStore.putBytes(blockId, bytes, level) } } - val droppedMemorySize = if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L + val droppedMemorySize = + if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L val blockWasRemoved = memoryStore.remove(blockId) if (!blockWasRemoved) { logWarning("Block " + blockId + " could not be dropped from memory as it does not exist") diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index 2c1a4e2f5d3a1..893418fb8cad9 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -61,8 +61,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act override def preStart() { if (!BlockManager.getDisableHeartBeatsForTesting(conf)) { import context.dispatcher - timeoutCheckingTask = context.system.scheduler.schedule( - 0.seconds, checkTimeoutInterval.milliseconds, self, ExpireDeadHosts) + timeoutCheckingTask = context.system.scheduler.schedule(0.seconds, + checkTimeoutInterval.milliseconds, self, ExpireDeadHosts) } super.preStart() } @@ -169,8 +169,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act val toRemove = new mutable.HashSet[BlockManagerId] for (info <- blockManagerInfo.values) { if (info.lastSeenMs < minSeenTime) { - logWarning("Removing BlockManager " + info.blockManagerId + " with no recent heart beats: " + - (now - info.lastSeenMs) + "ms exceeds " + slaveTimeout + "ms") + logWarning("Removing BlockManager " + info.blockManagerId + " with no recent heart beats: " + + (now - info.lastSeenMs) + "ms exceeds " + slaveTimeout + "ms") toRemove += info.blockManagerId } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala index 365866d1e3397..7cf754fb204c2 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSource.scala @@ -57,9 +57,9 @@ private[spark] class BlockManagerSource(val blockManager: BlockManager, sc: Spar override def getValue: Long = { val storageStatusList = blockManager.master.getStorageStatus val diskSpaceUsed = storageStatusList - 
.flatMap(_.blocks.values.map(_.diskSize)) - .reduceOption(_ + _) - .getOrElse(0L) + .flatMap(_.blocks.values.map(_.diskSize)) + .reduceOption(_ + _) + .getOrElse(0L) diskSpaceUsed / 1024 / 1024 } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockMessageArray.scala b/core/src/main/scala/org/apache/spark/storage/BlockMessageArray.scala index 59329361f320b..5ded9ab359820 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockMessageArray.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockMessageArray.scala @@ -25,7 +25,8 @@ import org.apache.spark._ import org.apache.spark.network._ private[spark] -class BlockMessageArray(var blockMessages: Seq[BlockMessage]) extends Seq[BlockMessage] with Logging { +class BlockMessageArray(var blockMessages: Seq[BlockMessage]) + extends Seq[BlockMessage] with Logging { def this(bm: BlockMessage) = this(Array(bm)) @@ -65,7 +66,8 @@ class BlockMessageArray(var blockMessages: Seq[BlockMessage]) extends Seq[BlockM buffer.position(buffer.position() + size) } val finishTime = System.currentTimeMillis - logDebug("Converted block message array from buffer message in " + (finishTime - startTime) / 1000.0 + " s") + logDebug("Converted block message array from buffer message in " + + (finishTime - startTime) / 1000.0 + " s") this.blockMessages = newBlockMessages } diff --git a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala index 1720007e4e70b..50a0cdb3095cd 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala @@ -25,15 +25,15 @@ private[spark] case class StorageStatus(blockManagerId: BlockManagerId, maxMem: Long, blocks: Map[BlockId, BlockStatus]) { - def memUsed() = blocks.values.map(_.memSize).reduceOption(_+_).getOrElse(0L) + def memUsed() = blocks.values.map(_.memSize).reduceOption(_ + _).getOrElse(0L) def memUsedByRDD(rddId: Int) = - rddBlocks.filterKeys(_.rddId == rddId).values.map(_.memSize).reduceOption(_+_).getOrElse(0L) + rddBlocks.filterKeys(_.rddId == rddId).values.map(_.memSize).reduceOption(_ + _).getOrElse(0L) - def diskUsed() = blocks.values.map(_.diskSize).reduceOption(_+_).getOrElse(0L) + def diskUsed() = blocks.values.map(_.diskSize).reduceOption(_ + _).getOrElse(0L) def diskUsedByRDD(rddId: Int) = - rddBlocks.filterKeys(_.rddId == rddId).values.map(_.diskSize).reduceOption(_+_).getOrElse(0L) + rddBlocks.filterKeys(_.rddId == rddId).values.map(_.diskSize).reduceOption(_ + _).getOrElse(0L) def memRemaining : Long = maxMem - memUsed() @@ -48,8 +48,9 @@ case class RDDInfo(id: Int, name: String, storageLevel: StorageLevel, extends Ordered[RDDInfo] { override def toString = { import Utils.bytesToString - "RDD \"%s\" (%d) Storage: %s; CachedPartitions: %d; TotalPartitions: %d; MemorySize: %s; DiskSize: %s".format(name, id, - storageLevel.toString, numCachedPartitions, numPartitions, bytesToString(memSize), bytesToString(diskSize)) + ("RDD \"%s\" (%d) Storage: %s; CachedPartitions: %d; TotalPartitions: %d; MemorySize: %s; " + + "DiskSize: %s").format(name, id, storageLevel.toString, numCachedPartitions, + numPartitions, bytesToString(memSize), bytesToString(diskSize)) } override def compare(that: RDDInfo) = { @@ -64,7 +65,8 @@ object StorageUtils { /* Returns RDD-level information, compiled from a list of StorageStatus objects */ def rddInfoFromStorageStatus(storageStatusList: Seq[StorageStatus], sc: SparkContext) : Array[RDDInfo] = { - 
rddInfoFromBlockStatusList(storageStatusList.flatMap(_.rddBlocks).toMap[RDDBlockId, BlockStatus], sc) + rddInfoFromBlockStatusList( + storageStatusList.flatMap(_.rddBlocks).toMap[RDDBlockId, BlockStatus], sc) } /* Returns a map of blocks to their locations, compiled from a list of StorageStatus objects */ @@ -91,7 +93,8 @@ object StorageUtils { sc.persistentRdds.get(rddId).map { r => val rddName = Option(r.name).getOrElse(rddId.toString) val rddStorageLevel = r.getStorageLevel - RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, r.partitions.size, memSize, diskSize) + RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, r.partitions.size, + memSize, diskSize) } }.flatten.toArray diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 5573b3847bcaf..b95c8f43b08f8 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -48,14 +48,16 @@ private[spark] object UIUtils { case _ =>
<li><a href=...>Environment</a></li>
     }
     val executors = page match {
-      case Executors => <li class="active"><a href=...>Executors</a></li>
+      case Executors =>
+        <li class="active"><a href=...>Executors</a>
+        </li>
       case _ => <li><a href=...>Executors</a></li>
     }
     [one further removed/added line pair around the <title>{sc.appName} - {title}</title> element; its markup was not recoverable from the extracted text]
 @@ -63,7 +65,8 @@ private[spark] object UIUtils {