
[SPARK-11439][ML] Optimization of creating sparse feature without dense one #9756

Closed
@@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.{BLAS, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@@ -131,39 +131,36 @@ object LinearDataGenerator {
eps: Double,
sparsity: Double): Seq[LabeledPoint] = {
require(0.0 <= sparsity && sparsity <= 1.0)
val rnd = new Random(seed)
val x = Array.fill[Array[Double]](nPoints)(
Array.fill[Double](weights.length)(rnd.nextDouble()))

val sparseRnd = new Random(seed)
x.foreach { v =>
var i = 0
val len = v.length
while (i < len) {
if (sparseRnd.nextDouble() < sparsity) {
v(i) = 0.0
} else {
v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
}
i += 1
}
}

val y = x.map { xi =>
blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
}

y.zip(x).map { p =>
if (sparsity == 0.0) {
val rnd = new Random(seed)
val rndG = new Random(seed)
if (sparsity <= 0.0) {
Contributor:

sparsity is assumed to be between 0.0 and 1.0. Can we write if (sparsity == 0.0) here to clarify?

(0 until nPoints).map { _ =>
val features = Vectors.dense((0 until weights.length).map { i =>
(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
Member:

Maybe you could refactor this into a small def local to the method to avoid repeating it?

Member Author:

The way the array is built for dense and sparse features is different. I suspect it might be clunkier to try to find something common between them to refactor out into a local def.

Member:

This line is exactly the same though -- I'm talking about a local function.
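A minimal sketch of the local helper being suggested (the name randomFeature is hypothetical, and it assumes rnd, xMean, and xVariance are in scope inside the method, as in the surrounding code):

// Hypothetical local def: draw one uniform value and rescale it to the
// requested mean and variance, matching the repeated expression above.
def randomFeature(i: Int): Double =
  (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)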

}.toArray)
val label = BLAS.dot(Vectors.dense(weights), features) +
intercept + eps * rndG.nextGaussian()
// Return LabeledPoints with DenseVector
LabeledPoint(p._1, Vectors.dense(p._2))
} else {
LabeledPoint(label, features)
}
} else {
val sparseRnd = new Random(seed)
Member:

Why a second Random in this block?

Member Author:

The original code uses two RNGs. The sparse RNG is at line 138 in the original file.

Member:

Yes, but why keep it? It looks like an oversight and is not necessary.

(0 until nPoints).map { _ =>
val (values, indices) = (0 until weights.length).filter { _ =>
sparseRnd.nextDouble() >= sparsity }.map { i =>
Contributor:

sparsity specifies the ratio of non-zero values in the sparse vector. So, to be correct, we can write:

sparseRnd.nextDouble() <= sparsity

((rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i), i)
}.unzip
Member:

I might be over-thinking this, but I wonder if it's significantly more efficient to choose the indices as an array, and then map that to values.
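A rough sketch of that alternative, purely illustrative and assuming the same rnd, sparseRnd, xMean, and xVariance as the surrounding code:

// Choose the kept indices first as an array, then map them to values.
val indices = (0 until weights.length)
  .filter(_ => sparseRnd.nextDouble() >= sparsity).toArray
val values = indices.map { i =>
  (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
}
// Vectors.sparse(weights.length, indices, values) can then be built directly.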

val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
val label = BLAS.dot(Vectors.dense(weights), features) +
intercept + eps * rndG.nextGaussian()
Contributor:

Is there any reason to use rnd and rndG separately for the Gaussian distribution?
If keeping rnd and rndG separate reproduced the same test results, that would be reasonable.
But in this case the number of calls to rnd during vector generation has already changed, so the separation may no longer be necessary. Using only rnd should be sufficient.

Member Author:

I am working on this now. Regenerating results from the tests.
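For reference, a one-line sketch of what using only rnd would look like here, assuming the feature values above were drawn from the same rnd:

val label = BLAS.dot(Vectors.dense(weights), features) +
  intercept + eps * rnd.nextGaussian()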

// Return LabeledPoints with SparseVector
LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
LabeledPoint(label, features)
}
}
}


Contributor:

Please remove the unnecessary empty line.
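For orientation, a hedged usage sketch of the generator changed above; the parameter list is inferred from the hunk header and the argument values are purely illustrative:

import org.apache.spark.mllib.util.LinearDataGenerator

// With sparsity > 0 the new code builds SparseVector features directly
// instead of creating a DenseVector and calling toSparse on it.
val points = LinearDataGenerator.generateLinearInput(
  intercept = 6.3,
  weights = Array(4.7, 7.2),
  xMean = Array(0.9, -1.3),
  xVariance = Array(0.7, 1.2),
  nPoints = 10000,
  seed = 42,
  eps = 0.1,
  sparsity = 0.2)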

/**
* Generate an RDD containing sample data for Linear Regression models - including Ridge, Lasso,
* and unregularized variants.
@@ -63,14 +63,14 @@ class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext

// default = rmse
val evaluator = new RegressionEvaluator()
assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.09849278 absTol 0.001)

// r2 score
evaluator.setMetricName("r2")
assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.9998313 absTol 0.001)

// mae
evaluator.setMetricName("mae")
assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.07893991 absTol 0.001)
}
}
@@ -41,6 +41,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
is the same as the one trained by R's glmnet package. The following instruction
describes how to reproduce the data in R.
In a spark-shell, use the following code:

import org.apache.spark.mllib.util.LinearDataGenerator
val data =
@@ -591,21 +592,44 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
}

/*
Use the following R code to generate model training results.

predictions <- predict(fit, newx=features)
residuals <- label - predictions
> mean(residuals^2) # MSE
[1] 0.009720325
> mean(abs(residuals)) # MAD
[1] 0.07863206
> cor(predictions, label)^2# r^2
[,1]
s0 0.9998749
# Use the following R code to generate model training results.

# path/part-00000 is the file generated by running LinearDataGenerator.generateLinearInput
# as described before the beforeAll() method.
d1 <- read.csv("path/part-00000", header=FALSE, stringsAsFactors=FALSE)
fit <- glm(V1 ~ V2 + V3, data = d1, family = "gaussian")
f1 <- data.frame(as.numeric(d1$V2), as.numeric(d1$V3))
predictions <- predict(fit, newdata=f1)
l1 <- as.numeric(d1$V1)

residuals <- l1 - predictions
> mean(residuals^2) # MSE
[1] 0.009925853
> mean(abs(residuals)) # MAD
[1] 0.07975305
> cor(predictions, l1)^2 # r^2
[1] 0.9998722

> summary(fit)

Call:
glm(formula = V1 ~ V2 + V3, family = "gaussian", data = d1)

Deviance Residuals:
Min 1Q Median 3Q Max
-0.38568 -0.06770 0.00022 0.06724 0.39246

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.2988232 0.0018684 3371 <2e-16 ***
V2 4.7015664 0.0011880 3958 <2e-16 ***
V3 7.1995670 0.0009127 7888 <2e-16 ***
---
....
*/
assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5)
assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5)
assert(model.summary.r2 ~== 0.9998749 relTol 1E-5)
assert(model.summary.meanSquaredError ~== 0.009925853 relTol 1E-5)
assert(model.summary.meanAbsoluteError ~== 0.07975305 relTol 1E-5)
assert(model.summary.r2 ~== 0.9998722 relTol 1E-5)

// Normal solver uses "WeightedLeastSquares". This algorithm does not generate
// objective history because it does not run through iterations.
@@ -620,9 +644,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
// To clarify that the normal solver is used here.
assert(model.summary.objectiveHistory.length == 1)
assert(model.summary.objectiveHistory(0) == 0.0)
val devianceResidualsR = Array(-0.35566, 0.34504)
val seCoefR = Array(0.0011756, 0.0009032, 0.0018489)
val tValsR = Array(3998, 7971, 3407)
val devianceResidualsR = Array(-0.38568, 0.39246)
val seCoefR = Array(0.0011880, 0.0009127, 0.0018684)
val tValsR = Array(3958, 7888, 3371)
val pValsR = Array(0, 0, 0)
model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
assert(x._1 ~== x._2 absTol 1E-5) }