[SPARK-11439][ML] Optimization of creating sparse feature without dense one

Sparse features generated in LinearDataGenerator no longer create dense vectors as an intermediate step. Author: Nakul Jindal <[email protected]> Closes #9756 from nakul02/SPARK-11439_sparse_without_creating_dense_feature.
apache · Dec 8, 2015 · 037b7e7 · 037b7e7
1 parent 7081291
commit 037b7e7
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 122 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Since}
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{BLAS, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
@@ -131,35 +131,27 @@ object LinearDataGenerator {
       eps: Double,
       sparsity: Double): Seq[LabeledPoint] = {
     require(0.0 <= sparsity && sparsity <= 1.0)
-    val rnd = new Random(seed)
-    val x = Array.fill[Array[Double]](nPoints)(
-      Array.fill[Double](weights.length)(rnd.nextDouble()))
-
-    val sparseRnd = new Random(seed)
-    x.foreach { v =>
-      var i = 0
-      val len = v.length
-      while (i < len) {
-        if (sparseRnd.nextDouble() < sparsity) {
-          v(i) = 0.0
-        } else {
-          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
-        }
-        i += 1
-      }
-    }
 
-    val y = x.map { xi =>
-      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
-    }
+    val rnd = new Random(seed)
+    def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}
 
-    y.zip(x).map { p =>
-      if (sparsity == 0.0) {
+    if (sparsity == 0.0) {
+      (0 until nPoints).map { _ =>
+        val features = Vectors.dense(weights.indices.map { rndElement(_) }.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with DenseVector
-        LabeledPoint(p._1, Vectors.dense(p._2))
-      } else {
+        LabeledPoint(label, features)
+      }
+    } else {
+      (0 until nPoints).map { _ =>
+        val indices = weights.indices.filter { _ => rnd.nextDouble() <= sparsity}
+        val values = indices.map { rndElement(_) }
+        val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with SparseVector
-        LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
+        LabeledPoint(label, features)
       }
     }
   }

diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -65,15 +65,15 @@ class RegressionEvaluatorSuite
 
     // default = rmse
     val evaluator = new RegressionEvaluator()
-    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)
 
     // r2 score
     evaluator.setMetricName("r2")
-    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)
 
     // mae
     evaluator.setMetricName("mae")
-    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
   }
 
   test("read/write") {