Skip to content

Commit

Permalink
[SPARK-11439][ML] Optimization of creating sparse feature without dense one
Browse files Browse the repository at this point in the history

Sparse features generated in LinearDataGenerator are no longer created via intermediate dense vectors.

Author: Nakul Jindal <[email protected]>

Closes #9756 from nakul02/SPARK-11439_sparse_without_creating_dense_feature.
  • Loading branch information
Nakul Jindal authored and srowen committed Dec 8, 2015
1 parent 7081291 commit 037b7e7
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 122 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.{BLAS, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

Expand Down Expand Up @@ -131,35 +131,27 @@ object LinearDataGenerator {
eps: Double,
sparsity: Double): Seq[LabeledPoint] = {
require(0.0 <= sparsity && sparsity <= 1.0)
val rnd = new Random(seed)
val x = Array.fill[Array[Double]](nPoints)(
Array.fill[Double](weights.length)(rnd.nextDouble()))

val sparseRnd = new Random(seed)
x.foreach { v =>
var i = 0
val len = v.length
while (i < len) {
if (sparseRnd.nextDouble() < sparsity) {
v(i) = 0.0
} else {
v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
}
i += 1
}
}

val y = x.map { xi =>
blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
}
val rnd = new Random(seed)
def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}

y.zip(x).map { p =>
if (sparsity == 0.0) {
if (sparsity == 0.0) {
(0 until nPoints).map { _ =>
val features = Vectors.dense(weights.indices.map { rndElement(_) }.toArray)
val label = BLAS.dot(Vectors.dense(weights), features) +
intercept + eps * rnd.nextGaussian()
// Return LabeledPoints with DenseVector
LabeledPoint(p._1, Vectors.dense(p._2))
} else {
LabeledPoint(label, features)
}
} else {
(0 until nPoints).map { _ =>
val indices = weights.indices.filter { _ => rnd.nextDouble() <= sparsity}
val values = indices.map { rndElement(_) }
val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
val label = BLAS.dot(Vectors.dense(weights), features) +
intercept + eps * rnd.nextGaussian()
// Return LabeledPoints with SparseVector
LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
LabeledPoint(label, features)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,15 @@ class RegressionEvaluatorSuite

// default = rmse
val evaluator = new RegressionEvaluator()
assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

// r2 score
evaluator.setMetricName("r2")
assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

// mae
evaluator.setMetricName("mae")
assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
}

test("read/write") {
Expand Down
Loading

0 comments on commit 037b7e7

Please sign in to comment.