diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index 49419f7654e67..3515461b52493 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -47,7 +47,7 @@ private[mllib] object EigenValueDecomposition {
    * function.
    */
   private[mllib] def symmetricEigs(
-      mul: DenseVector => DenseVector,
+      mul: BDV[Double] => BDV[Double],
       n: Int,
       k: Int,
       tol: Double,
@@ -102,9 +102,9 @@ private[mllib] object EigenValueDecomposition {
       // multiply working vector with the matrix
       val inputOffset = ipntr(0) - 1
       val outputOffset = ipntr(1) - 1
-      val x = w(inputOffset until inputOffset + n)
-      val y = w(outputOffset until outputOffset + n)
-      y := BDV(mul(Vectors.fromBreeze(x).asInstanceOf[DenseVector]).toArray)
+      val x = w.slice(inputOffset, inputOffset + n)
+      val y = w.slice(outputOffset, outputOffset + n)
+      y := mul(x)
       // call ARPACK's reverse communication
       arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
         workl, workl.length, info)
@@ -143,13 +143,12 @@ private[mllib] object EigenValueDecomposition {
 
     // copy eigenvectors in descending order of eigenvalues
     val sortedU = BDM.zeros[Double](n, computed)
-    sortedEigenPairs.zipWithIndex.foreach { r => {
-      val b = r._2 * n
-      var i = 0
-      while (i < n) {
-        sortedU.data(b + i) = r._1._2(i)
-        i += 1
-      }
+    sortedEigenPairs.zipWithIndex.foreach { r =>
+      val b = r._2 * n
+      var i = 0
+      while (i < n) {
+        sortedU.data(b + i) = r._1._2(i)
+        i += 1
       }
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index cd9d28a4f976c..29c0adc51fe2b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.linalg.distributed
 
-import java.util
+import java.util.Arrays
 
 import breeze.linalg.{Vector => BV, DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV}
 import breeze.linalg.{svd => brzSvd, axpy => brzAxpy}
@@ -207,9 +207,9 @@ class RowMatrix(
    * @param v a local DenseVector whose length must match the number of columns of this matrix.
    * @return a local DenseVector representing the product.
    */
-  private[mllib] def multiplyGramianMatrixBy(v: DenseVector): DenseVector = {
+  private[mllib] def multiplyGramianMatrixBy(v: BDV[Double]): BDV[Double] = {
     val n = numCols().toInt
-    val vbr = rows.context.broadcast(v.toBreeze)
+    val vbr = rows.context.broadcast(v)
 
     val bv = rows.aggregate(BDV.zeros[Double](n))(
       seqOp = (U, r) => {
@@ -227,7 +227,7 @@
       combOp = (U1, U2) => U1 += U2
     )
 
-    Vectors.fromBreeze(bv).asInstanceOf[DenseVector]
+    bv
   }
 
   /**
@@ -250,49 +250,51 @@ class RowMatrix(
   }
 
   /**
-   * Computes singular value decomposition of this matrix. Denote this matrix by A (m x n), this
+   * Computes singular value decomposition of this matrix. Denote this matrix by A (m x n). This
    * will compute matrices U, S, V such that A ~= U * S * V', where S contains the leading k
    * singular values, U and V contain the corresponding singular vectors.
    *
-   * This approach assumes n is smaller than m, and invokes a dense matrix implementation when n is
-   * small (n < 100) or the number of requested singular values is the same as n (k == n). For
-   * problems with large n (n >= 100) and k < n, this approach invokes a sparse matrix
-   * implementation that provides a function to ARPACK to multiply a vector with A'A. It iteratively
-   * calls ARPACK-dsaupd on the master node, from which we recover S and V. Then we compute U via
-   * easy matrix multiplication as U = A * (V * S^{-1}).
+   * At most k largest non-zero singular values and associated vectors are returned. If there are k
+   * such values, then the dimensions of the return will be:
+   *  - U is a RowMatrix of size m x k that satisfies U' * U = eye(k),
+   *  - s is a Vector of size k, holding the singular values in descending order,
+   *  - V is a Matrix of size n x k that satisfies V' * V = eye(k).
    *
-   * The dense implementation requires `n^2` doubles to fit in memory and `O(n^3)` time on the
-   * master node.
-   *
-   * The sparse implementation requires `n * (6 * k + 4)` doubles to fit in memory on the master
-   * node and approximately `O(k * nnz(A))` time distributed on all worker nodes. There is no
-   * restriction on m (number of rows).
+   * We assume n is smaller than m. The singular values and the right singular vectors are derived
+   * from the eigenvalues and the eigenvectors of the Gramian matrix A' * A. U, the matrix
+   * storing the left singular vectors, is computed via matrix multiplication as
+   * U = A * (V * S^{-1}), if requested by the user. The actual method to use is determined
+   * automatically based on the cost:
+   *  - If n is small (n < 100) or k is large compared with n (k > n / 2), we compute the Gramian
+   *    matrix first and then compute its top eigenvalues and eigenvectors locally on the driver.
+   *    This requires a single pass with O(n^2) storage on each executor and on the driver, and
+   *    O(n^2 k) time on the driver.
+   *  - Otherwise, we compute (A' * A) * v in a distributed way and send it to ARPACK's DSAUPD to
+   *    compute (A' * A)'s top eigenvalues and eigenvectors on the driver node. This requires O(k)
+   *    passes, O(n) storage on each executor, and O(n k) storage on the driver.
    *
    * Several internal parameters are set to default values. The reciprocal condition number rCond
    * is set to 1e-9. All singular values smaller than rCond * sigma(0) are treated as zeros, where
    * sigma(0) is the largest singular value. The maximum number of Arnoldi update iterations for
-   * ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK
+   * ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK's
    * eigen-decomposition is set to 1e-10.
    *
-   * At most k largest non-zero singular values and associated vectors are returned.
-   * If there are k such values, then the dimensions of the return will be:
-   *
-   * U is a RowMatrix of size m x k that satisfies U'U = eye(k),
-   * s is a Vector of size k, holding the singular values in descending order,
-   * and V is a Matrix of size n x k that satisfies V'V = eye(k).
+   * @note The conditions that decide which method to use internally and the default parameters are
+   *       subject to change.
    *
-   * @param k number of leading singular values to keep (0 < k <= n). It might return less than
-   *          k if there are numerically zero singular values or there are not enough Ritz values
+   * @param k number of leading singular values to keep (0 < k <= n). It might return less than k if
+   *          there are numerically zero singular values or there are not enough Ritz values
    *          converged before the maximum number of Arnoldi update iterations is reached (in case
    *          that matrix A is ill-conditioned).
-   * @param computeU whether to compute U.
+   * @param computeU whether to compute U
    * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
    *              are treated as zero, where sigma(0) is the largest singular value.
-   * @return SingularValueDecomposition(U, s, V), U = null if computeU = false
+   * @return SingularValueDecomposition(U, s, V). U = null if computeU = false.
    */
-  def computeSVD(k: Int,
-      computeU: Boolean = false,
-      rCond: Double = 1e-9): SingularValueDecomposition[RowMatrix, Matrix] = {
+  def computeSVD(
+      k: Int,
+      computeU: Boolean = false,
+      rCond: Double = 1e-9): SingularValueDecomposition[RowMatrix, Matrix] = {
     // maximum number of Arnoldi update iterations for invoking ARPACK
     val maxIter = math.max(300, k * 3)
     // numerical tolerance for invoking ARPACK
@@ -301,87 +303,78 @@ class RowMatrix(
   }
 
   /**
-   * Actual SVD computation, visible for testing.
+   * The actual SVD implementation, visible for testing.
+   *
+   * @param k number of leading singular values to keep (0 < k <= n)
+   * @param computeU whether to compute U
+   * @param rCond the reciprocal condition number
+   * @param maxIter max number of iterations (if ARPACK is used)
+   * @param tol termination tolerance (if ARPACK is used)
+   * @param mode computation mode (auto: determine automatically which mode to use,
+   *             local-svd: compute the gram matrix and compute its full SVD locally,
+   *             local-eigs: compute the gram matrix and compute its top eigenvalues locally,
+   *             dist-eigs: compute the top eigenvalues of the gram matrix in a distributed way)
+   * @return SingularValueDecomposition(U, s, V). U = null if computeU = false.
    */
-  private[mllib] def computeSVD(k: Int,
-      computeU: Boolean,
-      rCond: Double,
-      maxIter: Int,
-      tol: Double,
-      mode: String): SingularValueDecomposition[RowMatrix, Matrix] = {
+  private[mllib] def computeSVD(
+      k: Int,
+      computeU: Boolean,
+      rCond: Double,
+      maxIter: Int,
+      tol: Double,
+      mode: String): SingularValueDecomposition[RowMatrix, Matrix] = {
     val n = numCols().toInt
+    require(k > 0 && k <= n, s"Request up to n singular values but got k=$k and n=$n.")
 
     object SVDMode extends Enumeration {
-      val DenseARPACK, DenseLAPACK, SparseARPACK = Value
+      val LocalARPACK, LocalLAPACK, DistARPACK = Value
     }
 
-    val derivedMode = mode match {
-      case "auto" => if (n < 100 || k == n) {
-        // invoke dense implementation when n is small or k == n (since ARPACK requires k < n)
-        require(k > 0 && k <= n, s"Request up to n singular values k=$k n=$n.")
-        "dense"
-      } else {
-        // invoke sparse implementation with ARPACK when n is large
-        require(k > 0 && k < n, s"Request up to n - 1 singular values for ARPACK k=$k n=$n.")
-        "sparse"
-      }
-      case "dense" => "dense"
-      case "sparse" => "sparse"
+    val computeMode = mode match {
+      case "auto" =>
+        // TODO: The conditions below are not fully tested.
+        if (n < 100 || k > n / 2) {
+          // If n is small or k is large compared with n, we'd better compute the Gramian matrix first
+          // and then compute its eigenvalues locally, instead of making multiple passes.
+          if (k < n / 3) {
+            SVDMode.LocalARPACK
+          } else {
+            SVDMode.LocalLAPACK
+          }
+        } else {
+          // If k is small compared with n, we use ARPACK with distributed multiplication.
+          SVDMode.DistARPACK
+        }
+      case "local-svd" => SVDMode.LocalLAPACK
+      case "local-eigs" => SVDMode.LocalARPACK
+      case "dist-eigs" => SVDMode.DistARPACK
       case _ => throw new IllegalArgumentException(s"Do not support mode $mode.")
     }
 
-    val computeMode = derivedMode match {
-      case "dense" => if (k < n / 2) {
-        // when k is small, call ARPACK
-        require(k > 0 && k < n, s"Request up to n - 1 singular values for ARPACK k=$k n=$n.")
-        SVDMode.DenseARPACK
-      } else {
-        // when k is large, call LAPACK
-        SVDMode.DenseLAPACK
-      }
-      case "sparse" => SVDMode.SparseARPACK
-    }
-
+    // Compute the eigen-decomposition of A' * A.
     val (sigmaSquares: BDV[Double], u: BDM[Double]) = computeMode match {
-      case SVDMode.DenseARPACK => {
+      case SVDMode.LocalARPACK =>
+        require(k < n, s"k must be smaller than n in local-eigs mode but got k=$k and n=$n.")
         val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
-        def multiplyDenseGramianMatrixBy(v: DenseVector): DenseVector = {
-          Vectors.fromBreeze(G * v.toBreeze).asInstanceOf[DenseVector]
-        }
-        EigenValueDecomposition.symmetricEigs(multiplyDenseGramianMatrixBy, n, k, tol, maxIter)
-      }
-      case SVDMode.DenseLAPACK => {
+        EigenValueDecomposition.symmetricEigs(v => G * v, n, k, tol, maxIter)
+      case SVDMode.LocalLAPACK =>
        val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
-        val (uFull: BDM[Double], sigmaSquaresFull: BDV[Double], vFull: BDM[Double]) = brzSvd(G)
+        val (uFull: BDM[Double], sigmaSquaresFull: BDV[Double], _) = brzSvd(G)
         (sigmaSquaresFull, uFull)
-      }
-      case SVDMode.SparseARPACK => {
+      case SVDMode.DistARPACK =>
+        require(k < n, s"k must be smaller than n in dist-eigs mode but got k=$k and n=$n.")
         EigenValueDecomposition.symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIter)
-      }
     }
 
-    computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquares, u)
-  }
-
-  /**
-   * Determine effective rank of SVD result and compute left singular vectors if required.
-   */
-  private def computeSVDEffectiveRank(
-      k: Int,
-      n: Int,
-      computeU: Boolean,
-      rCond: Double,
-      sigmaSquares: BDV[Double],
-      u: BDM[Double]): SingularValueDecomposition[RowMatrix, Matrix] = {
     val sigmas: BDV[Double] = brzSqrt(sigmaSquares)
 
-    // Determine effective rank.
+    // Determine the effective rank.
     val sigma0 = sigmas(0)
     val threshold = rCond * sigma0
     var i = 0
-    // sigmas might have a length smaller than k, if some Ritz values do not satisfy the
-    // convergence criterion specified by tol after maxIterations.
-    // Thus use i < min(k, sigmas.length) instead of i < k
+    // sigmas might have a length smaller than k, if some Ritz values do not satisfy the convergence
+    // criterion specified by tol after max number of iterations.
+    // Thus use i < min(k, sigmas.length) instead of i < k.
     if (sigmas.length < k) {
       logWarning(s"Requested $k singular values but only found ${sigmas.length} converged.")
     }
@@ -394,12 +387,12 @@ class RowMatrix(
       logWarning(s"Requested $k singular values but only found $sk nonzeros.")
     }
 
-    val s = Vectors.dense(util.Arrays.copyOfRange(sigmas.data, 0, sk))
-    val V = Matrices.dense(n, sk, util.Arrays.copyOfRange(u.data, 0, n * sk))
+    val s = Vectors.dense(Arrays.copyOfRange(sigmas.data, 0, sk))
+    val V = Matrices.dense(n, sk, Arrays.copyOfRange(u.data, 0, n * sk))
 
     if (computeU) {
       // N = Vk * Sk^{-1}
-      val N = new BDM[Double](n, sk, util.Arrays.copyOfRange(u.data, 0, n * sk))
+      val N = new BDM[Double](n, sk, Arrays.copyOfRange(u.data, 0, n * sk))
       var i = 0
       var j = 0
       while (j < sk) {
@@ -484,7 +477,7 @@ class RowMatrix(
     if (k == n) {
       Matrices.dense(n, k, u.data)
     } else {
-      Matrices.dense(n, k, util.Arrays.copyOfRange(u.data, 0, n * k))
+      Matrices.dense(n, k, Arrays.copyOfRange(u.data, 0, n * k))
     }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
index 3091d1f967cf5..a961f89456a18 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
@@ -95,51 +95,40 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext {
   }
 
   test("svd of a full-rank matrix") {
-    for (denseSVD <- Seq(true, false)) {
-      for (mat <- Seq(denseMat, sparseMat)) {
+    for (mat <- Seq(denseMat, sparseMat)) {
+      for (mode <- Seq("auto", "local-svd", "local-eigs", "dist-eigs")) {
         val localMat = mat.toBreeze()
         val (localU, localSigma, localVt) = brzSvd(localMat)
         val localV: BDM[Double] = localVt.t.toDenseMatrix
         for (k <- 1 to n) {
-          val svd = if (k < n) {
-            if (denseSVD) {
-              mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
-            } else {
-              mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "sparse")
-            }
-          } else {
-            // when k = n, always use dense SVD
-            mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
+          val skip = (mode == "local-eigs" || mode == "dist-eigs") && k == n
+          if (!skip) {
+            val svd = mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode)
+            val U = svd.U
+            val s = svd.s
+            val V = svd.V
+            assert(U.numRows() === m)
+            assert(U.numCols() === k)
+            assert(s.size === k)
+            assert(V.numRows === n)
+            assert(V.numCols === k)
+            assertColumnEqualUpToSign(U.toBreeze(), localU, k)
+            assertColumnEqualUpToSign(V.toBreeze.asInstanceOf[BDM[Double]], localV, k)
+            assert(closeToZero(s.toBreeze.asInstanceOf[BDV[Double]] - localSigma(0 until k)))
           }
-          val U = svd.U
-          val s = svd.s
-          val V = svd.V
-          assert(U.numRows() === m)
-          assert(U.numCols() === k)
-          assert(s.size === k)
-          assert(V.numRows === n)
-          assert(V.numCols === k)
-          assertColumnEqualUpToSign(U.toBreeze(), localU, k)
-          assertColumnEqualUpToSign(V.toBreeze.asInstanceOf[BDM[Double]], localV, k)
-          assert(closeToZero(s.toBreeze.asInstanceOf[BDV[Double]] - localSigma(0 until k)))
-        }
-        val svdWithoutU = if (denseSVD) {
-          mat.computeSVD(n - 1, computeU = false, 1e-9, 300, 1e-10, mode = "dense")
-        } else {
-          mat.computeSVD(n - 1, computeU = false, 1e-9, 300, 1e-10, mode = "sparse")
         }
+        val svdWithoutU = mat.computeSVD(1, computeU = false, 1e-9, 300, 1e-10, mode)
         assert(svdWithoutU.U === null)
       }
     }
   }
 
   test("svd of a low-rank matrix") {
-    for (denseSVD <- Seq(true, false)) {
-      val rows = sc.parallelize(Array.fill(4)(Vectors.dense(1.0, 1.0, 1.0)), 2)
-      val mat = new RowMatrix(rows, 4, 3)
-      val svd = if (denseSVD) mat.computeSVD(2, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
-      else mat.computeSVD(2, computeU = true, 1e-9, 300, 1e-10, mode = "sparse")
-      assert(svd.s.size === 1, "should not return zero singular values")
+    val rows = sc.parallelize(Array.fill(4)(Vectors.dense(1.0, 1.0, 1.0)), 2)
+    val mat = new RowMatrix(rows, 4, 3)
+    for (mode <- Seq("auto", "local-svd", "local-eigs", "dist-eigs")) {
+      val svd = mat.computeSVD(2, computeU = true, 1e-6, 300, 1e-10, mode)
+      assert(svd.s.size === 1, s"should not return zero singular values but got ${svd.s}")
       assert(svd.U.numRows() === 4)
       assert(svd.U.numCols() === 1)
       assert(svd.V.numRows === 3)
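
For reviewers who want to exercise the new API end to end, here is a minimal usage sketch. It assumes a running `SparkContext` named `sc`, and the input rows are made up for illustration; only the method signatures come from this patch. The public `computeSVD(k, computeU, rCond)` picks the computation mode automatically, while the six-argument `private[mllib]` overload (the one exercised by `RowMatrixSuite`) lets code inside the `mllib` package force a specific mode.

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

// Hypothetical 4 x 3 input; any RDD[Vector] with known dimensions works.
val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0),
  Vectors.dense(7.0, 8.0, 9.0),
  Vectors.dense(1.0, 0.0, 1.0)), 2)
val mat = new RowMatrix(rows, 4, 3)

// Public API: the mode is chosen automatically ("auto") from n and k.
val svd = mat.computeSVD(2, computeU = true)
val U = svd.U // RowMatrix of size m x k, the left singular vectors
val s = svd.s // Vector of the k singular values in descending order
val V = svd.V // Matrix of size n x k, the right singular vectors

// Test-only path (private[mllib], callable from the mllib package as in RowMatrixSuite):
// force a mode explicitly, here with rCond = 1e-9, maxIter = 300, tol = 1e-10.
val svdDist = mat.computeSVD(2, computeU = false, 1e-9, 300, 1e-10, "dist-eigs")
```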