From 4c7aec3d1c5203b4825047c66bed718211f9446c Mon Sep 17 00:00:00 2001
From: Li Pu
Date: Fri, 6 Jun 2014 18:33:47 -0700
Subject: [PATCH] improve comments

---
 .../linalg/EigenValueDecomposition.scala      | 34 ++++++++++++++++---
 .../mllib/linalg/distributed/RowMatrix.scala  |  8 +++--
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index f81a8b83cafb8..a571cc8a1c5b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -50,16 +50,26 @@ object EigenValueDecomposition {
 
     val arpack = ARPACK.getInstance()
 
+    // tolerance used in stopping criterion
     val tolW = new doubleW(tol)
+    // number of desired eigenvalues, 0 < nev < n
     val nev = new intW(k)
+    // nev Lanczos vectors are generated in the first iteration
+    // ncv-nev Lanczos vectors are generated in each subsequent iteration
+    // ncv must be smaller than n
     val ncv = scala.math.min(2 * k, n)
 
+    // "I" for standard eigenvalue problem, "G" for generalized eigenvalue problem
     val bmat = "I"
+    // "LM" : compute the NEV largest (in magnitude) eigenvalues
     val which = "LM"
 
     var iparam = new Array[Int](11)
+    // use exact shift in each iteration
     iparam(0) = 1
+    // maximum number of Arnoldi update iterations, or the actual number of iterations on output
     iparam(2) = 300
+    // Mode 1: A*x = lambda*x, A symmetric
     iparam(6) = 1
 
     var ido = new intW(0)
@@ -70,15 +80,17 @@ object EigenValueDecomposition {
     var workl = new Array[Double](ncv * (ncv + 8))
     var ipntr = new Array[Int](11)
 
-    // first call to ARPACK
+    // call ARPACK's reverse communication, first iteration with ido = 0
     arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
       workl, workl.length, info)
 
     val w = BDV(workd)
 
-    while(ido.`val` != 99) {
+    // ido = 99 : done flag in reverse communication
+    while (ido.`val` != 99) {
       if (ido.`val` != -1 && ido.`val` != 1) {
-        throw new IllegalStateException("ARPACK returns ido = " + ido.`val`)
+        throw new IllegalStateException("ARPACK returns ido = " + ido.`val` +
+          ". This flag is not compatible with Mode 1: A*x = lambda*x, A symmetric.")
       }
       // multiply working vector with the matrix
       val inputOffset = ipntr(0) - 1
@@ -86,28 +98,40 @@ object EigenValueDecomposition {
       val x = w(inputOffset until inputOffset + n)
       val y = w(outputOffset until outputOffset + n)
       y := BDV(mul(Vectors.fromBreeze(x).asInstanceOf[DenseVector]).toArray)
-      // call ARPACK
+      // call ARPACK's reverse communication
      arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
         workl, workl.length, info)
     }
 
     if (info.`val` != 0) {
-      throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val`)
+      info.`val` match {
+        case 1 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". Maximum number of iterations taken. (Refer to the ARPACK user guide for details)")
+        case 2 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". No shifts could be applied. Try to increase NCV. " +
+          "(Refer to the ARPACK user guide for details)")
+        case _ => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". Please refer to the ARPACK user guide for the error message.")
+      }
     }
 
     val d = new Array[Double](nev.`val`)
     val select = new Array[Boolean](ncv)
+    // copy the Ritz vectors
     val z = java.util.Arrays.copyOfRange(v, 0, nev.`val` * n)
 
+    // call ARPACK's post-processing for eigenvectors
     arpack.dseupd(true, "A", select, d, z, n, 0.0, bmat, n, which, nev, tol, resid, ncv, v, n,
       iparam, ipntr, workd, workl, workl.length, info)
 
+    // number of computed eigenvalues, might be smaller than k
     val computed = iparam(4)
 
     val eigenPairs = java.util.Arrays.copyOfRange(d, 0, computed).zipWithIndex.map{ r =>
       (r._1, java.util.Arrays.copyOfRange(z, r._2 * n, r._2 * n + n))
     }
 
+    // sort the eigen-pairs in descending order
     val sortedEigenPairs = eigenPairs.sortBy(-1 * _._1)
 
     // copy eigenvectors in descending order of eigenvalues
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index c24142f44ddcf..7e7d6e0d8ab6d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -259,8 +259,9 @@ class RowMatrix(
    * Then we compute U via easy matrix multiplication as U = A * (V * S^{-1}).
    * Note that this approach requires `O(nnz(A))` time.
    *
-   * When the requested eigenvalues k = n, a non-sparse implementation will be used, which requires
-   * `n^2` doubles to fit in memory and `O(n^3)` time on the master node.
+   * ARPACK requires k to be strictly less than n. Thus when the requested number of eigenvalues
+   * k = n, a non-sparse implementation will be used, which requires `n^2` doubles to fit in
+   * memory and `O(n^3)` time on the master node.
    *
    * At most k largest non-zero singular values and associated vectors are returned.
    * If there are k such values, then the dimensions of the return will be:
@@ -274,7 +275,8 @@ class RowMatrix(
    * @param computeU whether to compute U
    * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
    *              are treated as zero, where sigma(0) is the largest singular value.
-   * @param tol the tolerance of the svd computation.
+   * @param tol the numerical tolerance of the svd computation. Larger tolerance means fewer
+   *            iterations, but a less accurate result.
    * @return SingularValueDecomposition(U, s, V)
    */
   def computeSVD(