From 4c7aec3d1c5203b4825047c66bed718211f9446c Mon Sep 17 00:00:00 2001
From: Li Pu
Date: Fri, 6 Jun 2014 18:33:47 -0700
Subject: [PATCH] improve comments

---
 .../linalg/EigenValueDecomposition.scala      | 34 ++++++++++++++++---
 .../mllib/linalg/distributed/RowMatrix.scala  |  8 +++--
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index f81a8b83cafb8..a571cc8a1c5b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -50,16 +50,26 @@ object EigenValueDecomposition {
 
     val arpack = ARPACK.getInstance()
 
+    // tolerance used in stopping criterion
     val tolW = new doubleW(tol)
+    // number of desired eigenvalues, 0 < nev < n
     val nev = new intW(k)
+    // nev Lanczos vectors are generated in the first iteration
+    // ncv-nev Lanczos vectors are generated in each subsequent iteration
+    // ncv must be smaller than n
     val ncv = scala.math.min(2 * k, n)
 
+    // "I" for standard eigenvalue problem, "G" for generalized eigenvalue problem
     val bmat = "I"
+    // "LM" : compute the NEV largest (in magnitude) eigenvalues
     val which = "LM"
 
     var iparam = new Array[Int](11)
+    // use exact shift in each iteration
     iparam(0) = 1
+    // maximum number of Arnoldi update iterations, or the actual number of iterations on output
     iparam(2) = 300
+    // Mode 1: A*x = lambda*x, A symmetric
     iparam(6) = 1
 
     var ido = new intW(0)
@@ -70,15 +80,17 @@ object EigenValueDecomposition {
     var workl = new Array[Double](ncv * (ncv + 8))
     var ipntr = new Array[Int](11)
 
-    // first call to ARPACK
+    // call ARPACK's reverse communication, first iteration with ido = 0
     arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
       workl, workl.length, info)
 
     val w = BDV(workd)
 
-    while(ido.`val` != 99) {
+    // ido = 99 : done flag in reverse communication
+    while (ido.`val` != 99) {
       if (ido.`val` != -1 && ido.`val` != 1) {
-        throw new IllegalStateException("ARPACK returns ido = " + ido.`val`)
+        throw new IllegalStateException("ARPACK returns ido = " + ido.`val` +
+          ". This flag is not compatible with Mode 1: A*x = lambda*x, A symmetric.")
       }
       // multiply working vector with the matrix
       val inputOffset = ipntr(0) - 1
@@ -86,28 +98,40 @@ object EigenValueDecomposition {
       val x = w(inputOffset until inputOffset + n)
       val y = w(outputOffset until outputOffset + n)
       y := BDV(mul(Vectors.fromBreeze(x).asInstanceOf[DenseVector]).toArray)
-      // call ARPACK
+      // call ARPACK's reverse communication
      arpack.dsaupd(ido, bmat, n, which, nev.`val`, tolW, resid, ncv, v, n, iparam, ipntr, workd,
         workl, workl.length, info)
     }
 
     if (info.`val` != 0) {
-      throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val`)
+      info.`val` match {
+        case 1 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". Maximum number of iterations taken. (Refer to the ARPACK user guide for details)")
+        case 2 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". No shifts could be applied. Try to increase NCV. " +
+          "(Refer to the ARPACK user guide for details)")
+        case _ => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` +
+          ". Please refer to the ARPACK user guide for the error message.")
+      }
     }
 
     val d = new Array[Double](nev.`val`)
     val select = new Array[Boolean](ncv)
+    // copy the Ritz vectors
     val z = java.util.Arrays.copyOfRange(v, 0, nev.`val` * n)
 
+    // call ARPACK's post-processing for eigenvectors
     arpack.dseupd(true, "A", select, d, z, n, 0.0, bmat, n, which, nev, tol, resid, ncv, v, n,
       iparam, ipntr, workd, workl, workl.length, info)
 
+    // number of computed eigenvalues, might be smaller than k
     val computed = iparam(4)
 
     val eigenPairs = java.util.Arrays.copyOfRange(d, 0, computed).zipWithIndex.map{ r =>
       (r._1, java.util.Arrays.copyOfRange(z, r._2 * n, r._2 * n + n))
     }
 
+    // sort the eigen-pairs in descending order
     val sortedEigenPairs = eigenPairs.sortBy(-1 * _._1)
 
     // copy eigenvectors in descending order of eigenvalues
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index c24142f44ddcf..7e7d6e0d8ab6d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -259,8 +259,9 @@ class RowMatrix(
    * Then we compute U via easy matrix multiplication as U = A * (V * S^{-1}).
    * Note that this approach requires `O(nnz(A))` time.
    *
-   * When the requested eigenvalues k = n, a non-sparse implementation will be used, which requires
-   * `n^2` doubles to fit in memory and `O(n^3)` time on the master node.
+   * ARPACK requires k to be strictly less than n. Thus when the requested number of eigenvalues
+   * k = n, a non-sparse implementation will be used, which requires `n^2` doubles to fit in
+   * memory and `O(n^3)` time on the master node.
    *
    * At most k largest non-zero singular values and associated vectors are returned.
    * If there are k such values, then the dimensions of the return will be:
@@ -274,7 +275,8 @@ class RowMatrix(
    * @param computeU whether to compute U
    * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
    *              are treated as zero, where sigma(0) is the largest singular value.
-   * @param tol the tolerance of the svd computation.
+   * @param tol the numerical tolerance of the svd computation. Larger tolerance means fewer
+   *            iterations, but a less accurate result.
    * @return SingularValueDecomposition(U, s, V)
    */
   def computeSVD(