From e60a34f3479ce3b642f5941497c3e5c1bbeebdd4 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Fri, 30 Jan 2015 12:16:37 +0100 Subject: [PATCH] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519. Styling and comment fixes. --- .../mllib/regression/IsotonicRegression.scala | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 8ecedd74b93c1..9b67807b92f02 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -28,7 +28,9 @@ import org.apache.spark.rdd.RDD * Regression model for isotonic regression. * * @param boundaries Array of boundaries for which predictions are known. + * Boundaries must be sorted in increasing order. * @param predictions Array of predictions associated to the boundaries at the same index. + * Result of isotonic regression and therefore is monotone. */ class IsotonicRegressionModel ( boundaries: Array[Double], @@ -75,11 +77,11 @@ class IsotonicRegressionModel ( * * @param testData Feature to be labeled. * @return Predicted label. - * If testData exactly matches a boundary then associated prediction is directly returned - * If testData is lower or higher than all boundaries - * then first or last prediction is returned respectively - * If testData falls between two values in boundary then predictions is treated - * as piecewise linear function and interpolated value is returned + * If testData exactly matches a boundary then associated prediction is directly returned. + * If testData is lower or higher than all boundaries. + * then first or last prediction is returned respectively. + * If testData falls between two values in boundary array then predictions is treated + * as piecewise linear function and interpolated value is returned. */ def predict(testData: Double): Double = { @@ -87,25 +89,24 @@ class IsotonicRegressionModel ( y1 + (y2 - y1) * (x - x1) / (x2 - x1) } - val insertIndex = binarySearch(boundaries, testData) - - val normalisedInsertIndex = -insertIndex - 1 + val foundIndex = binarySearch(boundaries, testData) + val insertIndex = -foundIndex - 1 // Find if the index was lower than all values, - // higher than all values, inbetween two values or exact match. - if (insertIndex == -1) { + // higher than all values, in between two values or exact match. + if (insertIndex == 0) { predictions.head - } else if (normalisedInsertIndex == boundaries.length){ + } else if (insertIndex == boundaries.length){ predictions.last - } else if (insertIndex < 0) { + } else if (foundIndex < 0) { linearInterpolation( - boundaries(normalisedInsertIndex - 1), - predictions(normalisedInsertIndex - 1), - boundaries(normalisedInsertIndex), - predictions(normalisedInsertIndex), + boundaries(insertIndex - 1), + predictions(insertIndex - 1), + boundaries(insertIndex), + predictions(insertIndex), testData) } else { - predictions(insertIndex) + predictions(foundIndex) } } } @@ -113,29 +114,31 @@ class IsotonicRegressionModel ( /** * Isotonic regression. * Currently implemented using parallelized pool adjacent violators algorithm. - * Currently only univariate (single feature) algorithm supported. + * Only univariate (single feature) algorithm supported. * * Sequential PAV implementation based on: * Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. * "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. + * Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf * - * Sequential PAV parallelized as per: + * Sequential PAV parallelization based on: * Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. * "An approach to parallelizing isotonic regression." * Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. + * Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf */ class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { /** - * Constructs IsotonicRegression instance with default parameter isotonic = true - * @return New instance of IsotonicRegression + * Constructs IsotonicRegression instance with default parameter isotonic = true. + * @return New instance of IsotonicRegression. */ def this() = this(true) /** - * Sets the isotonic parameter + * Sets the isotonic parameter. * @param isotonic Isotonic (increasing) or antitonic (decreasing) sequence. - * @return The instance of IsotonicRegression + * @return This instance of IsotonicRegression. */ def setIsotonic(isotonic: Boolean): this.type = { this.isotonic = isotonic @@ -148,7 +151,6 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali * @param input RDD of tuples (label, feature, weight) where label is dependent variable * for which we calculate isotonic regression, feature is independent variable * and weight represents number of measures with default 1. - * * @return Isotonic regression model. */ def run(input: RDD[(Double, Double, Double)]): IsotonicRegressionModel = { @@ -186,7 +188,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali /** * Performs a pool adjacent violators algorithm (PAV). * Uses approach with single processing of data where violators - * in previously processed data created by pooling are fixed immediatelly. + * in previously processed data created by pooling are fixed immediately. * Uses optimization of discovering monotonicity violating sequences (blocks). * * @param input Input data of tuples (label, feature, weight).