diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 3d72a8717b323..d7cbffc3be26f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -278,6 +278,9 @@ class ALS extends Estimator[ALSModel] with ALSParams { /** @group setParam */ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) + /** @group setParam */ + def setSeed(value: Long): this.type = set(seed, value) + /** * Sets both numUserBlocks and numItemBlocks to the specific value. * @group setParam diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 193dd0de444a1..4846b907e85ec 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -29,30 +29,39 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha """ Alternating Least Squares (ALS) matrix factorization. - ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices, - `X` and `Y`, i.e. `X * Yt = R`. Typically these approximations are called 'factor' matrices. - The general approach is iterative. During each iteration, one of the factor matrices is held - constant, while the other is solved for using least squares. The newly-solved factor matrix is - then held constant while solving for the other factor matrix. - - This is a blocked implementation of the ALS factorization algorithm that groups the two sets - of factors (referred to as "users" and "products") into blocks and reduces communication by only - sending one copy of each user vector to each product block on each iteration, and only for the - product blocks that need that user's feature vector.
This is achieved by pre-computing some - information about the ratings matrix to determine the "out-links" of each user (which blocks of - products it will contribute to) and "in-link" information for each product (which of the feature - vectors it receives from each user block it will depend on). This allows us to send only an - array of feature vectors between each user block and product block, and have the product block - find the users' ratings and update the products based on these messages. + ALS attempts to estimate the ratings matrix `R` as the product of + two lower-rank matrices, `X` and `Y`, i.e. `X * Yt = R`. Typically + these approximations are called 'factor' matrices. The general + approach is iterative. During each iteration, one of the factor + matrices is held constant, while the other is solved for using least + squares. The newly-solved factor matrix is then held constant while + solving for the other factor matrix. + + This is a blocked implementation of the ALS factorization algorithm + that groups the two sets of factors (referred to as "users" and + "products") into blocks and reduces communication by only sending + one copy of each user vector to each product block on each + iteration, and only for the product blocks that need that user's + feature vector. This is achieved by pre-computing some information + about the ratings matrix to determine the "out-links" of each user + (which blocks of products it will contribute to) and "in-link" + information for each product (which of the feature vectors it + receives from each user block it will depend on). This allows us to + send only an array of feature vectors between each user block and + product block, and have the product block find the users' ratings + and update the products based on these messages.
For implicit preference data, the algorithm used is based on - "Collaborative Filtering for Implicit Feedback Datasets", available at - `http://dx.doi.org/10.1109/ICDM.2008.22`, adapted for the blocked approach used here. - - Essentially instead of finding the low-rank approximations to the rating matrix `R`, - this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if - r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of - indicated user preferences rather than explicit ratings given to items. + "Collaborative Filtering for Implicit Feedback Datasets", available + at `http://dx.doi.org/10.1109/ICDM.2008.22`, adapted for the blocked + approach used here. + + Essentially instead of finding the low-rank approximations to the + rating matrix `R`, this finds the approximations for a preference + matrix `P` where the elements of `P` are 1 if r > 0 and 0 if r <= 0. + The ratings then act as 'confidence' values related to strength of + indicated user preferences rather than explicit ratings given to + items. >>> als = ALS(rank=10, maxIter=5) >>> model = als.fit(df)