Skip to content

Commit

Permalink
Move ClusteringSummary to a separate file.
Browse files Browse the repository at this point in the history
  • Loading branch information
yanboliang committed Oct 26, 2016
1 parent f13f240 commit 946ee73
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 35 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.ml.clustering

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.{DataFrame, Row}

/**
 * :: Experimental ::
 * Summary of clustering algorithms.
 *
 * @param predictions [[DataFrame]] produced by model.transform().
 * @param predictionCol Name for column of predicted clusters in `predictions`.
 * @param featuresCol Name for column of features in `predictions`.
 * @param k Number of clusters.
 */
@Experimental
class ClusteringSummary private[clustering] (
@transient val predictions: DataFrame,
val predictionCol: String,
val featuresCol: String,
val k: Int) extends Serializable {

/**
 * Cluster assignments of the transformed data: a single-column [[DataFrame]]
 * holding the predicted cluster for each row (the `predictionCol` column of
 * `predictions`). Note these are predicted cluster labels, not cluster
 * centers, despite what the original comment ("Cluster centers") suggested.
 */
@transient lazy val cluster: DataFrame = predictions.select(predictionCol)

/**
 * Size of (number of data points in) each cluster, indexed by cluster id.
 * Clusters with no assigned points keep size 0. Assumes predicted cluster
 * ids are `Int` values in the range [0, k) — an id outside that range would
 * throw an ArrayIndexOutOfBoundsException here.
 */
lazy val clusterSizes: Array[Long] = {
// Pre-fill with zeros so empty clusters are still represented in the result.
val sizes = Array.fill[Long](k)(0)
// Count rows per predicted cluster, then record each count at its cluster id.
cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
case Row(cluster: Int, count: Long) => sizes(cluster) = count
}
sizes
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
class GaussianMixtureSummary private[clustering] (
predictions: DataFrame,
predictionCol: String,
val probabilityCol: String,
@Since("2.0.0") val probabilityCol: String,
featuresCol: String,
k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {

Expand Down
34 changes: 0 additions & 34 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
Original file line number Diff line number Diff line change
Expand Up @@ -358,37 +358,3 @@ class KMeansSummary private[clustering] (
predictionCol: String,
featuresCol: String,
k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)

/**
 * :: Experimental ::
 * Summary of clustering algorithms.
 *
 * @param predictions [[DataFrame]] produced by model.transform().
 * @param predictionCol Name for column of predicted clusters in `predictions`.
 * @param featuresCol Name for column of features in `predictions`.
 * @param k Number of clusters.
 */
@Experimental
class ClusteringSummary private[clustering] (
@transient val predictions: DataFrame,
val predictionCol: String,
val featuresCol: String,
val k: Int) extends Serializable {

/**
 * Cluster assignments of the transformed data: a single-column [[DataFrame]]
 * holding the predicted cluster for each row (the `predictionCol` column of
 * `predictions`). Note these are predicted cluster labels, not cluster
 * centers, despite what the original comment ("Cluster centers") suggested.
 */
@transient lazy val cluster: DataFrame = predictions.select(predictionCol)

/**
 * Size of (number of data points in) each cluster, indexed by cluster id.
 * Clusters with no assigned points keep size 0. Assumes predicted cluster
 * ids are `Int` values in the range [0, k) — an id outside that range would
 * throw an ArrayIndexOutOfBoundsException here.
 */
lazy val clusterSizes: Array[Long] = {
// Pre-fill with zeros so empty clusters are still represented in the result.
val sizes = Array.fill[Long](k)(0)
// Count rows per predicted cluster, then record each count at its cluster id.
cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
case Row(cluster: Int, count: Long) => sizes(cluster) = count
}
sizes
}

}

0 comments on commit 946ee73

Please sign in to comment.