Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-18282][ML][PYSPARK] Add python clustering summaries for GMM and BKM #15777

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,7 @@ class BisectingKMeansModel private[ml] (
@Since("2.0.0")
override def copy(extra: ParamMap): BisectingKMeansModel = {
val copied = copyValues(new BisectingKMeansModel(uid, parentModel), extra)
if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
copied.setParent(this.parent)
copied.setSummary(trainingSummary).setParent(this.parent)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks better. Could you make the change for Scala LiR, LoR, GLM and KMeans as well? I think they should be consistent. Thanks.

Copy link
Contributor Author

@sethah sethah Nov 21, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated. I also added tests. Thanks for reviewing!

}

@Since("2.0.0")
Expand Down Expand Up @@ -132,8 +131,8 @@ class BisectingKMeansModel private[ml] (

private var trainingSummary: Option[BisectingKMeansSummary] = None

private[clustering] def setSummary(summary: BisectingKMeansSummary): this.type = {
this.trainingSummary = Some(summary)
private[clustering] def setSummary(summary: Option[BisectingKMeansSummary]): this.type = {
this.trainingSummary = summary
this
}

Expand Down Expand Up @@ -265,7 +264,7 @@ class BisectingKMeans @Since("2.0.0") (
val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
val summary = new BisectingKMeansSummary(
model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
model.setSummary(summary)
model.setSummary(Some(summary))
instr.logSuccess(model)
model
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ class GaussianMixtureModel private[ml] (
@Since("2.0.0")
override def copy(extra: ParamMap): GaussianMixtureModel = {
val copied = copyValues(new GaussianMixtureModel(uid, weights, gaussians), extra)
if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
copied.setParent(this.parent)
copied.setSummary(trainingSummary).setParent(this.parent)
}

@Since("2.0.0")
Expand Down Expand Up @@ -150,8 +149,8 @@ class GaussianMixtureModel private[ml] (

private var trainingSummary: Option[GaussianMixtureSummary] = None

private[clustering] def setSummary(summary: GaussianMixtureSummary): this.type = {
this.trainingSummary = Some(summary)
private[clustering] def setSummary(summary: Option[GaussianMixtureSummary]): this.type = {
this.trainingSummary = summary
this
}

Expand Down Expand Up @@ -340,7 +339,7 @@ class GaussianMixture @Since("2.0.0") (
.setParent(this)
val summary = new GaussianMixtureSummary(model.transform(dataset),
$(predictionCol), $(probabilityCol), $(featuresCol), $(k))
model.setSummary(summary)
model.setSummary(Some(summary))
instr.logNumFeatures(model.gaussians.head.mean.size)
instr.logSuccess(model)
model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ class BisectingKMeansSuite
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))

model.setSummary(None)
assert(!model.hasSummary)
}

test("read/write") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))

model.setSummary(None)
assert(!model.hasSummary)
}

test("read/write") {
Expand Down
15 changes: 9 additions & 6 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,13 +309,16 @@ def interceptVector(self):
@since("2.0.0")
def summary(self):
"""
Gets summary (e.g. residuals, mse, r-squared ) of model on
training set. An exception is thrown if
`trainingSummary is None`.
Gets summary (e.g. accuracy/precision/recall, objective history, total iterations) of model
trained on the training set. An exception is thrown if `trainingSummary is None`.
"""
java_blrt_summary = self._call_java("summary")
# Note: Once multiclass is added, update this to return correct summary
return BinaryLogisticRegressionTrainingSummary(java_blrt_summary)
if self.hasSummary:
java_blrt_summary = self._call_java("summary")
# Note: Once multiclass is added, update this to return correct summary
return BinaryLogisticRegressionTrainingSummary(java_blrt_summary)
else:
raise RuntimeError("No training summary available for this %s" %
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before, this would throw a Py4JJavaError. I think it's slightly better to throw a RuntimeError here as is done in Scala.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think thats generally a good improvement, the Py4J errors are often confusing to end users.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this change, we should always throw an exception easy to understand by users.

self.__class__.__name__)

@property
@since("2.0.0")
Expand Down
162 changes: 159 additions & 3 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,74 @@

from pyspark import since, keyword_only
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaEstimator, JavaModel
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc

__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
__all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary',
'KMeans', 'KMeansModel',
'GaussianMixture', 'GaussianMixtureModel',
'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary',
'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']


class ClusteringSummary(JavaWrapper):
"""
.. note:: Experimental

Clustering results for a given model.

.. versionadded:: 2.1.0
"""

@property
@since("2.1.0")
def predictionCol(self):
"""
Name for column of predicted clusters in `predictions`.
"""
return self._call_java("predictionCol")

@property
@since("2.1.0")
def predictions(self):
"""
DataFrame produced by the model's `transform` method.
"""
return self._call_java("predictions")

@property
@since("2.1.0")
def featuresCol(self):
"""
Name for column of features in `predictions`.
"""
return self._call_java("featuresCol")

@property
@since("2.1.0")
def k(self):
"""
The number of clusters the model was trained with.
"""
return self._call_java("k")

@property
@since("2.1.0")
def cluster(self):
"""
DataFrame of predicted cluster centers for each training data point.
"""
return self._call_java("cluster")

@property
@since("2.1.0")
def clusterSizes(self):
"""
Size of (number of data points in) each cluster.
"""
return self._call_java("clusterSizes")


class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental
Expand Down Expand Up @@ -56,6 +114,28 @@ def gaussiansDF(self):
"""
return self._call_java("gaussiansDF")

@property
@since("2.1.0")
def hasSummary(self):
"""
Indicates whether a training summary exists for this model
instance.
"""
return self._call_java("hasSummary")

@property
@since("2.1.0")
def summary(self):
"""
Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
training set. An exception is thrown if no summary exists.
"""
if self.hasSummary:
return GaussianMixtureSummary(self._call_java("summary"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo, should be BisectingKMeansSummary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, good catch!

else:
raise RuntimeError("No training summary available for this %s" %
self.__class__.__name__)


@inherit_doc
class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed,
Expand Down Expand Up @@ -92,6 +172,13 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
>>> gm = GaussianMixture(k=3, tol=0.0001,
... maxIter=10, seed=10)
>>> model = gm.fit(df)
>>> model.hasSummary
True
>>> summary = model.summary
>>> summary.k
3
>>> summary.clusterSizes
[2, 2, 2]
>>> weights = model.weights
>>> len(weights)
3
Expand All @@ -118,6 +205,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
>>> model_path = temp_path + "/gmm_model"
>>> model.save(model_path)
>>> model2 = GaussianMixtureModel.load(model_path)
>>> model2.hasSummary
False
>>> model2.weights == model.weights
True
>>> model2.gaussiansDF.show()
Expand Down Expand Up @@ -181,6 +270,32 @@ def getK(self):
return self.getOrDefault(self.k)


class GaussianMixtureSummary(ClusteringSummary):
"""
.. note:: Experimental

Gaussian mixture clustering results for a given model.

.. versionadded:: 2.1.0
"""

@property
@since("2.1.0")
def probabilityCol(self):
"""
Name for column of predicted probability of each cluster in `predictions`.
"""
return self._call_java("probabilityCol")

@property
@since("2.1.0")
def probability(self):
"""
DataFrame of probabilities of each cluster for each training data point.
"""
return self._call_java("probability")


class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
Model fitted by KMeans.
Expand Down Expand Up @@ -346,6 +461,27 @@ def computeCost(self, dataset):
"""
return self._call_java("computeCost", dataset)

@property
@since("2.1.0")
def hasSummary(self):
"""
Indicates whether a training summary exists for this model instance.
"""
return self._call_java("hasSummary")

@property
@since("2.1.0")
def summary(self):
"""
Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the
training set. An exception is thrown if no summary exists.
"""
if self.hasSummary:
return BisectingKMeansSummary(self._call_java("summary"))
else:
raise RuntimeError("No training summary available for this %s" %
self.__class__.__name__)


@inherit_doc
class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasSeed,
Expand Down Expand Up @@ -373,6 +509,13 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
2
>>> model.computeCost(df)
2.000...
>>> model.hasSummary
True
>>> summary = model.summary
>>> summary.k
2
>>> summary.clusterSizes
[2, 2]
>>> transformed = model.transform(df).select("features", "prediction")
>>> rows = transformed.collect()
>>> rows[0].prediction == rows[1].prediction
Expand All @@ -387,6 +530,8 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
>>> model_path = temp_path + "/bkm_model"
>>> model.save(model_path)
>>> model2 = BisectingKMeansModel.load(model_path)
>>> model2.hasSummary
False
>>> model.clusterCenters()[0] == model2.clusterCenters()[0]
array([ True, True], dtype=bool)
>>> model.clusterCenters()[1] == model2.clusterCenters()[1]
Expand Down Expand Up @@ -460,6 +605,17 @@ def _create_model(self, java_model):
return BisectingKMeansModel(java_model)


class BisectingKMeansSummary(ClusteringSummary):
"""
.. note:: Experimental

Bisecting KMeans clustering results for a given model.

.. versionadded:: 2.1.0
"""
pass


@inherit_doc
class LDAModel(JavaModel):
"""
Expand Down
16 changes: 12 additions & 4 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,12 @@ def summary(self):
training set. An exception is thrown if
`trainingSummary is None`.
"""
java_lrt_summary = self._call_java("summary")
return LinearRegressionTrainingSummary(java_lrt_summary)
if self.hasSummary:
java_lrt_summary = self._call_java("summary")
return LinearRegressionTrainingSummary(java_lrt_summary)
else:
raise RuntimeError("No training summary available for this %s" %
self.__class__.__name__)

@property
@since("2.0.0")
Expand Down Expand Up @@ -1459,8 +1463,12 @@ def summary(self):
training set. An exception is thrown if
`trainingSummary is None`.
"""
java_glrt_summary = self._call_java("summary")
return GeneralizedLinearRegressionTrainingSummary(java_glrt_summary)
if self.hasSummary:
java_glrt_summary = self._call_java("summary")
return GeneralizedLinearRegressionTrainingSummary(java_glrt_summary)
else:
raise RuntimeError("No training summary available for this %s" %
self.__class__.__name__)

@property
@since("2.0.0")
Expand Down
Loading