Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-14432][SQL] Add API to calculate the approximate quantiles for multiple columns #12207

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 28 additions & 16 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ def replace(self, to_replace, value, subset=None):
self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)

@since(2.0)
def approxQuantile(self, col, probabilities, relativeError):
def approxQuantile(self, cols, probabilities, relativeError):
"""
Calculates the approximate quantiles of a numerical column of a
DataFrame.
Expand All @@ -1181,18 +1181,26 @@ def approxQuantile(self, col, probabilities, relativeError):
Space-efficient Online Computation of Quantile Summaries]]
by Greenwald and Khanna.

:param col: the name of the numerical column
:param cols: str, list.
The name(s) of the numerical column(s). Can be a string of the name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can simplify this comment to: Can be a single column name, or a list of names for multiple columns. I think it's clear from the specified types that it's a string name or a list of string names.

(we mention in the method doc that it operates on numerical columns, we don't need to repeat that).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. updated.

of a single column or the list of the names of multiple columns.
:param probabilities: a list of quantile probabilities
Each number must belong to [0, 1].
For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
:param relativeError: The relative target precision to achieve
(>= 0). If set to zero, the exact quantiles are computed, which
could be very expensive. Note that values greater than 1 are
accepted but give the same result as 1.
:return: the approximate quantiles at the given probabilities
"""
if not isinstance(col, str):
raise ValueError("col should be a string.")
Each number must belong to [0, 1].
For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
:param relativeError: The relative target precision to achieve
(>= 0). If set to zero, the exact quantiles are computed, which
could be very expensive. Note that values greater than 1 are
accepted but give the same result as 1.
:return: the approximate quantiles at the given probabilities for
the given column or columns.
"""
if not isinstance(cols, (str, list, tuple)):
raise ValueError("col should be a string, list or tuple.")

if isinstance(cols, tuple):
cols = list(cols)
if isinstance(cols, list):
cols = _to_list(self._sc, cols)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could consider verifying the contents of the list as done for probabilities right bellow (but just a minor point and probably not as important - just if people pass in a list of expressions rather than strings would be nice to have a useful error message).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.


if not isinstance(probabilities, (list, tuple)):
raise ValueError("probabilities should be a list or tuple")
Expand All @@ -1207,8 +1215,12 @@ def approxQuantile(self, col, probabilities, relativeError):
raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
relativeError = float(relativeError)

jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
return list(jaq)
jaq = self._jdf.stat().approxQuantile(cols, probabilities, relativeError)
jaq = list(jaq)
for idx, a in enumerate(jaq):
if not isinstance(a, (list, float)):
jaq[idx] = list(a)
return jaq

@since(1.4)
def corr(self, col1, col2, method=None):
Expand Down Expand Up @@ -1440,8 +1452,8 @@ class DataFrameStatFunctions(object):
def __init__(self, df):
self.df = df

def approxQuantile(self, col, probabilities, relativeError):
return self.df.approxQuantile(col, probabilities, relativeError)
def approxQuantile(self, cols, probabilities, relativeError):
return self.df.approxQuantile(cols, probabilities, relativeError)

approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__

Expand Down
9 changes: 9 additions & 0 deletions python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,15 @@ def test_approxQuantile(self):
self.assertEqual(len(aq), 3)
self.assertTrue(all(isinstance(q, float) for q in aq))

aqs = df.stat.approxQuantile(["a", "a"], [0.1, 0.5, 0.9], 0.1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we add an assert that len(aqs) is 2?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added.

self.assertEqual(len(aqs), 2)
self.assertTrue(isinstance(aqs[0], list))
self.assertEqual(len(aqs[0]), 3)
self.assertTrue(all(isinstance(q, float) for q in aqs[0]))
self.assertTrue(isinstance(aqs[1], list))
self.assertEqual(len(aqs[1]), 3)
self.assertTrue(all(isinstance(q, float) for q in aqs[1]))

def test_corr(self):
import math
df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 Space-efficient
* Online Computation of Quantile Summaries]] by Greenwald and Khanna.
*
* @param col the name of the numerical column
* @param col the name of the numerical column.
* @param probabilities a list of quantile probabilities
* Each number must belong to [0, 1].
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
* @param relativeError The relative target precision to achieve (>= 0).
* If set to zero, the exact quantiles are computed, which could be very expensive.
* Note that values greater than 1 are accepted but give the same result as 1.
* @return the approximate quantiles at the given probabilities
* @return the approximate quantiles at the given probabilities.
*
* @since 2.0.0
*/
Expand All @@ -70,6 +70,29 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
StatFunctions.multipleApproxQuantiles(df, Seq(col), probabilities, relativeError).head.toArray
}

/**
* Calculates the approximate quantiles of numerical columns of a DataFrame.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't have the full doc from the above method, we should perhaps provide an @see link to the full info about the algorithm?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Updated it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the @see link work (as in links to the method with full doc)? Can you build the docs on your PR and check it? I'm not totally sure whether it will point to the doc of the other method or just to itself.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated it with specified parameter types.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this will actually show up in the generated Scaladoc HTML.

@jkbradley @mengxr do you prefer to actually make links show up in the HTML API doc? If so, then it often doesn't look good in an IDE. But to do that something like this is needed:
@see [[DataFrameStatsFunctions.approxQuantile(col:Str* approxQuantile]] for detailed description.

* @see #approxQuantile(String, Array[Double], Double) for detailed description.
*
* @param cols the names of the numerical columns.
* @param probabilities a list of quantile probabilities
* Each number must belong to [0, 1].
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
* @param relativeError The relative target precision to achieve (>= 0).
* If set to zero, the exact quantiles are computed, which could be very expensive.
* Note that values greater than 1 are accepted but give the same result as 1.
* @return the approximate quantiles at the given probabilities for given columns.
*
* @since 2.0.0
*/
def approxQuantile(
cols: Array[String],
probabilities: Array[Double],
relativeError: Double): Array[Array[Double]] = {
StatFunctions.multipleApproxQuantiles(df, cols, probabilities, relativeError)
.map(_.toArray).toArray
}

/**
* Python-friendly version of [[approxQuantile()]]
*/
Expand All @@ -80,6 +103,18 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
approxQuantile(col, probabilities.toArray, relativeError).toList.asJava
}

/**
* Python-friendly version of [[approxQuantile()]] that computes approximate quantiles
* for multiple columns.
*/
private[spark] def approxQuantile(
cols: List[String],
probabilities: List[Double],
relativeError: Double): java.util.List[java.util.List[Double]] = {
approxQuantile(cols.toArray, probabilities.toArray, relativeError)
.map(_.toList.asJava).toList.asJava
}

/**
* Calculate the sample covariance of two numerical columns of a DataFrame.
* @param col1 the name of the first column
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
assert(math.abs(s2 - q2 * n) < error_single)
assert(math.abs(d1 - 2 * q1 * n) < error_double)
assert(math.abs(d2 - 2 * q2 * n) < error_double)

// Multiple columns
val Array(Array(ms1, ms2), Array(md1, md2)) =
df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilon)

assert(math.abs(ms1 - q1 * n) < error_single)
assert(math.abs(ms2 - q2 * n) < error_single)
assert(math.abs(md1 - 2 * q1 * n) < error_double)
assert(math.abs(md2 - 2 * q2 * n) < error_double)
}
}

Expand Down