Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding quantiles aggregation #1937

Merged
merged 9 commits into from
Jul 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions docs/source/user_guide/using_aggregations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,35 @@ collection:
print(dataset.mean("predictions.detections.confidence"))
# 0.34994137249820706

.. _aggregations-quantiles:

Quantiles
_________

You can use the
:meth:`quantiles() <fiftyone.core.collections.SampleCollection.quantiles>`
aggregation to compute the quantile(s) of the (non-``None``) values of a field
in a collection:

.. code-block:: python
:linenos:

import fiftyone.zoo as foz

dataset = foz.load_zoo_dataset("quickstart")

# Compute quantiles of the `uniqueness` field
print(dataset.quantiles("uniqueness", [0.25, 0.5, 0.75, 0.9]))
# [0.22027, 0.33771, 0.62554, 0.69488]

# Compute quantiles of detection confidence in the `predictions` field
quantiles = dataset.quantiles(
"predictions.detections.confidence",
[0.25, 0.5, 0.75, 0.9],
)
print(quantiles)
# [0.09231, 0.20251, 0.56273, 0.94354]

.. _aggregations-std:

Standard deviation
Expand Down
1 change: 1 addition & 0 deletions fiftyone/__public__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
Distinct,
HistogramValues,
Mean,
Quantiles,
Std,
Sum,
Values,
Expand Down
166 changes: 166 additions & 0 deletions fiftyone/core/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import fiftyone.core.expressions as foe
from fiftyone.core.expressions import VALUE
from fiftyone.core.expressions import ViewExpression as E
from fiftyone.core.expressions import ViewField as F
import fiftyone.core.fields as fof
import fiftyone.core.media as fom
Expand Down Expand Up @@ -1332,6 +1333,171 @@ def to_mongo(self, sample_collection):
return pipeline


class Quantiles(Aggregation):
    """Computes the quantile(s) of the field values of a collection.

    ``None``-valued fields are ignored.

    This aggregation is typically applied to *numeric* field types (or lists of
    such types):

    -   :class:`fiftyone.core.fields.IntField`
    -   :class:`fiftyone.core.fields.FloatField`

    Each quantile is an actual element of the (sorted) data; no interpolation
    between neighboring values is performed.

    Examples::

        import fiftyone as fo
        from fiftyone import ViewField as F

        dataset = fo.Dataset()
        dataset.add_samples(
            [
                fo.Sample(
                    filepath="/path/to/image1.png",
                    numeric_field=1.0,
                    numeric_list_field=[1, 2, 3],
                ),
                fo.Sample(
                    filepath="/path/to/image2.png",
                    numeric_field=4.0,
                    numeric_list_field=[1, 2],
                ),
                fo.Sample(
                    filepath="/path/to/image3.png",
                    numeric_field=None,
                    numeric_list_field=None,
                ),
            ]
        )

        #
        # Compute the quantiles of a numeric field
        #

        aggregation = fo.Quantiles("numeric_field", [0.1, 0.5, 0.9])
        quantiles = dataset.aggregate(aggregation)
        print(quantiles)  # the quantiles

        #
        # Compute the quantiles of a numeric list field
        #

        aggregation = fo.Quantiles("numeric_list_field", [0.1, 0.5, 0.9])
        quantiles = dataset.aggregate(aggregation)
        print(quantiles)  # the quantiles

        #
        # Compute the quantiles of a transformation of a numeric field
        #

        aggregation = fo.Quantiles(
            2 * (F("numeric_field") + 1), [0.1, 0.5, 0.9]
        )
        quantiles = dataset.aggregate(aggregation)
        print(quantiles)  # the quantiles

    Args:
        field_or_expr: a field name, ``embedded.field.name``,
            :class:`fiftyone.core.expressions.ViewExpression`, or
            `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
            defining the field or expression to aggregate
        quantiles: the quantile or iterable of quantiles to compute. Each
            quantile must be a numeric value in ``[0, 1]``
        expr (None): a :class:`fiftyone.core.expressions.ViewExpression` or
            `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
            to apply to ``field_or_expr`` (which must be a field) before
            aggregating
        safe (False): whether to ignore nan/inf values when dealing with
            floating point values

    Raises:
        ValueError: if any quantile is not a number in ``[0, 1]``
    """

    def __init__(self, field_or_expr, quantiles, expr=None, safe=False):
        # Validate first so that invalid input fails before any state is set
        quantiles_list, is_scalar = self._parse_quantiles(quantiles)

        super().__init__(field_or_expr, expr=expr, safe=safe)

        # Store the normalized value rather than the raw argument: a one-shot
        # iterable (e.g., a generator) has already been consumed by
        # `_parse_quantiles()` and would otherwise be reported as empty by
        # `_kwargs()`
        self._quantiles = quantiles_list[0] if is_scalar else quantiles_list
        self._quantiles_list = quantiles_list
        self._is_scalar = is_scalar

    def _kwargs(self):
        """Returns the ``[name, value]`` pairs of this aggregation's
        constructor arguments.
        """
        return [
            ["field_or_expr", self._field_name],
            ["quantiles", self._quantiles],
            ["expr", self._expr],
            ["safe", self._safe],
        ]

    def default_result(self):
        """Returns the default result for this aggregation.

        Returns:
            ``None`` if a single quantile was requested, or a list containing
            one ``None`` per requested quantile
        """
        if self._is_scalar:
            return None

        return [None] * len(self._quantiles_list)

    def parse_result(self, d):
        """Parses the output of :meth:`to_mongo`.

        Args:
            d: the result dict

        Returns:
            the quantile or list of quantiles
        """
        if self._is_scalar:
            # A scalar request was wrapped in a one-element list internally
            return d["quantiles"][0]

        return d["quantiles"]

    def to_mongo(self, sample_collection):
        """Returns the MongoDB aggregation pipeline for this aggregation.

        Args:
            sample_collection: the sample collection to which the aggregation
                is being applied

        Returns:
            a MongoDB aggregation pipeline (list of dicts)
        """
        path, pipeline, _, id_to_str, _ = _parse_field_and_expr(
            sample_collection,
            self._field_name,
            expr=self._expr,
            safe=self._safe,
        )

        if id_to_str:
            value = {"$toString": "$" + path}
        else:
            value = "$" + path

        # Compute quantile
        # Note that we don't need to explicitly handle empty `values` here
        # because the `group` stage only outputs a document if there's at least
        # one value to compute on!
        array = F("values").sort(numeric=True)

        # For each requested quantile `q` (the `F()` being mapped over), pick
        # the element at index `ceil(q * len) - 1`, clamped to 0 so that
        # `q == 0` selects the smallest value
        idx = ((F() * array.length()).ceil() - 1).max(0)
        quantile_expr = array.let_in(E(self._quantiles_list).map(array[idx]))

        pipeline.extend(
            [
                # Discard non-numeric values (including `None`)
                {"$match": {"$expr": {"$isNumber": value}}},
                # Gather all remaining values into a single array
                {"$group": {"_id": None, "values": {"$push": value}}},
                {"$project": {"quantiles": quantile_expr.to_mongo()}},
            ]
        )

        return pipeline

    def _parse_quantiles(self, quantiles):
        """Normalizes and validates the user-provided quantile(s).

        Args:
            quantiles: a quantile or iterable of quantiles

        Returns:
            a ``(quantiles_list, is_scalar)`` tuple, where ``quantiles_list``
            is always a list and ``is_scalar`` records whether a single
            non-container value was provided

        Raises:
            ValueError: if any quantile is not a number in ``[0, 1]``
        """
        is_scalar = not etau.is_container(quantiles)

        if is_scalar:
            quantiles = [quantiles]
        else:
            quantiles = list(quantiles)

        # `not (0 <= q <= 1)` rather than `q < 0 or q > 1` so that NaN, for
        # which every comparison is False, is rejected too
        if any(
            not etau.is_numeric(q) or not (0 <= q <= 1) for q in quantiles
        ):
            raise ValueError(
                "Quantiles must be numbers in [0, 1]; found %s" % quantiles
            )

        return quantiles, is_scalar


class Std(Aggregation):
"""Computes the standard deviation of the field values of a collection.

Expand Down
81 changes: 81 additions & 0 deletions fiftyone/core/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -5786,6 +5786,87 @@ def mean(self, field_or_expr, expr=None, safe=False):
)
return self._make_and_aggregate(make, field_or_expr)

@aggregation
def quantiles(self, field_or_expr, quantiles, expr=None, safe=False):
    """Computes the quantile(s) of the field values of a collection.

    ``None``-valued fields are ignored.

    This aggregation is typically applied to *numeric* field types (or
    lists of such types):

    -   :class:`fiftyone.core.fields.IntField`
    -   :class:`fiftyone.core.fields.FloatField`

    Examples::

        import fiftyone as fo
        from fiftyone import ViewField as F

        dataset = fo.Dataset()
        dataset.add_samples(
            [
                fo.Sample(
                    filepath="/path/to/image1.png",
                    numeric_field=1.0,
                    numeric_list_field=[1, 2, 3],
                ),
                fo.Sample(
                    filepath="/path/to/image2.png",
                    numeric_field=4.0,
                    numeric_list_field=[1, 2],
                ),
                fo.Sample(
                    filepath="/path/to/image3.png",
                    numeric_field=None,
                    numeric_list_field=None,
                ),
            ]
        )

        #
        # Compute the quantiles of a numeric field
        #

        quantiles = dataset.quantiles("numeric_field", [0.1, 0.5, 0.9])
        print(quantiles)  # the quantiles

        #
        # Compute the quantiles of a numeric list field
        #

        quantiles = dataset.quantiles(
            "numeric_list_field", [0.1, 0.5, 0.9]
        )
        print(quantiles)  # the quantiles

        #
        # Compute the quantiles of a transformation of a numeric field
        #

        quantiles = dataset.quantiles(
            2 * (F("numeric_field") + 1), [0.1, 0.5, 0.9]
        )
        print(quantiles)  # the quantiles

    Args:
        field_or_expr: a field name, ``embedded.field.name``,
            :class:`fiftyone.core.expressions.ViewExpression`, or
            `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
            defining the field or expression to aggregate
        quantiles: the quantile or iterable of quantiles to compute. Each
            quantile must be a numeric value in ``[0, 1]``
        expr (None): a :class:`fiftyone.core.expressions.ViewExpression` or
            `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_
            to apply to ``field_or_expr`` (which must be a field) before
            aggregating
        safe (False): whether to ignore nan/inf values when dealing with
            floating point values

    Returns:
        the quantile or list of quantiles
    """

    def make(field_or_expr):
        # Factory handed to `_make_and_aggregate()`: builds the aggregation
        # for whichever field/expression it is called with, reusing the
        # remaining arguments of this call
        return foa.Quantiles(field_or_expr, quantiles, expr=expr, safe=safe)

    return self._make_and_aggregate(make, field_or_expr)

@aggregation
def std(self, field_or_expr, expr=None, safe=False, sample=False):
"""Computes the standard deviation of the field values of the
Expand Down
16 changes: 13 additions & 3 deletions fiftyone/core/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2584,7 +2584,7 @@ def reverse(self):
"""
return ViewExpression({"$reverseArray": self})

def sort(self, key=None, reverse=False):
def sort(self, key=None, numeric=False, reverse=False):
"""Sorts this expression, which must resolve to an array.

If no ``key`` is provided, this array must contain elements whose
Expand Down Expand Up @@ -2628,7 +2628,7 @@ def sort(self, key=None, reverse=False):

view = dataset.set_field(
"predictions.detections",
F("detections").sort(key="confidence", reverse=True)
F("detections").sort(key="confidence", numeric=True, reverse=True)
)

sample = view.first()
Expand All @@ -2637,13 +2637,23 @@ def sort(self, key=None, reverse=False):

Args:
key (None): an optional field or ``embedded.field.name`` to sort by
numeric (False): whether the array contains numeric values. By
default, the values will be sorted alphabetically by their
string representations
reverse (False): whether to sort in descending order

Returns:
a :class:`ViewExpression`
"""
if key is not None:
comp = "(a, b) => a.{key} - b.{key}".format(key=key)
if numeric:
comp = "(a, b) => a.{key} - b.{key}"
else:
comp = "(a, b) => ('' + a.{key}).localeCompare(b.{key})"

comp = comp.format(key=key)
elif numeric:
comp = "(a, b) => a - b"
else:
comp = ""

Expand Down
Loading