diff --git a/docs/source/user_guide/using_aggregations.rst b/docs/source/user_guide/using_aggregations.rst index ce07750d10..03faccc11d 100644 --- a/docs/source/user_guide/using_aggregations.rst +++ b/docs/source/user_guide/using_aggregations.rst @@ -282,6 +282,35 @@ collection: print(dataset.mean("predictions.detections.confidence")) # 0.34994137249820706 +.. _aggregations-quantiles: + +Quantiles +_________ + +You can use the +:meth:`quantiles() ` +aggregation to compute the quantile(s) of the (non-``None``) values of a field +in a collection: + +.. code-block:: python + :linenos: + + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart") + + # Compute quantiles of the `uniqueness` field + print(dataset.quantiles("uniqueness", [0.25, 0.5, 0.75, 0.9])) + # [0.22027, 0.33771, 0.62554, 0.69488] + + # Compute quantiles of detection confidence in the `predictions` field + quantiles = dataset.quantiles( + "predictions.detections.confidence", + [0.25, 0.5, 0.75, 0.9], + ) + print(quantiles) + # [0.09231, 0.20251, 0.56273, 0.94354] + .. _aggregations-std: Standard deviation diff --git a/fiftyone/__public__.py b/fiftyone/__public__.py index 4326feb85c..813d20dbef 100644 --- a/fiftyone/__public__.py +++ b/fiftyone/__public__.py @@ -22,6 +22,7 @@ Distinct, HistogramValues, Mean, + Quantiles, Std, Sum, Values, diff --git a/fiftyone/core/aggregations.py b/fiftyone/core/aggregations.py index 16703a6252..4b3e31b77d 100644 --- a/fiftyone/core/aggregations.py +++ b/fiftyone/core/aggregations.py @@ -17,6 +17,7 @@ import fiftyone.core.expressions as foe from fiftyone.core.expressions import VALUE +from fiftyone.core.expressions import ViewExpression as E from fiftyone.core.expressions import ViewField as F import fiftyone.core.fields as fof import fiftyone.core.media as fom @@ -1332,6 +1333,171 @@ def to_mongo(self, sample_collection): return pipeline +class Quantiles(Aggregation): + """Computes the quantile(s) of the field values of a collection. 
+ + ``None``-valued fields are ignored. + + This aggregation is typically applied to *numeric* field types (or lists of + such types): + + - :class:`fiftyone.core.fields.IntField` + - :class:`fiftyone.core.fields.FloatField` + + Examples:: + + import fiftyone as fo + from fiftyone import ViewField as F + + dataset = fo.Dataset() + dataset.add_samples( + [ + fo.Sample( + filepath="/path/to/image1.png", + numeric_field=1.0, + numeric_list_field=[1, 2, 3], + ), + fo.Sample( + filepath="/path/to/image2.png", + numeric_field=4.0, + numeric_list_field=[1, 2], + ), + fo.Sample( + filepath="/path/to/image3.png", + numeric_field=None, + numeric_list_field=None, + ), + ] + ) + + # + # Compute the quantiles of a numeric field + # + + aggregation = fo.Quantiles("numeric_field", [0.1, 0.5, 0.9]) + quantiles = dataset.aggregate(aggregation) + print(quantiles) # the quantiles + + # + # Compute the quantiles of a numeric list field + # + + aggregation = fo.Quantiles("numeric_list_field", [0.1, 0.5, 0.9]) + quantiles = dataset.aggregate(aggregation) + print(quantiles) # the quantiles + + # + # Compute the quantiles of a transformation of a numeric field + # + + aggregation = fo.Quantiles(2 * (F("numeric_field") + 1), [0.1, 0.5, 0.9]) + quantiles = dataset.aggregate(aggregation) + print(quantiles) # the quantiles + + Args: + field_or_expr: a field name, ``embedded.field.name``, + :class:`fiftyone.core.expressions.ViewExpression`, or + `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_ + defining the field or expression to aggregate + quantiles: the quantile or iterable of quantiles to compute. 
Each + quantile must be a numeric value in ``[0, 1]`` + expr (None): a :class:`fiftyone.core.expressions.ViewExpression` or + `MongoDB expression `_ + to apply to ``field_or_expr`` (which must be a field) before + aggregating + safe (False): whether to ignore nan/inf values when dealing with + floating point values + """ + + def __init__(self, field_or_expr, quantiles, expr=None, safe=False): + quantiles_list, is_scalar = self._parse_quantiles(quantiles) + + super().__init__(field_or_expr, expr=expr, safe=safe) + self._quantiles = quantiles + + self._quantiles_list = quantiles_list + self._is_scalar = is_scalar + + def _kwargs(self): + return [ + ["field_or_expr", self._field_name], + ["quantiles", self._quantiles], + ["expr", self._expr], + ["safe", self._safe], + ] + + def default_result(self): + """Returns the default result for this aggregation. + + Returns: + ``None`` or ``[None, None, None]`` + """ + if self._is_scalar: + return None + + return [None] * len(self._quantiles_list) + + def parse_result(self, d): + """Parses the output of :meth:`to_mongo`. + + Args: + d: the result dict + + Returns: + the quantile or list of quantiles + """ + if self._is_scalar: + return d["quantiles"][0] + + return d["quantiles"] + + def to_mongo(self, sample_collection): + path, pipeline, _, id_to_str, _ = _parse_field_and_expr( + sample_collection, + self._field_name, + expr=self._expr, + safe=self._safe, + ) + + if id_to_str: + value = {"$toString": "$" + path} + else: + value = "$" + path + + # Compute quantile + # Note that we don't need to explicitly handle empty `values` here + # because the `group` stage only outputs a document if there's at least + # one value to compute on! 
+ array = F("values").sort(numeric=True) + idx = ((F() * array.length()).ceil() - 1).max(0) + quantile_expr = array.let_in(E(self._quantiles_list).map(array[idx])) + + pipeline.extend( + [ + {"$match": {"$expr": {"$isNumber": value}}}, + {"$group": {"_id": None, "values": {"$push": value}}}, + {"$project": {"quantiles": quantile_expr.to_mongo()}}, + ] + ) + + return pipeline + + def _parse_quantiles(self, quantiles): + is_scalar = not etau.is_container(quantiles) + + if is_scalar: + quantiles = [quantiles] + else: + quantiles = list(quantiles) + + if any(not etau.is_numeric(q) or q < 0 or q > 1 for q in quantiles): + raise ValueError( + "Quantiles must be numbers in [0, 1]; found %s" % quantiles + ) + + return quantiles, is_scalar + + class Std(Aggregation): """Computes the standard deviation of the field values of a collection. diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py index 3f748c00f0..60df3fc00c 100644 --- a/fiftyone/core/collections.py +++ b/fiftyone/core/collections.py @@ -5786,6 +5786,87 @@ def mean(self, field_or_expr, expr=None, safe=False): ) return self._make_and_aggregate(make, field_or_expr) + @aggregation + def quantiles(self, field_or_expr, quantiles, expr=None, safe=False): + """Computes the quantile(s) of the field values of a collection. + + ``None``-valued fields are ignored. 
+ + This aggregation is typically applied to *numeric* field types (or + lists of such types): + + - :class:`fiftyone.core.fields.IntField` + - :class:`fiftyone.core.fields.FloatField` + + Examples:: + + import fiftyone as fo + from fiftyone import ViewField as F + + dataset = fo.Dataset() + dataset.add_samples( + [ + fo.Sample( + filepath="/path/to/image1.png", + numeric_field=1.0, + numeric_list_field=[1, 2, 3], + ), + fo.Sample( + filepath="/path/to/image2.png", + numeric_field=4.0, + numeric_list_field=[1, 2], + ), + fo.Sample( + filepath="/path/to/image3.png", + numeric_field=None, + numeric_list_field=None, + ), + ] + ) + + # + # Compute the quantiles of a numeric field + # + + quantiles = dataset.quantiles("numeric_field", [0.1, 0.5, 0.9]) + print(quantiles) # the quantiles + + # + # Compute the quantiles of a numeric list field + # + + quantiles = dataset.quantiles("numeric_list_field", [0.1, 0.5, 0.9]) + print(quantiles) # the quantiles + + # + # Compute the quantiles of a transformation of a numeric field + # + + quantiles = dataset.quantiles(2 * (F("numeric_field") + 1), [0.1, 0.5, 0.9]) + print(quantiles) # the quantiles + + Args: + field_or_expr: a field name, ``embedded.field.name``, + :class:`fiftyone.core.expressions.ViewExpression`, or + `MongoDB expression <https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#aggregation-expressions>`_ + defining the field or expression to aggregate + quantiles: the quantile or iterable of quantiles to compute. 
Each + quantile must be a numeric value in ``[0, 1]`` + expr (None): a :class:`fiftyone.core.expressions.ViewExpression` or + `MongoDB expression `_ + to apply to ``field_or_expr`` (which must be a field) before + aggregating + safe (False): whether to ignore nan/inf values when dealing with + floating point values + + Returns: + the quantile or list of quantiles + """ + make = lambda field_or_expr: foa.Quantiles( + field_or_expr, quantiles, expr=expr, safe=safe + ) + return self._make_and_aggregate(make, field_or_expr) + @aggregation def std(self, field_or_expr, expr=None, safe=False, sample=False): """Computes the standard deviation of the field values of the diff --git a/fiftyone/core/expressions.py b/fiftyone/core/expressions.py index 8e33177947..10e3c8f24c 100644 --- a/fiftyone/core/expressions.py +++ b/fiftyone/core/expressions.py @@ -2584,7 +2584,7 @@ def reverse(self): """ return ViewExpression({"$reverseArray": self}) - def sort(self, key=None, reverse=False): + def sort(self, key=None, numeric=False, reverse=False): """Sorts this expression, which must resolve to an array. If no ``key`` is provided, this array must contain elements whose @@ -2628,7 +2628,7 @@ def sort(self, key=None, reverse=False): view = dataset.set_field( "predictions.detections", - F("detections").sort(key="confidence", reverse=True) + F("detections").sort(key="confidence", numeric=True, reverse=True) ) sample = view.first() @@ -2637,13 +2637,23 @@ def sort(self, key=None, reverse=False): Args: key (None): an optional field or ``embedded.field.name`` to sort by + numeric (False): whether the array contains numeric values. 
By + default, the values will be sorted alphabetically by their + string representations reverse (False): whether to sort in descending order Returns: a :class:`ViewExpression` """ if key is not None: - comp = "(a, b) => a.{key} - b.{key}".format(key=key) + if numeric: + comp = "(a, b) => a.{key} - b.{key}" + else: + comp = "(a, b) => ('' + a.{key}).localeCompare(b.{key})" + + comp = comp.format(key=key) + elif numeric: + comp = "(a, b) => a - b" else: comp = "" diff --git a/tests/unittests/aggregation_tests.py b/tests/unittests/aggregation_tests.py index 5ef0b84a77..cb21d65b7c 100644 --- a/tests/unittests/aggregation_tests.py +++ b/tests/unittests/aggregation_tests.py @@ -9,6 +9,7 @@ import math from bson import ObjectId +import numpy as np import unittest import fiftyone as fo @@ -220,6 +221,51 @@ def test_mean(self): self.assertAlmostEqual(d.mean(2.0 * (F("numeric_field") + 1)), 6.0) + @drop_datasets + def test_quantiles(self): + d = fo.Dataset() + d.add_sample_field("numeric_field", fo.IntField) + self.assertIsNone(d.quantiles("numeric_field", 0.5)) + self.assertListEqual(d.quantiles("numeric_field", [0.5]), [None]) + + s = fo.Sample(filepath="image.jpeg", numeric_field=1) + d.add_sample(s) + self.assertAlmostEqual(d.quantiles("numeric_field", 0.5), 1) + + s = fo.Sample(filepath="image2.jpeg", numeric_field=2) + d.add_sample(s) + + q = np.linspace(0, 1, 11) + + results1 = d.quantiles("numeric_field", q) + + # only available in `numpy>=1.22` + # results2 = np.quantile([1, 2], q, method="inverted_cdf") + results2 = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + + self.assertEqual(len(results1), len(results2)) + for r1, r2 in zip(results1, results2): + self.assertAlmostEqual(r1, r2) + + results1 = d.quantiles(2.0 * (F("numeric_field") + 1), q) + + # only available in `numpy>=1.22` + # results2 = np.quantile([4, 6], q, method="inverted_cdf") + results2 = [4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6] + + self.assertEqual(len(results1), len(results2)) + for r1, r2 in zip(results1, 
results2): + self.assertAlmostEqual(r1, r2) + + with self.assertRaises(ValueError): + d.quantiles("numeric_field", "bad-value") + + with self.assertRaises(ValueError): + d.quantiles("numeric_field", -1) + + with self.assertRaises(ValueError): + d.quantiles("numeric_field", 2) + @drop_datasets def test_std(self): d = fo.Dataset() @@ -443,6 +489,11 @@ def test_nan_inf(self): self.assertTrue(math.isnan(dataset.mean("float"))) self.assertTrue(math.isnan(dataset.sum("float"))) self.assertTrue(math.isnan(dataset.std("float"))) + self.assertTrue(math.isnan(dataset.quantiles("float", 0))) + self.assertTrue(math.isnan(dataset.quantiles("float", 0.25))) + self.assertTrue(math.isinf(dataset.quantiles("float", 0.50))) + self.assertAlmostEqual(dataset.quantiles("float", 0.75), 1.0) + self.assertTrue(math.isinf(dataset.quantiles("float", 1))) counts, edges, other = dataset.histogram_values("float") self.assertEqual(other, 5) # captures None, nan, inf @@ -460,6 +511,10 @@ def test_nan_inf(self): self.assertAlmostEqual(dataset.sum("float", safe=True), 1.0) self.assertAlmostEqual(dataset.std("float", safe=True), 0.0) + self.assertAlmostEqual(dataset.quantiles("float", 0, safe=True), 1.0) + self.assertAlmostEqual(dataset.quantiles("float", 1, safe=True), 1.0) + self.assertAlmostEqual(dataset.quantiles("float", 0.5, safe=True), 1.0) + @drop_datasets def test_object_ids(self): dataset = fo.Dataset()