deepset-ai · silvanocerza · Feb 20, 2024 · Feb 20, 2024 · Feb 20, 2024 · Feb 20, 2024
@@ -19,6 +19,7 @@ class StatisticalMetric(Enum):
     EM = "exact_match"
     RECALL_SINGLE_HIT = "recall_single_hit"
     RECALL_MULTI_HIT = "recall_multi_hit"
+    MRR = "mean_reciprocal_rank"
 
     @classmethod
     def from_str(cls, metric: str) -> "StatisticalMetric":
@@ -55,6 +56,7 @@ def __init__(self, metric: Union[str, StatisticalMetric]):
             StatisticalMetric.EM: self._exact_match,
             StatisticalMetric.RECALL_SINGLE_HIT: self._recall_single_hit,
             StatisticalMetric.RECALL_MULTI_HIT: self._recall_multi_hit,
+            StatisticalMetric.MRR: self._mrr,
         }[self._metric]
 
     def to_dict(self) -> Dict[str, Any]:
@@ -111,7 +113,7 @@ def _f1(labels: List[str], predictions: List[str]):
     @staticmethod
     def _exact_match(labels: List[str], predictions: List[str]) -> float:
         """
-        Measure the proportion of cases where predictiond is identical to the the expected label.
+        Measure the proportion of cases where prediction is identical to the expected label.
         """
         if len(labels) != len(predictions):
             raise ValueError("The number of predictions and labels must be the same.")
@@ -150,3 +152,20 @@ def _recall_multi_hit(labels: List[str], predictions: List[str]) -> float:
                 correct_retrievals += 1
 
         return correct_retrievals / len(labels)
+
+    @staticmethod
+    def _mrr(labels: List[str], predictions: List[str]) -> float:
+        """
+        Compute the mean reciprocal rank: each label scores 1/rank of the first prediction containing it (0 if none).
+        """
+        if len(labels) == 0:
+            return 0.0  # no labels to average over; also avoids dividing by zero below
+
+        mrr_sum = 0.0
+        for label in labels:
+            for rank, prediction in enumerate(predictions):
+                if label in prediction:  # for str inputs this is a substring test, not equality
+                    mrr_sum += 1 / (rank + 1)  # enumerate is 0-based, ranks are 1-based
+                    break  # only the first matching prediction counts for this label
+
+        return mrr_sum / len(labels)
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    Add support for Mean Reciprocal Rank (MRR) Metric to `StatisticalEvaluator`.
+    MRR averages, over all labels, the reciprocal of the rank of the first prediction that contains the label; labels found in no prediction contribute 0.
@@ -189,3 +189,37 @@ def test_run_with_empty_predictions(self):
         result = evaluator.run(labels=labels, predictions=[])
         assert len(result) == 1
         assert result["result"] == 0.0
+
+
+class TestStatisticalEvaluatorMRR:  # exercises the MRR metric through StatisticalEvaluator.run
+    def test_run(self):
+        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
+        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
+        predictions = [
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+            "The Eiffel Tower max height is 330 meters.",
+            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
+            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
+        ]
+        result = evaluator.run(labels=labels, predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == 1 / 3  # first hits at ranks 1 and 3, two labels never appear: (1 + 1/3 + 0 + 0) / 4
+
+    def test_run_with_empty_labels(self):
+        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
+        predictions = [
+            "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.",
+            "The Eiffel Tower max height is 330 meters.",
+            "Louvre Museum is the world's largest art museum and a historic monument in Paris, France.",
+            "The Leaning Tower of Pisa is the campanile, or freestanding bell tower, of Pisa Cathedral.",
+        ]
+        result = evaluator.run(labels=[], predictions=predictions)
+        assert len(result) == 1
+        assert result["result"] == 0.0  # with no labels the metric is expected to be 0.0
+
+    def test_run_with_empty_predictions(self):
+        evaluator = StatisticalEvaluator(metric=StatisticalMetric.MRR)
+        labels = ["Eiffel Tower", "Louvre Museum", "Colosseum", "Trajan's Column"]
+        result = evaluator.run(labels=labels, predictions=[])
+        assert len(result) == 1
+        assert result["result"] == 0.0  # with no predictions no label can be matched