feat: Add calculate_metrics and MetricsResult (#6680)
* Add calculate_metrics, MetricsResult, Exact Match

* Add additional tests for metric calculation

* Add release notes

* Add docstring for Exact Match metric

* Remove Exact Match Implementation

* Update release notes

* Remove unnecessary metrics implementation

* Simplify logic to run supported metrics

* Add some evaluation tests

* Fix linting

---------

Co-authored-by: Silvano Cerza <[email protected]>
3 people authored Jan 10, 2024
1 parent e6d6ce1 commit 374a937
Showing 7 changed files with 165 additions and 9 deletions.
13 changes: 12 additions & 1 deletion e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -1,12 +1,15 @@
import json

from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import Document, ExtractedAnswer
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_extractive_qa_pipeline():
def test_extractive_qa_pipeline(tmp_path):
# Create the pipeline
qa_pipeline = Pipeline()
qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
@@ -123,3 +126,11 @@ def test_extractive_qa_pipeline():
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)
23 changes: 21 additions & 2 deletions e2e/pipelines/test_eval_rag_pipelines.py
@@ -1,3 +1,5 @@
import json

from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
@@ -8,9 +10,10 @@
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_bm25_rag_pipeline():
def test_bm25_rag_pipeline(tmp_path):
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
@@ -68,8 +71,16 @@ def test_bm25_rag_pipeline():
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)


def test_embedding_retrieval_rag_pipeline():
def test_embedding_retrieval_rag_pipeline(tmp_path):
# Create the RAG pipeline
prompt_template = """
Given these documents, answer the question.\nDocuments:
@@ -143,3 +154,11 @@ def test_embedding_retrieval_rag_pipeline():
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)
3 changes: 2 additions & 1 deletion haystack/evaluation/__init__.py
@@ -1,3 +1,4 @@
from haystack.evaluation.eval import EvaluationResult, eval
from haystack.evaluation.metrics import Metric, MetricsResult

__all__ = ["eval", "EvaluationResult"]
__all__ = ["eval", "EvaluationResult", "Metric", "MetricsResult"]
55 changes: 50 additions & 5 deletions haystack/evaluation/eval.py
@@ -1,12 +1,14 @@
from typing import Any, Dict, List, Union
from typing import Any, Callable, Dict, List, Union

from haystack import Pipeline
from haystack.core.component import Component
from haystack.evaluation.metrics import Metric, MetricsResult


class EvaluationResult:
"""
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs.
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or
component), inputs, outputs, and expected outputs.
The EvaluationResult keeps track of all the information stored by eval.
:param runnable: The runnable (Pipeline or component) used for evaluation.
@@ -27,6 +29,48 @@ def __init__(
self.outputs = outputs
self.expected_outputs = expected_outputs

# Mapping of metrics to their corresponding functions.
# This should be kept in sync with the Metric enum
self._supported_metrics = {
Metric.RECALL: self._calculate_recall,
Metric.MRR: self._calculate_mrr,
Metric.MAP: self._calculate_map,
Metric.F1: self._calculate_f1,
Metric.EM: self._calculate_em,
Metric.SAS: self._calculate_sas,
}

def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult:
"""
Calculate evaluation metrics based on the provided Metric or using the custom metric function.
:param metric: The Metric indicating the type of metric to calculate or custom function to compute.
:return: MetricsResult containing the calculated metric.
"""

if isinstance(metric, Metric):
return self._supported_metrics[metric](**kwargs)

return metric(self, **kwargs)

def _calculate_recall(self):
return MetricsResult({"recall": None})

def _calculate_map(self):
return MetricsResult({"mean_average_precision": None})

def _calculate_mrr(self):
return MetricsResult({"mean_reciprocal_rank": None})

def _calculate_f1(self):
return MetricsResult({"f1": None})

def _calculate_em(self):
return MetricsResult({"exact_match": 1.0})

def _calculate_sas(self):
return MetricsResult({"exact_match": None})


def eval(
runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
@@ -41,16 +85,17 @@ def eval(
:param inputs: List of inputs used for evaluation.
:param expected_outputs: List of expected outputs used for evaluation.
:return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs.
:return: An instance of EvaluationResult containing information about the evaluation, including the runnable,
inputs, outputs, and expected outputs.
"""

outputs = []

# Check that expected outputs has the correct shape
if len(inputs) != len(expected_outputs):
raise ValueError(
f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)}). "
" Please ensure that each input has a corresponding expected output."
f"The number of inputs ({len(inputs)}) does not match the number of expected outputs "
f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output."
)

for input_ in inputs:
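Besides the built-in `Metric` values, `calculate_metrics` also accepts any callable that takes the `EvaluationResult` followed by keyword arguments and returns a `MetricsResult` (it is invoked as `metric(self, **kwargs)`). A minimal sketch of that extension point; the metric name `sample_count` and the `label` keyword are illustrative choices, not part of the commit:

```python
from haystack import Pipeline
from haystack.evaluation.eval import EvaluationResult, eval
from haystack.evaluation.metrics import MetricsResult


def sample_count(eval_result: EvaluationResult, label: str = "sample_count") -> MetricsResult:
    # Hypothetical custom metric: report how many inputs were evaluated.
    return MetricsResult({label: len(eval_result.inputs)})


# Empty pipeline and empty input lists only keep the sketch self-contained.
eval_result = eval(Pipeline(), inputs=[], expected_outputs=[])

# Extra keyword arguments are forwarded to the custom callable.
metrics = eval_result.calculate_metrics(sample_count, label="n_samples")
assert metrics == {"n_samples": 0}
```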
32 changes: 32 additions & 0 deletions haystack/evaluation/metrics.py
@@ -0,0 +1,32 @@
import json
from enum import Enum
from pathlib import Path
from typing import Union


class Metric(Enum):
"""
Contains a list of standard metrics available.
"""

RECALL = "Recall"
MRR = "Mean Reciprocal Rank"
MAP = "Mean Average Precision"
F1 = "F1"
EM = "Exact Match"
SAS = "Semantic Answer Similarity"


class MetricsResult(dict):
"""
Stores the metric values computed during the evaluation.
"""

def save(self, file: Union[str, Path]):
"""
Save the metrics stored in the MetricsResult to a json file.
:param file: The file path or file name to save the data.
"""
with open(file, "w") as outfile:
json.dump(self, outfile, indent=4)
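Since `MetricsResult` is just a `dict` subclass with a `save()` helper, a saved file round-trips through `json.load` — the same check the e2e tests above perform. A small sketch; the file name is illustrative:

```python
import json
from pathlib import Path

from haystack.evaluation.metrics import MetricsResult

metrics = MetricsResult({"exact_match": 1.0})
assert metrics["exact_match"] == 1.0  # plain dict access

# save() writes the mapping as JSON, so reloading yields an equal dict.
path = Path("exact_match_score.json")  # illustrative location
metrics.save(path)
with open(path) as f:
    assert metrics == json.load(f)
```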
@@ -0,0 +1,6 @@
---
features:
- |
Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics.
Adds `Metric` class to store list of available metrics.
Adds `MetricsResult` class to store the metric values computed during the evaluation.
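As exercised in the e2e tests above, the new API roughly reads as follows. The empty pipeline and empty input lists only keep the sketch self-contained; a real run would pass a built pipeline with matching inputs and expected outputs:

```python
from haystack import Pipeline
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric

# Empty pipeline/inputs keep the example minimal; substitute a real pipeline in practice.
eval_result = eval(Pipeline(), inputs=[], expected_outputs=[])

metrics = eval_result.calculate_metrics(Metric.EM)  # placeholder implementation returns {"exact_match": 1.0}
metrics.save("exact_match_score.json")              # dump the scores to JSON
```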
42 changes: 42 additions & 0 deletions test/evaluation/test_eval.py
@@ -0,0 +1,42 @@
from unittest.mock import MagicMock

from haystack.core.pipeline import Pipeline
from haystack.evaluation.eval import EvaluationResult
from haystack.evaluation.metrics import Metric


class TestEvaluationResult:
def test_init(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

assert result.runnable == runnable
assert result.inputs == []
assert result.outputs == []
assert result.expected_outputs == []

def test_supported_metrics_contains_all_metrics(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

supported_metrics = [m.name for m in result._supported_metrics.keys()]
all_metric_names = [m.name for m in Metric]
assert supported_metrics == all_metric_names

def test_calculate_metrics_with_supported_metric(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])
result._supported_metrics[Metric.RECALL] = MagicMock()
result.calculate_metrics(metric=Metric.RECALL)

assert result._supported_metrics[Metric.RECALL].called_once_with()

def test_calculate_metrics_with_non_supported_metric(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

unsupported_metric = MagicMock()

result.calculate_metrics(metric=unsupported_metric, some_argument="some_value")

assert unsupported_metric.called_once_with(some_argument="some_value")
