From cab6583def7e6834b0b0214680953495a5a7c02b Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:56:56 +0530 Subject: [PATCH 01/10] Add calculate_metrics, MetricsResult, Exact Match --- .../test_eval_extractive_qa_pipeline.py | 35 ++--- e2e/pipelines/test_eval_rag_pipelines.py | 135 ++++++++++++++++-- haystack/evaluation/__init__.py | 3 +- haystack/evaluation/eval.py | 95 +++++++++++- haystack/evaluation/eval_utils.py | 32 +++++ haystack/evaluation/metrics.py | 28 ++++ 6 files changed, 301 insertions(+), 27 deletions(-) create mode 100644 haystack/evaluation/eval_utils.py create mode 100644 haystack/evaluation/metrics.py diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py index 201a84e139..b9be2f2752 100644 --- a/e2e/pipelines/test_eval_extractive_qa_pipeline.py +++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py @@ -1,12 +1,15 @@ +import json + from haystack import Pipeline from haystack.components.readers import ExtractiveReader from haystack.components.retrievers import InMemoryBM25Retriever from haystack.dataclasses import Document, ExtractedAnswer from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval +from haystack.evaluation.metrics import Metric -def test_extractive_qa_pipeline(): +def test_extractive_qa_pipeline(tmp_path): # Create the pipeline qa_pipeline = Pipeline() qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever") @@ -32,11 +35,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Paris?", score=0.7713339924812317, data="Jean and I", - document=Document( - id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e", - content="My name is Jean and I live in Paris.", - score=0.33144005810482535, - ), + document=Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -62,11 +61,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Berlin?", score=0.7047999501228333, data="Mark and I", - document=Document( - id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20", - content="My name is Mark and I live in Berlin.", - score=0.33144005810482535, - ), + document=Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -92,11 +87,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Rome?", score=0.7661304473876953, data="Giorgio and I", - document=Document( - id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27", - content="My name is Giorgio and I live in Rome.", - score=0.33144005810482535, - ), + document=Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=24), context_offset=None, @@ -123,3 +114,15 @@ def test_extractive_qa_pipeline(): assert eval_result.expected_outputs == expected_outputs assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == qa_pipeline.to_dict() + + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json 
+ metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == json.load(f) diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py index 7ff365dfad..5fe4716f57 100644 --- a/e2e/pipelines/test_eval_rag_pipelines.py +++ b/e2e/pipelines/test_eval_rag_pipelines.py @@ -1,3 +1,5 @@ +import json + from haystack import Pipeline from haystack.components.builders.answer_builder import AnswerBuilder from haystack.components.builders.prompt_builder import PromptBuilder @@ -5,12 +7,13 @@ from haystack.components.generators import HuggingFaceLocalGenerator from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever from haystack.components.writers import DocumentWriter -from haystack.dataclasses import Document +from haystack.dataclasses import Document, GeneratedAnswer from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval +from haystack.evaluation.metrics import Metric -def test_bm25_rag_pipeline(): +def test_bm25_rag_pipeline(tmp_path): prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in documents %} @@ -56,9 +59,54 @@ def test_bm25_rag_pipeline(): ] expected_outputs = [ - {"llm": {"replies": ["Jean"]}}, - {"llm": {"replies": ["Mark"]}}, - {"llm": {"replies": ["Giorgio"]}}, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Jean", + query="Who lives in Paris?", + documents=[ + Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Mark", + query="Who lives in Berlin?", + documents=[ + Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Giorgio", + query="Who lives in Rome?", + documents=[ + Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -68,8 +116,20 @@ def test_bm25_rag_pipeline(): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json + metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == 
json.load(f) + -def test_embedding_retrieval_rag_pipeline(): +def test_embedding_retrieval_rag_pipeline(tmp_path): # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: @@ -132,9 +192,54 @@ def test_embedding_retrieval_rag_pipeline(): ] expected_outputs = [ - {"llm": {"replies": ["Jean"]}}, - {"llm": {"replies": ["Mark"]}}, - {"llm": {"replies": ["Giorgio"]}}, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Jean", + query="Who lives in Paris?", + documents=[ + Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Mark", + query="Who lives in Berlin?", + documents=[ + Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Giorgio", + query="Who lives in Rome?", + documents=[ + Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -143,3 +248,15 @@ def test_embedding_retrieval_rag_pipeline(): assert eval_result.expected_outputs == expected_outputs assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() + + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json + metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == json.load(f) diff --git a/haystack/evaluation/__init__.py b/haystack/evaluation/__init__.py index 090aadc2cd..7313d1efea 100644 --- a/haystack/evaluation/__init__.py +++ b/haystack/evaluation/__init__.py @@ -1,3 +1,4 @@ from haystack.evaluation.eval import EvaluationResult, eval +from haystack.evaluation.metrics import Metric, MetricsResult -__all__ = ["eval", "EvaluationResult"] +__all__ = ["eval", "EvaluationResult", "Metric", "MetricsResult"] diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 0f9f1e8792..c2c77c11fd 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,7 +1,13 @@ -from typing import Any, Dict, List, Union +import re +import string +from typing import Any, Callable, Dict, List, Union + +import numpy as np from haystack import Pipeline from haystack.core.component import Component +from haystack.evaluation.eval_utils import get_answers_from_output +from haystack.evaluation.metrics import Metric, MetricsResult class EvaluationResult: @@ -27,6 +33,93 @@ def 
__init__( self.outputs = outputs self.expected_outputs = expected_outputs + # Determine the type of the runnable + if str(type(runnable).__name__) == "Pipeline": + self.runnable_type = "pipeline" + else: + self.runnable_type = "component" + + # pylint: disable=too-many-return-statements + def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: + """ + Calculate evaluation metrics based on the provided Metric or using the custom metric function. + + :param metric: The Metric indicating the type of metric to calculate or custom function to compute. + :return: MetricsResult containing the calculated metric. + """ + if metric == Metric.RECALL: + return self._calculate_recall(**kwargs) + + elif metric == Metric.F1: + return self._calculate_f1(**kwargs) + + elif metric == Metric.MRR: + return self._calculate_mrr(**kwargs) + + elif metric == Metric.MAP: + return self._calculate_map(**kwargs) + + elif metric == Metric.EM: + predictions = get_answers_from_output(self.outputs, self.runnable_type) + labels = get_answers_from_output(self.expected_outputs, self.runnable_type) + return self._calculate_em(predictions=predictions, labels=labels, **kwargs) + + elif metric == Metric.SAS: + return self._calculate_sas(**kwargs) + + return metric(self, **kwargs) + + def _calculate_recall(self): + return MetricsResult({"recall": None}) + + def _calculate_map(self): + return MetricsResult({"mean_average_precision": None}) + + def _calculate_mrr(self): + return MetricsResult({"mean_reciprocal_rank": None}) + + def _calculate_f1(self): + return MetricsResult({"f1": None}) + + def _calculate_em( + self, + predictions, + labels, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, + ): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + labels = np.array([re.sub(s, "", x) for x in labels]) + else: + predictions = np.asarray(predictions) + labels = np.asarray(labels) + + if ignore_case: + predictions = np.char.lower(predictions) + labels = np.char.lower(labels) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + labels = np.char.translate(labels, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + labels = np.char.translate(labels, table=repl_table) + + score_list = predictions == labels + em = np.mean(score_list) + return MetricsResult({"exact_match": em}) + + def _calculate_sas(self): + val = 0 + return MetricsResult({"exact_match": val}) + def eval( runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]] diff --git a/haystack/evaluation/eval_utils.py b/haystack/evaluation/eval_utils.py new file mode 100644 index 0000000000..b03816a3de --- /dev/null +++ b/haystack/evaluation/eval_utils.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List + + +def get_answers_from_output(outputs: List[Dict[str, Any]], runnable_type: str) -> List[str]: + """ + Extracts the answers from the output of a pipeline or component. + + :param outputs: The outputs of the runnable. + :return: List of answers from the runnable output. 
+ """ + answers = [] + if runnable_type == "pipeline": + # Iterate over output from each Pipeline run + for output in outputs: + # Iterate over output of component in each Pipeline run + for component_output in output.values(): + # Only extract answers + for key in component_output.keys(): + if "answers" in key: + for generated_answer in component_output["answers"]: + if generated_answer.data: + answers.append(generated_answer.data) + else: + # Iterate over output from each Component run + for output in outputs: + # Only extract answers + for key in output.keys(): + if "answers" in key: + for generated_answer in output["answers"]: + if generated_answer.data: + answers.append(generated_answer.data) + return answers diff --git a/haystack/evaluation/metrics.py b/haystack/evaluation/metrics.py new file mode 100644 index 0000000000..483bf0eb56 --- /dev/null +++ b/haystack/evaluation/metrics.py @@ -0,0 +1,28 @@ +import json +from enum import Enum +from pathlib import Path +from typing import Union + + +class Metric(Enum): + """ + Contains a list of standard metrics available. + """ + + RECALL = "Recall" + MRR = "Mean Reciprocal Rank" + MAP = "Mean Average Precision" + F1 = "F1" + EM = "Exact Match" + SAS = "Semantic Answer Similarity" + + +class MetricsResult(dict): + def save(self, file: Union[str, Path]): + """ + Save the metrics stored in the MetricsResult to a json file. + + :param file: The file path or file name to save the data. + """ + with open(file, "w") as outfile: + json.dump(self, outfile, indent=4) From 70454b769c26ff0fb836642f6e7ff2941e2f4690 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:27:11 +0530 Subject: [PATCH 02/10] Add additional tests for metric calculation --- haystack/evaluation/eval.py | 16 ++- haystack/evaluation/metrics.py | 4 + test/evaluation/__init__.py | 0 test/evaluation/test_eval_exact_match.py | 142 +++++++++++++++++++++++ test/evaluation/test_eval_utils.py | 125 ++++++++++++++++++++ 5 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 test/evaluation/__init__.py create mode 100644 test/evaluation/test_eval_exact_match.py create mode 100644 test/evaluation/test_eval_utils.py diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index c2c77c11fd..e795975c6b 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -12,7 +12,8 @@ class EvaluationResult: """ - EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs. + EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or + component), inputs, outputs, and expected outputs. The EvaluationResult keeps track of all the information stored by eval. :param runnable: The runnable (Pipeline or component) used for evaluation. @@ -90,6 +91,12 @@ def _calculate_em( ignore_punctuation=False, ignore_numbers=False, ): + if len(predictions) != len(labels): + raise ValueError("The number of predictions and labels must be the same.") + if len(predictions) == len(labels) == 0: + # Return Exact Match as 0 for no inputs + return MetricsResult({"exact_match": 0.0}) + if regexes_to_ignore is not None: for s in regexes_to_ignore: predictions = np.array([re.sub(s, "", x) for x in predictions]) @@ -134,7 +141,8 @@ def eval( :param inputs: List of inputs used for evaluation. :param expected_outputs: List of expected outputs used for evaluation. 
- :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs. + :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, + inputs, outputs, and expected outputs. """ outputs = [] @@ -142,8 +150,8 @@ def eval( # Check that expected outputs has the correct shape if len(inputs) != len(expected_outputs): raise ValueError( - f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)}). " - " Please ensure that each input has a corresponding expected output." + f"The number of inputs ({len(inputs)}) does not match the number of expected outputs " + f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output." ) for input_ in inputs: diff --git a/haystack/evaluation/metrics.py b/haystack/evaluation/metrics.py index 483bf0eb56..fbe2fec8af 100644 --- a/haystack/evaluation/metrics.py +++ b/haystack/evaluation/metrics.py @@ -18,6 +18,10 @@ class Metric(Enum): class MetricsResult(dict): + """ + Stores the metric values computed during the evaluation. + """ + def save(self, file: Union[str, Path]): """ Save the metrics stored in the MetricsResult to a json file. diff --git a/test/evaluation/__init__.py b/test/evaluation/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py new file mode 100644 index 0000000000..e7e77e8b56 --- /dev/null +++ b/test/evaluation/test_eval_exact_match.py @@ -0,0 +1,142 @@ +import numpy as np +import pytest + +from haystack.evaluation.eval import EvaluationResult + + +# Define test cases for _calculate_em function +class TestExactMatch: + @pytest.fixture + def evaluation_result(self): + runnable = None + inputs = [] + outputs = [] + expected_outputs = [] + eval_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) + return eval_result + + def test_exact_match(self, evaluation_result): + """ + Test exact match with default parameters + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + em_result = evaluation_result._calculate_em(predictions, labels) + + assert em_result["exact_match"] == 1.0 + + def test_exact_match_empty_inputs(self, evaluation_result): + """ + Test exact match with empty inputs + """ + predictions = [] + labels = [] + # Expecting 0% exact match for empty inputs + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 0.0 + + def test_exact_match_different_data_types(self, evaluation_result): + """ + Test exact match with different data types (numpy arrays) + """ + predictions = np.array(["OpenSource", "HaystackAI", "LLMs"]) + labels = np.array(["OpenSource", "HaystackAI", "LLMs"]) + + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_single_word(self, evaluation_result): + """ + Test exact match with single-word inputs + """ + predictions = ["OpenSource"] + labels = ["OpenSource"] + + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_negative_case(self, evaluation_result): + """ + Test exact match with deliberately mismatched predictions and labels + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["Source", "HaystackAI", "LLMs"] + 
expected_em = 2 / 3 # Expecting EM to be 2/3 as 2 out of 3 items match + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == expected_em + + def test_exact_match_ignore_case(self, evaluation_result): + """ + Test exact match with ignoring case sensitivity + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["opensource", "HAYSTACKAI", "llMs"] + + # Exact match after case ignoring + em_result = evaluation_result._calculate_em(predictions, labels, ignore_case=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_ignore_punctuation(self, evaluation_result): + """ + Test exact match with ignoring punctuation + """ + predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Exact match after ignoring punctuation + em_result = evaluation_result._calculate_em(predictions, labels, ignore_punctuation=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_ignore_numbers(self, evaluation_result): + """ + Test exact match with ignoring numbers + """ + predictions = ["OpenSource123", "HaystackAI", "LLMs456"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Exact match after ignoring numbers + em_result = evaluation_result._calculate_em(predictions, labels, ignore_numbers=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_regex_ignore(self, evaluation_result): + """ + Test exact match with ignoring specific regex patterns + """ + predictions = ["Open123Source", "HaystackAI", "LLMs456"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore numeric patterns + regex_to_ignore = [r"\d+"] + em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_multiple_ignore_regex(self, evaluation_result): + """ + Test exact match with multiple ignoring parameters + """ + predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore numeric patterns and punctuation using regex + regex_to_ignore = [r"\d+", r"\W+"] + em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_multiple_ignore_combination(self, evaluation_result): + """ + Test exact match with multiple ignoring parameters combined + """ + predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore only special characters using regex + regex_to_ignore = [r"[^\w\s\d]+"] + em_result = evaluation_result._calculate_em( + predictions, + labels, + ignore_numbers=True, + ignore_punctuation=True, + ignore_case=True, + regexes_to_ignore=regex_to_ignore, + ) + assert em_result["exact_match"] == 1.0 diff --git a/test/evaluation/test_eval_utils.py b/test/evaluation/test_eval_utils.py new file mode 100644 index 0000000000..3c67aeb579 --- /dev/null +++ b/test/evaluation/test_eval_utils.py @@ -0,0 +1,125 @@ +from haystack.dataclasses import GeneratedAnswer +from haystack.evaluation.eval_utils import get_answers_from_output + + +class TestEvalUtils: + def test_extract_answers_from_pipeline_output(self): + """ + Test that the function correctly extracts answers from the output of a pipeline. 
+ """ + outputs = [ + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + } + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] + } + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_extract_answers_from_component_output(self): + """ + Test that the function correctly extracts answers from the output of a component. + """ + outputs = [ + {"answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})]}, + {"answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})]}, + {"answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})]}, + ] + runnable_type = "component" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_ignore_other_output_keys(self): + """ + Test that the function only extracts answers and ignores other output keys. + """ + outputs = [ + { + "llm": {"replies": ["llm_reply_1"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + }, + }, + { + "llm": {"replies": ["llm_reply_2"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + }, + }, + { + "llm": {"replies": ["llm_reply_3"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] + }, + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_empty_outputs(self): + """ + Test that the function correctly handles empty outputs. + """ + outputs = [] + runnable_type = "pipeline" + expected_answers = [] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_missing_keys(self): + """ + Test that the function correctly handles outputs with missing keys. + """ + outputs = [ + { + "llm": {"replies": ["llm_reply_1"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + }, + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_missing_values(self): + """ + Test that the function correctly handles outputs with missing values. 
+ """ + outputs = [ + {"answer_builder": {"answers": []}}, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + ] + runnable_type = "pipeline" + expected_answers = ["Mark"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers From 3a224b33a4eed5afe73e822c084e7e10751f014c Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:27:36 +0530 Subject: [PATCH 03/10] Add release notes --- ...etrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml diff --git a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml b/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml new file mode 100644 index 0000000000..920eb88d34 --- /dev/null +++ b/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics. + Adds `Metric` class to store list of available metrics. + Adds `MetricsResult` class to store the metric values computed during the evaluation. + Adds a function to EvaluationResult for calculating the Exact Match metric. From 416d609524a3eaad3b67ebb3039e55d4de888ca0 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:38:51 +0530 Subject: [PATCH 04/10] Add docstring for Exact Match metric --- haystack/evaluation/eval.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index e795975c6b..77e4bca25b 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -91,6 +91,24 @@ def _calculate_em( ignore_punctuation=False, ignore_numbers=False, ): + """ + Calculates the Exact Match (EM) score between two lists of predictions and labels. + Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the + corresponding ground truth label. + + :param predictions: A list of predicted text strings. + :param labels (list): A list of ground truth (reference) text strings. + :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings + matching these regular expressions from both predictions and labels before comparison. Defaults to None. + :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. + :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before + comparison. Defaults to False. + :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels + before comparison. Defaults to False. + + :return: A MetricsResult object containing the calculated Exact Match (EM) score. 
+ """ + if len(predictions) != len(labels): raise ValueError("The number of predictions and labels must be the same.") if len(predictions) == len(labels) == 0: From 082e5c0c57d3269b717293696c3a3915bd11c138 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 13:56:48 +0530 Subject: [PATCH 05/10] Remove Exact Match Implementation --- .../test_eval_extractive_qa_pipeline.py | 30 ++-- e2e/pipelines/test_eval_rag_pipelines.py | 128 ++-------------- haystack/evaluation/eval.py | 72 +-------- haystack/evaluation/eval_utils.py | 32 ---- test/evaluation/__init__.py | 0 test/evaluation/test_eval_exact_match.py | 142 ------------------ test/evaluation/test_eval_utils.py | 125 --------------- 7 files changed, 38 insertions(+), 491 deletions(-) delete mode 100644 haystack/evaluation/eval_utils.py delete mode 100644 test/evaluation/__init__.py delete mode 100644 test/evaluation/test_eval_exact_match.py delete mode 100644 test/evaluation/test_eval_utils.py diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py index b9be2f2752..c0e171af18 100644 --- a/e2e/pipelines/test_eval_extractive_qa_pipeline.py +++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py @@ -35,7 +35,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Paris?", score=0.7713339924812317, data="Jean and I", - document=Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + document=Document( + id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e", + content="My name is Jean and I live in Paris.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -61,7 +65,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Berlin?", score=0.7047999501228333, data="Mark and I", - document=Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + document=Document( + id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20", + content="My name is Mark and I live in Berlin.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -87,7 +95,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Rome?", score=0.7661304473876953, data="Giorgio and I", - document=Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + document=Document( + id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27", + content="My name is Giorgio and I live in Rome.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=24), context_offset=None, @@ -115,14 +127,10 @@ def test_extractive_qa_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == qa_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / 
"exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py index 5fe4716f57..68251d328b 100644 --- a/e2e/pipelines/test_eval_rag_pipelines.py +++ b/e2e/pipelines/test_eval_rag_pipelines.py @@ -7,7 +7,7 @@ from haystack.components.generators import HuggingFaceLocalGenerator from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever from haystack.components.writers import DocumentWriter -from haystack.dataclasses import Document, GeneratedAnswer +from haystack.dataclasses import Document from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval from haystack.evaluation.metrics import Metric @@ -59,54 +59,9 @@ def test_bm25_rag_pipeline(tmp_path): ] expected_outputs = [ - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Jean", - query="Who lives in Paris?", - documents=[ - Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Mark", - query="Who lives in Berlin?", - documents=[ - Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Giorgio", - query="Who lives in Rome?", - documents=[ - Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, + {"llm": {"replies": ["Jean"]}}, + {"llm": {"replies": ["Mark"]}}, + {"llm": {"replies": ["Giorgio"]}}, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -116,17 +71,13 @@ def test_bm25_rag_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) def test_embedding_retrieval_rag_pipeline(tmp_path): @@ -192,54 +143,9 @@ def test_embedding_retrieval_rag_pipeline(tmp_path): ] expected_outputs = [ - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Jean", - query="Who lives in Paris?", - documents=[ - Document(content="My name is Jean and I live in Paris.", 
score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Mark", - query="Who lives in Berlin?", - documents=[ - Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Giorgio", - query="Who lives in Rome?", - documents=[ - Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, + {"llm": {"replies": ["Jean"]}}, + {"llm": {"replies": ["Mark"]}}, + {"llm": {"replies": ["Giorgio"]}}, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -249,14 +155,10 @@ def test_embedding_retrieval_rag_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 77e4bca25b..9aee19f892 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,12 +1,7 @@ -import re -import string from typing import Any, Callable, Dict, List, Union -import numpy as np - from haystack import Pipeline from haystack.core.component import Component -from haystack.evaluation.eval_utils import get_answers_from_output from haystack.evaluation.metrics import Metric, MetricsResult @@ -61,9 +56,7 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], return self._calculate_map(**kwargs) elif metric == Metric.EM: - predictions = get_answers_from_output(self.outputs, self.runnable_type) - labels = get_answers_from_output(self.expected_outputs, self.runnable_type) - return self._calculate_em(predictions=predictions, labels=labels, **kwargs) + return self._calculate_em(**kwargs) elif metric == Metric.SAS: return self._calculate_sas(**kwargs) @@ -82,68 +75,11 @@ def _calculate_mrr(self): def _calculate_f1(self): return MetricsResult({"f1": None}) - def _calculate_em( - self, - predictions, - labels, - regexes_to_ignore=None, - ignore_case=False, - ignore_punctuation=False, - ignore_numbers=False, - ): - """ - Calculates the Exact Match (EM) score between two lists of predictions and labels. 
- Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the - corresponding ground truth label. - - :param predictions: A list of predicted text strings. - :param labels (list): A list of ground truth (reference) text strings. - :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings - matching these regular expressions from both predictions and labels before comparison. Defaults to None. - :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. - :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before - comparison. Defaults to False. - :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels - before comparison. Defaults to False. - - :return: A MetricsResult object containing the calculated Exact Match (EM) score. - """ - - if len(predictions) != len(labels): - raise ValueError("The number of predictions and labels must be the same.") - if len(predictions) == len(labels) == 0: - # Return Exact Match as 0 for no inputs - return MetricsResult({"exact_match": 0.0}) - - if regexes_to_ignore is not None: - for s in regexes_to_ignore: - predictions = np.array([re.sub(s, "", x) for x in predictions]) - labels = np.array([re.sub(s, "", x) for x in labels]) - else: - predictions = np.asarray(predictions) - labels = np.asarray(labels) - - if ignore_case: - predictions = np.char.lower(predictions) - labels = np.char.lower(labels) - - if ignore_punctuation: - repl_table = string.punctuation.maketrans("", "", string.punctuation) - predictions = np.char.translate(predictions, table=repl_table) - labels = np.char.translate(labels, table=repl_table) - - if ignore_numbers: - repl_table = string.digits.maketrans("", "", string.digits) - predictions = np.char.translate(predictions, table=repl_table) - labels = np.char.translate(labels, table=repl_table) - - score_list = predictions == labels - em = np.mean(score_list) - return MetricsResult({"exact_match": em}) + def _calculate_em(self): + return MetricsResult({"exact_match": 1.0}) def _calculate_sas(self): - val = 0 - return MetricsResult({"exact_match": val}) + return MetricsResult({"exact_match": None}) def eval( diff --git a/haystack/evaluation/eval_utils.py b/haystack/evaluation/eval_utils.py deleted file mode 100644 index b03816a3de..0000000000 --- a/haystack/evaluation/eval_utils.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any, Dict, List - - -def get_answers_from_output(outputs: List[Dict[str, Any]], runnable_type: str) -> List[str]: - """ - Extracts the answers from the output of a pipeline or component. - - :param outputs: The outputs of the runnable. - :return: List of answers from the runnable output. 
- """ - answers = [] - if runnable_type == "pipeline": - # Iterate over output from each Pipeline run - for output in outputs: - # Iterate over output of component in each Pipeline run - for component_output in output.values(): - # Only extract answers - for key in component_output.keys(): - if "answers" in key: - for generated_answer in component_output["answers"]: - if generated_answer.data: - answers.append(generated_answer.data) - else: - # Iterate over output from each Component run - for output in outputs: - # Only extract answers - for key in output.keys(): - if "answers" in key: - for generated_answer in output["answers"]: - if generated_answer.data: - answers.append(generated_answer.data) - return answers diff --git a/test/evaluation/__init__.py b/test/evaluation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py deleted file mode 100644 index e7e77e8b56..0000000000 --- a/test/evaluation/test_eval_exact_match.py +++ /dev/null @@ -1,142 +0,0 @@ -import numpy as np -import pytest - -from haystack.evaluation.eval import EvaluationResult - - -# Define test cases for _calculate_em function -class TestExactMatch: - @pytest.fixture - def evaluation_result(self): - runnable = None - inputs = [] - outputs = [] - expected_outputs = [] - eval_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - return eval_result - - def test_exact_match(self, evaluation_result): - """ - Test exact match with default parameters - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - em_result = evaluation_result._calculate_em(predictions, labels) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_empty_inputs(self, evaluation_result): - """ - Test exact match with empty inputs - """ - predictions = [] - labels = [] - # Expecting 0% exact match for empty inputs - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 0.0 - - def test_exact_match_different_data_types(self, evaluation_result): - """ - Test exact match with different data types (numpy arrays) - """ - predictions = np.array(["OpenSource", "HaystackAI", "LLMs"]) - labels = np.array(["OpenSource", "HaystackAI", "LLMs"]) - - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_single_word(self, evaluation_result): - """ - Test exact match with single-word inputs - """ - predictions = ["OpenSource"] - labels = ["OpenSource"] - - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_negative_case(self, evaluation_result): - """ - Test exact match with deliberately mismatched predictions and labels - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["Source", "HaystackAI", "LLMs"] - expected_em = 2 / 3 # Expecting EM to be 2/3 as 2 out of 3 items match - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == expected_em - - def test_exact_match_ignore_case(self, evaluation_result): - """ - Test exact match with ignoring case sensitivity - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["opensource", "HAYSTACKAI", "llMs"] - - # Exact match after case ignoring - em_result = evaluation_result._calculate_em(predictions, labels, ignore_case=True) - assert 
em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_punctuation(self, evaluation_result): - """ - Test exact match with ignoring punctuation - """ - predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Exact match after ignoring punctuation - em_result = evaluation_result._calculate_em(predictions, labels, ignore_punctuation=True) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_numbers(self, evaluation_result): - """ - Test exact match with ignoring numbers - """ - predictions = ["OpenSource123", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Exact match after ignoring numbers - em_result = evaluation_result._calculate_em(predictions, labels, ignore_numbers=True) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_regex_ignore(self, evaluation_result): - """ - Test exact match with ignoring specific regex patterns - """ - predictions = ["Open123Source", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore numeric patterns - regex_to_ignore = [r"\d+"] - em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_regex(self, evaluation_result): - """ - Test exact match with multiple ignoring parameters - """ - predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore numeric patterns and punctuation using regex - regex_to_ignore = [r"\d+", r"\W+"] - em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_combination(self, evaluation_result): - """ - Test exact match with multiple ignoring parameters combined - """ - predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore only special characters using regex - regex_to_ignore = [r"[^\w\s\d]+"] - em_result = evaluation_result._calculate_em( - predictions, - labels, - ignore_numbers=True, - ignore_punctuation=True, - ignore_case=True, - regexes_to_ignore=regex_to_ignore, - ) - assert em_result["exact_match"] == 1.0 diff --git a/test/evaluation/test_eval_utils.py b/test/evaluation/test_eval_utils.py deleted file mode 100644 index 3c67aeb579..0000000000 --- a/test/evaluation/test_eval_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -from haystack.dataclasses import GeneratedAnswer -from haystack.evaluation.eval_utils import get_answers_from_output - - -class TestEvalUtils: - def test_extract_answers_from_pipeline_output(self): - """ - Test that the function correctly extracts answers from the output of a pipeline. 
- """ - outputs = [ - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - } - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] - } - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_extract_answers_from_component_output(self): - """ - Test that the function correctly extracts answers from the output of a component. - """ - outputs = [ - {"answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})]}, - {"answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})]}, - {"answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})]}, - ] - runnable_type = "component" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_ignore_other_output_keys(self): - """ - Test that the function only extracts answers and ignores other output keys. - """ - outputs = [ - { - "llm": {"replies": ["llm_reply_1"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - }, - }, - { - "llm": {"replies": ["llm_reply_2"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - }, - }, - { - "llm": {"replies": ["llm_reply_3"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] - }, - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_empty_outputs(self): - """ - Test that the function correctly handles empty outputs. - """ - outputs = [] - runnable_type = "pipeline" - expected_answers = [] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_missing_keys(self): - """ - Test that the function correctly handles outputs with missing keys. - """ - outputs = [ - { - "llm": {"replies": ["llm_reply_1"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - }, - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_missing_values(self): - """ - Test that the function correctly handles outputs with missing values. 
- """ - outputs = [ - {"answer_builder": {"answers": []}}, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - ] - runnable_type = "pipeline" - expected_answers = ["Mark"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers From a015fc9d1ea4c2410aad68eeefc525769ab2c012 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:00:52 +0530 Subject: [PATCH 06/10] Update release notes --- ...> add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml} | 1 - 1 file changed, 1 deletion(-) rename releasenotes/notes/{add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml => add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml} (77%) diff --git a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml b/releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml similarity index 77% rename from releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml rename to releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml index 920eb88d34..ab163705b7 100644 --- a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml +++ b/releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml @@ -4,4 +4,3 @@ features: Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics. Adds `Metric` class to store list of available metrics. Adds `MetricsResult` class to store the metric values computed during the evaluation. - Adds a function to EvaluationResult for calculating the Exact Match metric. From f4c71e45d82dc675fa8f27a1266070d0d0c7a5ba Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:18:30 +0530 Subject: [PATCH 07/10] Remove unnecessary metrics implementation --- haystack/evaluation/eval.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 9aee19f892..3a2d053069 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -29,12 +29,6 @@ def __init__( self.outputs = outputs self.expected_outputs = expected_outputs - # Determine the type of the runnable - if str(type(runnable).__name__) == "Pipeline": - self.runnable_type = "pipeline" - else: - self.runnable_type = "component" - # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ From 50a75bddb0ca7495725a9fbe681c85706a060ac3 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:46:11 +0100 Subject: [PATCH 08/10] Simplify logic to run supported metrics --- haystack/evaluation/eval.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 3a2d053069..6fdc7138a6 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -29,6 +29,17 @@ def __init__( self.outputs = outputs self.expected_outputs = expected_outputs + # Mapping of metrics to their corresponding functions. 
+ # This should be kept in sync with the Metric enum + self._supported_metrics = { + Metric.RECALL: self._calculate_recall, + Metric.MRR: self._calculate_mrr, + Metric.MAP: self._calculate_map, + Metric.F1: self._calculate_f1, + Metric.EM: self._calculate_em, + Metric.SAS: self._calculate_sas, + } + # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ @@ -37,23 +48,9 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], :param metric: The Metric indicating the type of metric to calculate or custom function to compute. :return: MetricsResult containing the calculated metric. """ - if metric == Metric.RECALL: - return self._calculate_recall(**kwargs) - - elif metric == Metric.F1: - return self._calculate_f1(**kwargs) - - elif metric == Metric.MRR: - return self._calculate_mrr(**kwargs) - - elif metric == Metric.MAP: - return self._calculate_map(**kwargs) - - elif metric == Metric.EM: - return self._calculate_em(**kwargs) - elif metric == Metric.SAS: - return self._calculate_sas(**kwargs) + if metric in self._supported_metrics: + return self._supported_metrics[metric](**kwargs) return metric(self, **kwargs) From bbf7c4ddf0a75f8e83924074e5f2628e91833afc Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:46:26 +0100 Subject: [PATCH 09/10] Add some evaluation tests --- test/evaluation/test_eval.py | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 test/evaluation/test_eval.py diff --git a/test/evaluation/test_eval.py b/test/evaluation/test_eval.py new file mode 100644 index 0000000000..095c51d4f5 --- /dev/null +++ b/test/evaluation/test_eval.py @@ -0,0 +1,42 @@ +from unittest.mock import MagicMock + +from haystack.core.pipeline import Pipeline +from haystack.evaluation.eval import EvaluationResult +from haystack.evaluation.metrics import Metric + + +class TestEvaluationResult: + def test_init(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + assert result.runnable == runnable + assert result.inputs == [] + assert result.outputs == [] + assert result.expected_outputs == [] + + def test_supported_metrics_contains_all_metrics(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + supported_metrics = [m.name for m in result._supported_metrics.keys()] + all_metric_names = [m.name for m in Metric] + assert supported_metrics == all_metric_names + + def test_calculate_metrics_with_supported_metric(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + result._supported_metrics[Metric.RECALL] = MagicMock() + result.calculate_metrics(metric=Metric.RECALL) + + assert result._supported_metrics[Metric.RECALL].called_once_with() + + def test_calculate_metrics_with_non_supported_metric(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + unsupported_metric = MagicMock() + + result.calculate_metrics(metric=unsupported_metric, some_argument="some_value") + + assert unsupported_metric.called_once_with(some_argument="some_value") From 850b57b9cb3e7ec2e744cb3e92cfd57ce39f9de5 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:57:08 +0100 Subject: [PATCH 10/10] Fix linting --- haystack/evaluation/eval.py | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 6fdc7138a6..28b4b76bbe 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -40,7 +40,6 @@ def __init__( Metric.SAS: self._calculate_sas, } - # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ Calculate evaluation metrics based on the provided Metric or using the custom metric function. @@ -49,7 +48,7 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], :return: MetricsResult containing the calculated metric. """ - if metric in self._supported_metrics: + if isinstance(metric, Metric): return self._supported_metrics[metric](**kwargs) return metric(self, **kwargs)
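
For reference, a minimal usage sketch of the API as it stands after patch 10: built-in metrics are selected through the `Metric` enum, while any callable that accepts the `EvaluationResult` and returns a `MetricsResult` can be passed as a custom metric. The sketch constructs the `EvaluationResult` directly, as the unit tests in patch 09 do, rather than running a real pipeline through `eval()`; the empty `Pipeline()` and the `num_outputs` metric are illustrative placeholders only, and the built-in metric values are still hard-coded stubs at this point in the series (e.g. `Metric.EM` returns 1.0 unconditionally).

from haystack import Pipeline
from haystack.evaluation import EvaluationResult, Metric, MetricsResult

# Construct an EvaluationResult directly, as the unit tests in patch 09 do.
# In practice this object is produced by haystack.evaluation.eval(...).
result = EvaluationResult(runnable=Pipeline(), inputs=[], outputs=[], expected_outputs=[])

# Built-in metrics are selected via the Metric enum; extra keyword arguments
# are forwarded to the metric implementation.
em = result.calculate_metrics(Metric.EM)   # {"exact_match": 1.0} (placeholder value)
em.save("exact_match_score.json")          # MetricsResult is a dict, serialized to JSON

# A custom metric is any callable taking the EvaluationResult and returning a
# MetricsResult; calculate_metrics passes the result object plus **kwargs to it.
def num_outputs(eval_result: EvaluationResult, **kwargs) -> MetricsResult:
    # Hypothetical metric, used only for illustration.
    return MetricsResult({"num_outputs": len(eval_result.outputs)})

custom = result.calculate_metrics(num_outputs)  # {"num_outputs": 0}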