feat: Add F1 metric #6822

Merged · 3 commits · Jan 26, 2024
28 changes: 21 additions & 7 deletions e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -115,14 +115,28 @@ def test_extractive_qa_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)
56 changes: 42 additions & 14 deletions e2e/pipelines/test_eval_rag_pipelines.py
@@ -116,17 +116,31 @@ def test_bm25_rag_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)


def test_embedding_retrieval_rag_pipeline(tmp_path):
@@ -248,14 +262,28 @@ def test_embedding_retrieval_rag_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)
65 changes: 63 additions & 2 deletions haystack/evaluation/eval.py
@@ -1,3 +1,4 @@
import collections
from typing import Any, Callable, Dict, List, Union

import numpy as np
@@ -71,8 +72,68 @@ def _calculate_map(self):
def _calculate_mrr(self):
return MetricsResult({"mean_reciprocal_rank": None})

def _calculate_f1(self):
return MetricsResult({"f1": None})
def _compute_f1_single(self, label_toks: List[str], pred_toks: List[str]) -> float:
"""
Compute F1 score for a single sample.
"""
common: collections.Counter = collections.Counter(label_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(label_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(label_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(label_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1

def _calculate_f1(
self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False
) -> MetricsResult:
"""
Calculates the F1 score between two lists of predictions and labels.
F1 score measures the word overlap between the predicted text and the corresponding ground truth label.

:param output_key: The key of the output to use for comparison.
:param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings
matching these regular expressions from both predictions and labels before comparison. Defaults to None.
:param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False.
:param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before
comparison. Defaults to False.
:param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels
before comparison. Defaults to False.
:return: A MetricsResult object containing the calculated F1 score.
"""

predictions = get_answers_from_output(
outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type
)
labels = get_answers_from_output(
outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type
)

if len(predictions) != len(labels):
raise ValueError("The number of predictions and labels must be the same.")
if len(predictions) == len(labels) == 0:
# Return F1 as 0 for no inputs
return MetricsResult({"f1": 0.0})

predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)

# Tokenize by splitting on spaces
tokenized_predictions = [pred.split() for pred in predictions]
tokenized_labels = [label.split() for label in labels]

f1_scores = [
self._compute_f1_single(label_toks, pred_toks)
for label_toks, pred_toks in zip(tokenized_labels, tokenized_predictions)
]

f1 = np.mean(f1_scores)

return MetricsResult({"f1": f1})

def _calculate_em(
self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False
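For readers of this diff, here is a minimal standalone sketch of the token-overlap F1 computed by `_compute_f1_single` above, assuming whitespace tokenization as in `_calculate_f1` (the `token_f1` helper name is ours, not part of the PR):

```python
from collections import Counter
from typing import List


def token_f1(label_toks: List[str], pred_toks: List[str]) -> float:
    # Tokens that appear in both the label and the prediction (with multiplicity).
    common = Counter(label_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(label_toks) == 0 or len(pred_toks) == 0:
        # If either side is empty, F1 is 1 when both are empty, else 0.
        return float(label_toks == pred_toks)
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(label_toks)
    return 2 * precision * recall / (precision + recall)


# Prediction "Open Source" vs. label "Source": precision 1/2, recall 1/1 -> F1 = 2/3
print(token_f1("Source".split(), "Open Source".split()))  # 0.666...
```

The per-sample scores are then averaged with `np.mean`, which yields the single `f1` value returned in `MetricsResult({"f1": f1})`.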
8 changes: 8 additions & 0 deletions releasenotes/notes/add-f1-d54cc900bec753f7.yaml
@@ -0,0 +1,8 @@
---
features:
- |
Adds support for the F1 metric to `EvaluationResult.calculate_metrics(...)`:
```python
from haystack.evaluation.metrics import Metric
f1_metric = eval_result.calculate_metrics(Metric.F1, output_key="answers")
```
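The normalization parameters available for Exact Match can also be passed for F1, as exercised by the e2e tests in this PR. A short usage sketch, assuming `eval_result` comes from an earlier pipeline evaluation as in those tests:

```python
f1_custom = eval_result.calculate_metrics(
    Metric.F1,
    output_key="answers",
    ignore_case=True,
    ignore_punctuation=True,
    ignore_numbers=True,
)
print(f1_custom["f1"])
```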
178 changes: 178 additions & 0 deletions test/evaluation/test_eval_f1.py
@@ -0,0 +1,178 @@
import pytest

from haystack import Pipeline
from haystack.dataclasses import GeneratedAnswer
from haystack.evaluation.eval import EvaluationResult


class TestF1:
def create_evaluation_result(self, predictions, labels):
"""
Creates an evaluation result for a RAG pipeline from the given predictions and labels, for testing the F1 metric.
"""
runnable = Pipeline()
inputs = []
outputs = [
{"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
for pred in predictions
]
expected_outputs = [
{"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
for label in labels
]
evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
return evaluation_result

def test_f1_empty_inputs(self):
"""
Test f1 with empty inputs
"""
runnable = Pipeline()
inputs = []
outputs = [
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
]
expected_outputs = [
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
]
evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
# Expecting 0% f1 for empty inputs
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == 0.0

def test_calculate_f1_with_different_lengths(self):
"""
Test that f1 raises a ValueError when predictions and labels have different lengths
"""
predictions = ["OpenSource", "HaystackAI", "LLMs"]
labels = ["OpenSource", "HaystackAI"]
evaluation_result = self.create_evaluation_result(predictions, labels)

with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
evaluation_result._calculate_f1(output_key="answers")

def test_f1_same_inputs(self):
"""
Test f1 with identical predictions and labels
"""
predictions = ["OpenSource", "HaystackAI", "LLMs"]
labels = ["OpenSource", "HaystackAI", "LLMs"]
evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == 1.0

def test_f1_single_word(self):
"""
Test f1 with a single-word label that partially overlaps the prediction
"""
predictions = ["Open Source"]
labels = ["Source"]

evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == pytest.approx(2 / 3)

def test_f1_negative_case(self):
"""
Test f1 with deliberately mismatched predictions and labels
"""
predictions = ["Open Source", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_case(self):
"""
Test f1 with ignoring case sensitivity
"""
predictions = ["Open Source", "HaystackAI"]
labels = ["source", "HAYSTACKAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring case
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_case=True)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_punctuation(self):
"""
Test f1 with ignoring punctuation
"""
predictions = ["Open Source!", "Haystack.AI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring punctuation
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_punctuation=True)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_numbers(self):
"""
Test f1 with ignoring numbers
"""
predictions = ["Open Source123", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring numbers
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_numbers=True)
assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_regex_ignore(self):
"""
Test f1 with ignoring specific regex patterns
"""
predictions = ["Open123 Source", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore numeric patterns
regex_to_ignore = [r"\d+"]
f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_multiple_ignore_regex(self):
"""
Test f1 with multiple ignoring parameters
"""
predictions = ["Open123! Source", "Haystack.AI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore numeric patterns and punctuation excluding whitespaces
regex_to_ignore = [r"\d+", r"[^\w\s]"]
f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_multiple_ignore_combination(self):
"""
Test f1 with multiple ignoring parameters combined
"""
predictions = ["Open%123. !$Source", "Haystack.AI##"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore only special characters using regex
regex_to_ignore = [r"[^\w\s\d]+"]
f1_result = evaluation_result._calculate_f1(
output_key="answers",
ignore_numbers=True,
ignore_punctuation=True,
ignore_case=True,
regexes_to_ignore=regex_to_ignore,
)

assert f1_result["f1"] == pytest.approx(5 / 6)
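As a sanity check on the `5 / 6` expectations used throughout these tests: each of those cases has one sample where a two-token prediction overlaps a one-token label on a single token (precision 1/2, recall 1, F1 = 2/3) and one sample that matches exactly (F1 = 1.0), so the mean F1 is 5/6. A quick standalone calculation, not part of the PR:

```python
# Sample 1: "Open Source" vs. "Source" -> precision 1/2, recall 1/1 -> F1 = 2/3
# Sample 2: identical normalized strings -> F1 = 1.0
per_sample = [2 / 3, 1.0]
mean_f1 = sum(per_sample) / len(per_sample)
print(mean_f1)  # 0.8333... == 5/6
```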