From d55057fa212933f02a5c54b8adf3162cd6cb0de2 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Wed, 24 Jan 2024 17:03:38 +0530
Subject: [PATCH 1/2] Add F1 metric

---
 .../test_eval_extractive_qa_pipeline.py  |  28 ++-
 e2e/pipelines/test_eval_rag_pipelines.py |  56 ++++--
 haystack/evaluation/eval.py              |  65 ++++++-
 test/evaluation/test_eval_f1.py          | 178 ++++++++++++++++++
 4 files changed, 304 insertions(+), 23 deletions(-)
 create mode 100644 test/evaluation/test_eval_f1.py

diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py
index 57ec1b63de..f04ddd4725 100644
--- a/e2e/pipelines/test_eval_extractive_qa_pipeline.py
+++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -115,14 +115,28 @@ def test_extractive_qa_pipeline(tmp_path):
     assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
     assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()
 
-    metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
-    metrics_custom_parameters = eval_result.calculate_metrics(
+    # Test Exact Match
+    em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
+    em_custom_parameters = eval_result.calculate_metrics(
         Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
     )
-    # Save metric results to json
-    metrics_default.save(tmp_path / "exact_match_score.json")
+    # Save EM metric results to json
+    em_default.save(tmp_path / "exact_match_score.json")
 
-    assert metrics_default["exact_match"] == 1.0
-    assert metrics_custom_parameters["exact_match"] == 1.0
+    assert em_default["exact_match"] == 1.0
+    assert em_custom_parameters["exact_match"] == 1.0
     with open(tmp_path / "exact_match_score.json", "r") as f:
-        assert metrics_default == json.load(f)
+        assert em_default == json.load(f)
+
+    # Test F1
+    f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
+    f1_custom_parameters = eval_result.calculate_metrics(
+        Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
+    )
+    # Save F1 metric results to json
+    f1_default.save(tmp_path / "f1_score.json")
+
+    assert f1_default["f1"] == 1.0
+    assert f1_custom_parameters["f1"] == 1.0
+    with open(tmp_path / "f1_score.json", "r") as f:
+        assert f1_default == json.load(f)
diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py
index 34e7a888ae..23e674d22a 100644
--- a/e2e/pipelines/test_eval_rag_pipelines.py
+++ b/e2e/pipelines/test_eval_rag_pipelines.py
@@ -116,17 +116,31 @@ def test_bm25_rag_pipeline(tmp_path):
     assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
     assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()
 
-    metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
-    metrics_custom_parameters = eval_result.calculate_metrics(
+    # Test Exact Match
+    em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
+    em_custom_parameters = eval_result.calculate_metrics(
         Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
     )
-    # Save metric results to json
-    metrics_default.save(tmp_path / "exact_match_score.json")
+    # Save EM metric results to json
+    em_default.save(tmp_path / "exact_match_score.json")
 
-    assert metrics_default["exact_match"] == 1.0
-    assert metrics_custom_parameters["exact_match"] == 1.0
+    assert em_default["exact_match"] == 1.0
em_custom_parameters["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert em_default == json.load(f) + + # Test F1 + f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers") + f1_custom_parameters = eval_result.calculate_metrics( + Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save F1 metric results to json + f1_default.save(tmp_path / "f1_score.json") + + assert f1_default["f1"] == 1.0 + assert f1_custom_parameters["f1"] == 1.0 + with open(tmp_path / "f1_score.json", "r") as f: + assert f1_default == json.load(f) def test_embedding_retrieval_rag_pipeline(tmp_path): @@ -248,14 +262,28 @@ def test_embedding_retrieval_rag_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers") - metrics_custom_parameters = eval_result.calculate_metrics( + # Test Exact Match + em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers") + em_custom_parameters = eval_result.calculate_metrics( Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True ) - # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + # Save EM metric results to json + em_default.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert em_default["exact_match"] == 1.0 + assert em_custom_parameters["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert em_default == json.load(f) + + # Test F1 + f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers") + f1_custom_parameters = eval_result.calculate_metrics( + Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save F1 metric results to json + f1_default.save(tmp_path / "f1_score.json") + + assert f1_default["f1"] == 1.0 + assert f1_custom_parameters["f1"] == 1.0 + with open(tmp_path / "f1_score.json", "r") as f: + assert f1_default == json.load(f) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 49d6a87e1e..7951814c18 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,3 +1,4 @@ +import collections from typing import Any, Callable, Dict, List, Union import numpy as np @@ -71,8 +72,68 @@ def _calculate_map(self): def _calculate_mrr(self): return MetricsResult({"mean_reciprocal_rank": None}) - def _calculate_f1(self): - return MetricsResult({"f1": None}) + def _compute_f1_single(self, label_toks: List[str], pred_toks: List[str]) -> float: + """ + Compute F1 score for a single sample. 
+ """ + common: collections.Counter = collections.Counter(label_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(label_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(label_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(label_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + def _calculate_f1( + self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False + ) -> MetricsResult: + """ + Calculates the F1 score between two lists of predictions and labels. + F1 score measures the word overlap between the predicted text and the corresponding ground truth label. + + :param output_key: The key of the output to use for comparison. + :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings + matching these regular expressions from both predictions and labels before comparison. Defaults to None. + :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. + :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before + comparison. Defaults to False. + :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels + before comparison. Defaults to False. + :return: A MetricsResult object containing the calculated Exact Match (EM) score. + """ + + predictions = get_answers_from_output( + outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type + ) + labels = get_answers_from_output( + outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type + ) + + if len(predictions) != len(labels): + raise ValueError("The number of predictions and labels must be the same.") + if len(predictions) == len(labels) == 0: + # Return F1 as 0 for no inputs + return MetricsResult({"f1": 0.0}) + + predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) + labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers) + + # Tokenize by splitting on spaces + tokenized_predictions = [pred.split() for pred in predictions] + tokenized_labels = [label.split() for label in labels] + + f1_scores = [ + self._compute_f1_single(label_toks, pred_toks) + for label_toks, pred_toks in zip(tokenized_labels, tokenized_predictions) + ] + + f1 = np.mean(f1_scores) + + return MetricsResult({"f1": f1}) def _calculate_em( self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False diff --git a/test/evaluation/test_eval_f1.py b/test/evaluation/test_eval_f1.py new file mode 100644 index 0000000000..cedcda466a --- /dev/null +++ b/test/evaluation/test_eval_f1.py @@ -0,0 +1,178 @@ +import pytest + +from haystack import Pipeline +from haystack.dataclasses import GeneratedAnswer +from haystack.evaluation.eval import EvaluationResult + + +class TestF1: + def create_evaluation_result(self, predictions, labels): + """ + Creates an evaluation result of a RAG pipeline using the list of predictions and labels for testing the f1. 
+ """ + runnable = Pipeline() + inputs = [] + outputs = [ + {"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}} + for pred in predictions + ] + expected_outputs = [ + {"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}} + for label in labels + ] + evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) + return evaluation_result + + def test_f1_empty_inputs(self): + """ + Test f1 with empty inputs + """ + runnable = Pipeline() + inputs = [] + outputs = [ + {"answer_builder": {"answers": []}}, + {"answer_builder": {"answers": []}}, + {"answer_builder": {"answers": []}}, + ] + expected_outputs = [ + {"answer_builder": {"answers": []}}, + {"answer_builder": {"answers": []}}, + {"answer_builder": {"answers": []}}, + ] + evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) + # Expecting 0% f1 for empty inputs + f1_result = evaluation_result._calculate_f1(output_key="answers") + + assert f1_result["f1"] == 0.0 + + def test_calculate_f1_with_different_lengths(self): + """ + Test f1 with default parameters + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["OpenSource", "HaystackAI"] + evaluation_result = self.create_evaluation_result(predictions, labels) + + with pytest.raises(ValueError, match="The number of predictions and labels must be the same."): + evaluation_result._calculate_f1(output_key="answers") + + def test_f1_same_inputs(self): + """ + Test f1 with default parameters + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + evaluation_result = self.create_evaluation_result(predictions, labels) + f1_result = evaluation_result._calculate_f1(output_key="answers") + + assert f1_result["f1"] == 1.0 + + def test_f1_single_word(self): + """ + Test f1 with single-word inputs + """ + predictions = ["Open Source"] + labels = ["Source"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + f1_result = evaluation_result._calculate_f1(output_key="answers") + + assert f1_result["f1"] == pytest.approx(2 / 3) + + def test_f1_negative_case(self): + """ + Test f1 with deliberately mismatched predictions and labels + """ + predictions = ["Open Source", "HaystackAI"] + labels = ["Source", "HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + f1_result = evaluation_result._calculate_f1(output_key="answers") + + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_ignore_case(self): + """ + Test f1 with ignoring case sensitivity + """ + predictions = ["Open Source", "HaystackAI"] + labels = ["source", "HAYSTACKAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # F1 after case ignoring + f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_case=True) + + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_ignore_punctuation(self): + """ + Test f1 with ignoring punctuation + """ + predictions = ["Open Source!", "Haystack.AI"] + labels = ["Source", "HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # F1 after ignoring punctuation + f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_punctuation=True) + + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_ignore_numbers(self): + """ + Test f1 with ignoring numbers + """ + predictions = ["Open Source123", "HaystackAI"] + labels = ["Source", 
"HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # F1 after ignoring numbers + f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_numbers=True) + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_regex_ignore(self): + """ + Test f1 with ignoring specific regex patterns + """ + predictions = ["Open123 Source", "HaystackAI"] + labels = ["Source", "HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # Ignore numeric patterns + regex_to_ignore = [r"\d+"] + f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore) + + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_multiple_ignore_regex(self): + """ + Test f1 with multiple ignoring parameters + """ + predictions = ["Open123! Source", "Haystack.AI"] + labels = ["Source", "HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # Ignore numeric patterns and punctuation excluding whitespaces + regex_to_ignore = [r"\d+", r"[^\w\s]"] + f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore) + + assert f1_result["f1"] == pytest.approx(5 / 6) + + def test_f1_multiple_ignore_combination(self): + """ + Test f1 with multiple ignoring parameters combined + """ + predictions = ["Open%123. !$Source", "Haystack.AI##"] + labels = ["Source", "HaystackAI"] + + evaluation_result = self.create_evaluation_result(predictions, labels) + # Ignore only special characters using regex + regex_to_ignore = [r"[^\w\s\d]+"] + f1_result = evaluation_result._calculate_f1( + output_key="answers", + ignore_numbers=True, + ignore_punctuation=True, + ignore_case=True, + regexes_to_ignore=regex_to_ignore, + ) + + assert f1_result["f1"] == pytest.approx(5 / 6) From 9994c8756a43418f21f6059a449bdca20d1dd490 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 24 Jan 2024 17:07:22 +0530 Subject: [PATCH 2/2] Add release notes --- releasenotes/notes/add-f1-d54cc900bec753f7.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 releasenotes/notes/add-f1-d54cc900bec753f7.yaml diff --git a/releasenotes/notes/add-f1-d54cc900bec753f7.yaml b/releasenotes/notes/add-f1-d54cc900bec753f7.yaml new file mode 100644 index 0000000000..6c6fcabdcd --- /dev/null +++ b/releasenotes/notes/add-f1-d54cc900bec753f7.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Adds support for the F1 metric to `EvaluationResult.calculate_metrics(...)`: + ```python + from haystack.evaluation.metrics import Metric + f1_metric = eval_result.calculate_metrics(Metric.F1, output_key="answers") + ```