feat: Add F1 metric #6822

Merged · 3 commits · Jan 26, 2024
28 changes: 21 additions & 7 deletions e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -115,14 +115,28 @@ def test_extractive_qa_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)
56 changes: 42 additions & 14 deletions e2e/pipelines/test_eval_rag_pipelines.py
@@ -116,17 +116,31 @@ def test_bm25_rag_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)


def test_embedding_retrieval_rag_pipeline(tmp_path):
@@ -248,14 +262,28 @@ def test_embedding_retrieval_rag_pipeline(tmp_path):
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
metrics_custom_parameters = eval_result.calculate_metrics(
# Test Exact Match
em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
em_custom_parameters = eval_result.calculate_metrics(
Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save metric results to json
metrics_default.save(tmp_path / "exact_match_score.json")
# Save EM metric results to json
em_default.save(tmp_path / "exact_match_score.json")

assert metrics_default["exact_match"] == 1.0
assert metrics_custom_parameters["exact_match"] == 1.0
assert em_default["exact_match"] == 1.0
assert em_custom_parameters["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics_default == json.load(f)
assert em_default == json.load(f)

# Test F1
f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
f1_custom_parameters = eval_result.calculate_metrics(
Metric.F1, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
)
# Save F1 metric results to json
f1_default.save(tmp_path / "f1_score.json")

assert f1_default["f1"] == 1.0
assert f1_custom_parameters["f1"] == 1.0
with open(tmp_path / "f1_score.json", "r") as f:
assert f1_default == json.load(f)
65 changes: 63 additions & 2 deletions haystack/evaluation/eval.py
@@ -1,3 +1,4 @@
import collections
from typing import Any, Callable, Dict, List, Union

import numpy as np
@@ -71,8 +72,68 @@ def _calculate_map(self):
def _calculate_mrr(self):
return MetricsResult({"mean_reciprocal_rank": None})

def _calculate_f1(self):
return MetricsResult({"f1": None})
def _compute_f1_single(self, label_toks: List[str], pred_toks: List[str]) -> float:
"""
Compute F1 score for a single sample.
"""
common: collections.Counter = collections.Counter(label_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(label_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(label_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(label_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1

def _calculate_f1(
self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False
) -> MetricsResult:
"""
Calculates the F1 score between two lists of predictions and labels.
F1 score measures the word overlap between the predicted text and the corresponding ground truth label.

:param output_key: The key of the output to use for comparison.
:param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings
matching these regular expressions from both predictions and labels before comparison. Defaults to None.
:param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False.
:param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before
comparison. Defaults to False.
:param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels
before comparison. Defaults to False.
:return: A MetricsResult object containing the calculated F1 score.
"""

predictions = get_answers_from_output(
outputs=self.outputs, output_key=output_key, runnable_type=self.runnable_type
)
labels = get_answers_from_output(
outputs=self.expected_outputs, output_key=output_key, runnable_type=self.runnable_type
)

if len(predictions) != len(labels):
raise ValueError("The number of predictions and labels must be the same.")
if len(predictions) == len(labels) == 0:
# Return F1 as 0 for no inputs
return MetricsResult({"f1": 0.0})

predictions = preprocess_text(predictions, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)
labels = preprocess_text(labels, regexes_to_ignore, ignore_case, ignore_punctuation, ignore_numbers)

# Tokenize by splitting on spaces
tokenized_predictions = [pred.split() for pred in predictions]
tokenized_labels = [label.split() for label in labels]

f1_scores = [
self._compute_f1_single(label_toks, pred_toks)
for label_toks, pred_toks in zip(tokenized_labels, tokenized_predictions)
]

f1 = np.mean(f1_scores)

return MetricsResult({"f1": f1})

def _calculate_em(
self, output_key: str, regexes_to_ignore=None, ignore_case=False, ignore_punctuation=False, ignore_numbers=False
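For readers of this diff, here is a minimal standalone sketch of the token-overlap F1 computed by `_compute_f1_single` above, assuming whitespace tokenization as in `_calculate_f1` (the `token_f1` helper name is ours, not part of the PR):

```python
from collections import Counter
from typing import List


def token_f1(label_toks: List[str], pred_toks: List[str]) -> float:
    # Tokens that appear in both the label and the prediction (with multiplicity).
    common = Counter(label_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(label_toks) == 0 or len(pred_toks) == 0:
        # If either side is empty, F1 is 1 when both are empty, else 0.
        return float(label_toks == pred_toks)
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(label_toks)
    return 2 * precision * recall / (precision + recall)


# Prediction "Open Source" vs. label "Source": precision 1/2, recall 1/1 -> F1 = 2/3
print(token_f1("Source".split(), "Open Source".split()))  # 0.666...
```

The per-sample scores are then averaged with `np.mean`, which yields the single `f1` value returned in `MetricsResult({"f1": f1})`.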
8 changes: 8 additions & 0 deletions releasenotes/notes/add-f1-d54cc900bec753f7.yaml
@@ -0,0 +1,8 @@
---
features:
- |
Adds support for the F1 metric to `EvaluationResult.calculate_metrics(...)`:
```python
from haystack.evaluation.metrics import Metric
f1_metric = eval_result.calculate_metrics(Metric.F1, output_key="answers")
```
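The normalization parameters available for Exact Match can also be passed for F1, as exercised by the e2e tests in this PR. A short usage sketch, assuming `eval_result` comes from an earlier pipeline evaluation as in those tests:

```python
f1_custom = eval_result.calculate_metrics(
    Metric.F1,
    output_key="answers",
    ignore_case=True,
    ignore_punctuation=True,
    ignore_numbers=True,
)
print(f1_custom["f1"])
```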
178 changes: 178 additions & 0 deletions test/evaluation/test_eval_f1.py
@@ -0,0 +1,178 @@
import pytest

from haystack import Pipeline
from haystack.dataclasses import GeneratedAnswer
from haystack.evaluation.eval import EvaluationResult


class TestF1:
def create_evaluation_result(self, predictions, labels):
"""
Creates an evaluation result for a RAG pipeline from the given predictions and labels, for testing the F1 metric.
"""
runnable = Pipeline()
inputs = []
outputs = [
{"answer_builder": {"answers": [GeneratedAnswer(data=pred, query="", documents=[], meta={})]}}
for pred in predictions
]
expected_outputs = [
{"answer_builder": {"answers": [GeneratedAnswer(data=label, query="", documents=[], meta={})]}}
for label in labels
]
evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
return evaluation_result

def test_f1_empty_inputs(self):
"""
Test f1 with empty inputs
"""
runnable = Pipeline()
inputs = []
outputs = [
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
]
expected_outputs = [
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
{"answer_builder": {"answers": []}},
]
evaluation_result = EvaluationResult(runnable, inputs, outputs, expected_outputs)
# Expecting 0% f1 for empty inputs
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == 0.0

def test_calculate_f1_with_different_lengths(self):
"""
Test that f1 raises a ValueError when predictions and labels have different lengths
"""
predictions = ["OpenSource", "HaystackAI", "LLMs"]
labels = ["OpenSource", "HaystackAI"]
evaluation_result = self.create_evaluation_result(predictions, labels)

with pytest.raises(ValueError, match="The number of predictions and labels must be the same."):
evaluation_result._calculate_f1(output_key="answers")

def test_f1_same_inputs(self):
"""
Test f1 with identical predictions and labels
"""
predictions = ["OpenSource", "HaystackAI", "LLMs"]
labels = ["OpenSource", "HaystackAI", "LLMs"]
evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == 1.0

def test_f1_single_word(self):
"""
Test f1 with a single-word label that partially overlaps the prediction
"""
predictions = ["Open Source"]
labels = ["Source"]

evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == pytest.approx(2 / 3)

def test_f1_negative_case(self):
"""
Test f1 with deliberately mismatched predictions and labels
"""
predictions = ["Open Source", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
f1_result = evaluation_result._calculate_f1(output_key="answers")

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_case(self):
"""
Test f1 with ignoring case sensitivity
"""
predictions = ["Open Source", "HaystackAI"]
labels = ["source", "HAYSTACKAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring case
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_case=True)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_punctuation(self):
"""
Test f1 with ignoring punctuation
"""
predictions = ["Open Source!", "Haystack.AI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring punctuation
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_punctuation=True)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_ignore_numbers(self):
"""
Test f1 with ignoring numbers
"""
predictions = ["Open Source123", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# F1 after ignoring numbers
f1_result = evaluation_result._calculate_f1(output_key="answers", ignore_numbers=True)
assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_regex_ignore(self):
"""
Test f1 with ignoring specific regex patterns
"""
predictions = ["Open123 Source", "HaystackAI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore numeric patterns
regex_to_ignore = [r"\d+"]
f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_multiple_ignore_regex(self):
"""
Test f1 with multiple ignoring parameters
"""
predictions = ["Open123! Source", "Haystack.AI"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore numeric patterns and punctuation excluding whitespaces
regex_to_ignore = [r"\d+", r"[^\w\s]"]
f1_result = evaluation_result._calculate_f1(output_key="answers", regexes_to_ignore=regex_to_ignore)

assert f1_result["f1"] == pytest.approx(5 / 6)

def test_f1_multiple_ignore_combination(self):
"""
Test f1 with multiple ignoring parameters combined
"""
predictions = ["Open%123. !$Source", "Haystack.AI##"]
labels = ["Source", "HaystackAI"]

evaluation_result = self.create_evaluation_result(predictions, labels)
# Ignore only special characters using regex
regex_to_ignore = [r"[^\w\s\d]+"]
f1_result = evaluation_result._calculate_f1(
output_key="answers",
ignore_numbers=True,
ignore_punctuation=True,
ignore_case=True,
regexes_to_ignore=regex_to_ignore,
)

assert f1_result["f1"] == pytest.approx(5 / 6)
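As a sanity check on the `5 / 6` expectations used throughout these tests: each of those cases has one sample where a two-token prediction overlaps a one-token label on a single token (precision 1/2, recall 1, F1 = 2/3) and one sample that matches exactly (F1 = 1.0), so the mean F1 is 5/6. A quick standalone calculation, not part of the PR:

```python
# Sample 1: "Open Source" vs. "Source" -> precision 1/2, recall 1/1 -> F1 = 2/3
# Sample 2: identical normalized strings -> F1 = 1.0
per_sample = [2 / 3, 1.0]
mean_f1 = sum(per_sample) / len(per_sample)
print(mean_f1)  # 0.8333... == 5/6
```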