feat: Add calculate_metrics and MetricsResult #6680

Merged · 15 commits · Jan 10, 2024
13 changes: 12 additions & 1 deletion e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -1,12 +1,15 @@
import json

from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import Document, ExtractedAnswer
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_extractive_qa_pipeline():
def test_extractive_qa_pipeline(tmp_path):
    # Create the pipeline
    qa_pipeline = Pipeline()
    qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
@@ -123,3 +126,11 @@ def test_extractive_qa_pipeline():
    assert eval_result.expected_outputs == expected_outputs
    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

    metrics = eval_result.calculate_metrics(Metric.EM)
    # Save metric results to json
    metrics.save(tmp_path / "exact_match_score.json")

    assert metrics["exact_match"] == 1.0
    with open(tmp_path / "exact_match_score.json", "r") as f:
        assert metrics == json.load(f)
23 changes: 21 additions & 2 deletions e2e/pipelines/test_eval_rag_pipelines.py
@@ -1,3 +1,5 @@
import json

from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
@@ -8,9 +10,10 @@
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_bm25_rag_pipeline():
def test_bm25_rag_pipeline(tmp_path):
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
Expand Down Expand Up @@ -68,8 +71,16 @@ def test_bm25_rag_pipeline():
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)


def test_embedding_retrieval_rag_pipeline():
def test_embedding_retrieval_rag_pipeline(tmp_path):
    # Create the RAG pipeline
    prompt_template = """
    Given these documents, answer the question.\nDocuments:
@@ -143,3 +154,11 @@ def test_embedding_retrieval_rag_pipeline():
    assert eval_result.expected_outputs == expected_outputs
    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

    metrics = eval_result.calculate_metrics(Metric.EM)
    # Save metric results to json
    metrics.save(tmp_path / "exact_match_score.json")

    assert metrics["exact_match"] == 1.0
    with open(tmp_path / "exact_match_score.json", "r") as f:
        assert metrics == json.load(f)
3 changes: 2 additions & 1 deletion haystack/evaluation/__init__.py
@@ -1,3 +1,4 @@
from haystack.evaluation.eval import EvaluationResult, eval
from haystack.evaluation.metrics import Metric, MetricsResult

__all__ = ["eval", "EvaluationResult"]
__all__ = ["eval", "EvaluationResult", "Metric", "MetricsResult"]
55 changes: 50 additions & 5 deletions haystack/evaluation/eval.py
@@ -1,12 +1,14 @@
from typing import Any, Dict, List, Union
from typing import Any, Callable, Dict, List, Union

from haystack import Pipeline
from haystack.core.component import Component
from haystack.evaluation.metrics import Metric, MetricsResult


class EvaluationResult:
"""
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs.
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or
component), inputs, outputs, and expected outputs.
The EvaluationResult keeps track of all the information stored by eval.

:param runnable: The runnable (Pipeline or component) used for evaluation.
@@ -27,6 +29,48 @@ def __init__(
        self.outputs = outputs
        self.expected_outputs = expected_outputs

        # Mapping of metrics to their corresponding functions.
        # This should be kept in sync with the Metric enum
        self._supported_metrics = {
            Metric.RECALL: self._calculate_recall,
            Metric.MRR: self._calculate_mrr,
            Metric.MAP: self._calculate_map,
            Metric.F1: self._calculate_f1,
            Metric.EM: self._calculate_em,
            Metric.SAS: self._calculate_sas,
        }

    def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult:
        """
        Calculate evaluation metrics based on the provided Metric or using the custom metric function.

        :param metric: The Metric indicating the type of metric to calculate or custom function to compute.
        :return: MetricsResult containing the calculated metric.
        """

        if isinstance(metric, Metric):
            return self._supported_metrics[metric](**kwargs)

        return metric(self, **kwargs)

    def _calculate_recall(self):
        return MetricsResult({"recall": None})

    def _calculate_map(self):
        return MetricsResult({"mean_average_precision": None})

    def _calculate_mrr(self):
        return MetricsResult({"mean_reciprocal_rank": None})

    def _calculate_f1(self):
        return MetricsResult({"f1": None})

    def _calculate_em(self):
        return MetricsResult({"exact_match": 1.0})
    def _calculate_sas(self):
        return MetricsResult({"sas": None})


def eval(
    runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
@@ -41,16 +85,17 @@ def eval(
    :param inputs: List of inputs used for evaluation.
    :param expected_outputs: List of expected outputs used for evaluation.

    :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs.
    :return: An instance of EvaluationResult containing information about the evaluation, including the runnable,
        inputs, outputs, and expected outputs.
    """

    outputs = []

    # Check that expected outputs has the correct shape
    if len(inputs) != len(expected_outputs):
        raise ValueError(
            f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)}). "
            " Please ensure that each input has a corresponding expected output."
            f"The number of inputs ({len(inputs)}) does not match the number of expected outputs "
            f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output."
        )

    for input_ in inputs:
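
To illustrate the dispatch in calculate_metrics(), here is a minimal sketch (not part of the PR) that builds an EvaluationResult directly with placeholder data and exercises both a built-in Metric and a custom callable:

from haystack import Pipeline
from haystack.evaluation import EvaluationResult, Metric, MetricsResult

# Placeholder runnable and empty lists; eval() would normally populate these.
result = EvaluationResult(runnable=Pipeline(), inputs=[], outputs=[], expected_outputs=[])

# Built-in metric: looked up in the _supported_metrics mapping.
em = result.calculate_metrics(Metric.EM)
print(em)  # {'exact_match': 1.0} with the current placeholder implementation

# Custom metric: any callable that takes the EvaluationResult and returns a MetricsResult.
def output_count(eval_result: EvaluationResult, **kwargs) -> MetricsResult:
    return MetricsResult({"n_outputs": len(eval_result.outputs)})

print(result.calculate_metrics(output_count))  # {'n_outputs': 0}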
32 changes: 32 additions & 0 deletions haystack/evaluation/metrics.py
@@ -0,0 +1,32 @@
import json
from enum import Enum
from pathlib import Path
from typing import Union


class Metric(Enum):
    """
    Contains a list of standard metrics available.
    """

    RECALL = "Recall"
    MRR = "Mean Reciprocal Rank"
    MAP = "Mean Average Precision"
    F1 = "F1"
    EM = "Exact Match"
    SAS = "Semantic Answer Similarity"


class MetricsResult(dict):
    """
    Stores the metric values computed during the evaluation.
    """

    def save(self, file: Union[str, Path]):
        """
        Save the metrics stored in the MetricsResult to a json file.

        :param file: The file path or file name to save the data.
        """
        with open(file, "w") as outfile:
            json.dump(self, outfile, indent=4)
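
Because MetricsResult subclasses dict, save() is a plain JSON dump and reloading it round-trips cleanly; a small sketch (file name is arbitrary, chosen for illustration):

import json
from pathlib import Path

from haystack.evaluation.metrics import MetricsResult

metrics = MetricsResult({"exact_match": 1.0})
path = Path("exact_match_score.json")  # arbitrary output location for this sketch

metrics.save(path)  # writes indented JSON via json.dump

with open(path, "r") as f:
    assert json.load(f) == metrics  # dict equality holds since MetricsResult is a dict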
@@ -0,0 +1,6 @@
---
features:
  - |
    Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics.
    Adds `Metric` class to store the list of available metrics.
    Adds `MetricsResult` class to store the metric values computed during the evaluation.
42 changes: 42 additions & 0 deletions test/evaluation/test_eval.py
@@ -0,0 +1,42 @@
from unittest.mock import MagicMock

from haystack.core.pipeline import Pipeline
from haystack.evaluation.eval import EvaluationResult
from haystack.evaluation.metrics import Metric


class TestEvaluationResult:
    def test_init(self):
        runnable = Pipeline()
        result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

        assert result.runnable == runnable
        assert result.inputs == []
        assert result.outputs == []
        assert result.expected_outputs == []

    def test_supported_metrics_contains_all_metrics(self):
        runnable = Pipeline()
        result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

        supported_metrics = [m.name for m in result._supported_metrics.keys()]
        all_metric_names = [m.name for m in Metric]
        assert supported_metrics == all_metric_names

    def test_calculate_metrics_with_supported_metric(self):
        runnable = Pipeline()
        result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])
        result._supported_metrics[Metric.RECALL] = MagicMock()
        result.calculate_metrics(metric=Metric.RECALL)

        result._supported_metrics[Metric.RECALL].assert_called_once_with()

    def test_calculate_metrics_with_non_supported_metric(self):
        runnable = Pipeline()
        result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

        unsupported_metric = MagicMock()

        result.calculate_metrics(metric=unsupported_metric, some_argument="some_value")

        unsupported_metric.assert_called_once_with(result, some_argument="some_value")