feat: Add calculate_metrics and MetricsResult (#6680)
* Add calculate_metrics, MetricsResult, Exact Match

* Add additional tests for metric calculation

* Add release notes

* Add docstring for Exact Match metric

* Remove Exact Match Implementation

* Update release notes

* Remove unnecessary metrics implementation

* Simplify logic to run supported metrics

* Add some evaluation tests

* Fix linting

---------

Co-authored-by: Silvano Cerza <[email protected]>
3 people authored Jan 10, 2024
1 parent e6d6ce1 commit 374a937
Showing 7 changed files with 165 additions and 9 deletions.
13 changes: 12 additions & 1 deletion e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -1,12 +1,15 @@
import json

from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import Document, ExtractedAnswer
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_extractive_qa_pipeline():
def test_extractive_qa_pipeline(tmp_path):
# Create the pipeline
qa_pipeline = Pipeline()
qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
@@ -123,3 +126,11 @@ def test_extractive_qa_pipeline():
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)
23 changes: 21 additions & 2 deletions e2e/pipelines/test_eval_rag_pipelines.py
@@ -1,3 +1,5 @@
import json

from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
@@ -8,9 +10,10 @@
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_bm25_rag_pipeline():
def test_bm25_rag_pipeline(tmp_path):
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
@@ -68,8 +71,16 @@ def test_bm25_rag_pipeline():
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)


def test_embedding_retrieval_rag_pipeline():
def test_embedding_retrieval_rag_pipeline(tmp_path):
# Create the RAG pipeline
prompt_template = """
Given these documents, answer the question.\nDocuments:
@@ -143,3 +154,11 @@ def test_embedding_retrieval_rag_pipeline():
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

metrics = eval_result.calculate_metrics(Metric.EM)
# Save metric results to json
metrics.save(tmp_path / "exact_match_score.json")

assert metrics["exact_match"] == 1.0
with open(tmp_path / "exact_match_score.json", "r") as f:
assert metrics == json.load(f)
3 changes: 2 additions & 1 deletion haystack/evaluation/__init__.py
@@ -1,3 +1,4 @@
from haystack.evaluation.eval import EvaluationResult, eval
from haystack.evaluation.metrics import Metric, MetricsResult

__all__ = ["eval", "EvaluationResult"]
__all__ = ["eval", "EvaluationResult", "Metric", "MetricsResult"]
55 changes: 50 additions & 5 deletions haystack/evaluation/eval.py
@@ -1,12 +1,14 @@
from typing import Any, Dict, List, Union
from typing import Any, Callable, Dict, List, Union

from haystack import Pipeline
from haystack.core.component import Component
from haystack.evaluation.metrics import Metric, MetricsResult


class EvaluationResult:
"""
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs.
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or
component), inputs, outputs, and expected outputs.
The EvaluationResult keeps track of all the information stored by eval.
:param runnable: The runnable (Pipeline or component) used for evaluation.
@@ -27,6 +29,48 @@ def __init__(
self.outputs = outputs
self.expected_outputs = expected_outputs

# Mapping of metrics to their corresponding functions.
# This should be kept in sync with the Metric enum
self._supported_metrics = {
Metric.RECALL: self._calculate_recall,
Metric.MRR: self._calculate_mrr,
Metric.MAP: self._calculate_map,
Metric.F1: self._calculate_f1,
Metric.EM: self._calculate_em,
Metric.SAS: self._calculate_sas,
}

def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult:
"""
Calculate evaluation metrics based on the provided Metric or using the custom metric function.
:param metric: The Metric indicating the type of metric to calculate or custom function to compute.
:return: MetricsResult containing the calculated metric.
"""

if isinstance(metric, Metric):
return self._supported_metrics[metric](**kwargs)

return metric(self, **kwargs)

def _calculate_recall(self):
return MetricsResult({"recall": None})

def _calculate_map(self):
return MetricsResult({"mean_average_precision": None})

def _calculate_mrr(self):
return MetricsResult({"mean_reciprocal_rank": None})

def _calculate_f1(self):
return MetricsResult({"f1": None})

def _calculate_em(self):
return MetricsResult({"exact_match": 1.0})

def _calculate_sas(self):
return MetricsResult({"exact_match": None})


def eval(
runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
@@ -41,16 +85,17 @@ def eval(
:param inputs: List of inputs used for evaluation.
:param expected_outputs: List of expected outputs used for evaluation.
:return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs.
:return: An instance of EvaluationResult containing information about the evaluation, including the runnable,
inputs, outputs, and expected outputs.
"""

outputs = []

# Check that expected outputs has the correct shape
if len(inputs) != len(expected_outputs):
raise ValueError(
f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)}). "
" Please ensure that each input has a corresponding expected output."
f"The number of inputs ({len(inputs)}) does not match the number of expected outputs "
f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output."
)

for input_ in inputs:
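Besides the built-in `Metric` values, `calculate_metrics` also accepts any callable that takes the `EvaluationResult` followed by keyword arguments and returns a `MetricsResult` (it is invoked as `metric(self, **kwargs)`). A minimal sketch of that extension point; the metric name `sample_count` and the `label` keyword are illustrative choices, not part of the commit:

```python
from haystack import Pipeline
from haystack.evaluation.eval import EvaluationResult, eval
from haystack.evaluation.metrics import MetricsResult


def sample_count(eval_result: EvaluationResult, label: str = "sample_count") -> MetricsResult:
    # Hypothetical custom metric: report how many inputs were evaluated.
    return MetricsResult({label: len(eval_result.inputs)})


# Empty pipeline and empty input lists only keep the sketch self-contained.
eval_result = eval(Pipeline(), inputs=[], expected_outputs=[])

# Extra keyword arguments are forwarded to the custom callable.
metrics = eval_result.calculate_metrics(sample_count, label="n_samples")
assert metrics == {"n_samples": 0}
```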
32 changes: 32 additions & 0 deletions haystack/evaluation/metrics.py
@@ -0,0 +1,32 @@
import json
from enum import Enum
from pathlib import Path
from typing import Union


class Metric(Enum):
"""
Contains a list of standard metrics available.
"""

RECALL = "Recall"
MRR = "Mean Reciprocal Rank"
MAP = "Mean Average Precision"
F1 = "F1"
EM = "Exact Match"
SAS = "Semantic Answer Similarity"


class MetricsResult(dict):
"""
Stores the metric values computed during the evaluation.
"""

def save(self, file: Union[str, Path]):
"""
Save the metrics stored in the MetricsResult to a json file.
:param file: The file path or file name to save the data.
"""
with open(file, "w") as outfile:
json.dump(self, outfile, indent=4)
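Since `MetricsResult` is just a `dict` subclass with a `save()` helper, a saved file round-trips through `json.load` — the same check the e2e tests above perform. A small sketch; the file name is illustrative:

```python
import json
from pathlib import Path

from haystack.evaluation.metrics import MetricsResult

metrics = MetricsResult({"exact_match": 1.0})
assert metrics["exact_match"] == 1.0  # plain dict access

# save() writes the mapping as JSON, so reloading yields an equal dict.
path = Path("exact_match_score.json")  # illustrative location
metrics.save(path)
with open(path) as f:
    assert metrics == json.load(f)
```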
@@ -0,0 +1,6 @@
---
features:
- |
Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics.
Adds `Metric` class to store list of available metrics.
Adds `MetricsResult` class to store the metric values computed during the evaluation.
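As exercised in the e2e tests above, the new API roughly reads as follows. The empty pipeline and empty input lists only keep the sketch self-contained; a real run would pass a built pipeline with matching inputs and expected outputs:

```python
from haystack import Pipeline
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric

# Empty pipeline/inputs keep the example minimal; substitute a real pipeline in practice.
eval_result = eval(Pipeline(), inputs=[], expected_outputs=[])

metrics = eval_result.calculate_metrics(Metric.EM)  # placeholder implementation returns {"exact_match": 1.0}
metrics.save("exact_match_score.json")              # dump the scores to JSON
```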
42 changes: 42 additions & 0 deletions test/evaluation/test_eval.py
@@ -0,0 +1,42 @@
from unittest.mock import MagicMock

from haystack.core.pipeline import Pipeline
from haystack.evaluation.eval import EvaluationResult
from haystack.evaluation.metrics import Metric


class TestEvaluationResult:
def test_init(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

assert result.runnable == runnable
assert result.inputs == []
assert result.outputs == []
assert result.expected_outputs == []

def test_supported_metrics_contains_all_metrics(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

supported_metrics = [m.name for m in result._supported_metrics.keys()]
all_metric_names = [m.name for m in Metric]
assert supported_metrics == all_metric_names

def test_calculate_metrics_with_supported_metric(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])
result._supported_metrics[Metric.RECALL] = MagicMock()
result.calculate_metrics(metric=Metric.RECALL)

assert result._supported_metrics[Metric.RECALL].called_once_with()

def test_calculate_metrics_with_non_supported_metric(self):
runnable = Pipeline()
result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[])

unsupported_metric = MagicMock()

result.calculate_metrics(metric=unsupported_metric, some_argument="some_value")

assert unsupported_metric.called_once_with(some_argument="some_value")
