From cab6583def7e6834b0b0214680953495a5a7c02b Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:56:56 +0530 Subject: [PATCH 01/10] Add calculate_metrics, MetricsResult, Exact Match --- .../test_eval_extractive_qa_pipeline.py | 35 ++--- e2e/pipelines/test_eval_rag_pipelines.py | 135 ++++++++++++++++-- haystack/evaluation/__init__.py | 3 +- haystack/evaluation/eval.py | 95 +++++++++++- haystack/evaluation/eval_utils.py | 32 +++++ haystack/evaluation/metrics.py | 28 ++++ 6 files changed, 301 insertions(+), 27 deletions(-) create mode 100644 haystack/evaluation/eval_utils.py create mode 100644 haystack/evaluation/metrics.py diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py index 201a84e139..b9be2f2752 100644 --- a/e2e/pipelines/test_eval_extractive_qa_pipeline.py +++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py @@ -1,12 +1,15 @@ +import json + from haystack import Pipeline from haystack.components.readers import ExtractiveReader from haystack.components.retrievers import InMemoryBM25Retriever from haystack.dataclasses import Document, ExtractedAnswer from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval +from haystack.evaluation.metrics import Metric -def test_extractive_qa_pipeline(): +def test_extractive_qa_pipeline(tmp_path): # Create the pipeline qa_pipeline = Pipeline() qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever") @@ -32,11 +35,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Paris?", score=0.7713339924812317, data="Jean and I", - document=Document( - id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e", - content="My name is Jean and I live in Paris.", - score=0.33144005810482535, - ), + document=Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -62,11 +61,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Berlin?", score=0.7047999501228333, data="Mark and I", - document=Document( - id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20", - content="My name is Mark and I live in Berlin.", - score=0.33144005810482535, - ), + document=Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -92,11 +87,7 @@ def test_extractive_qa_pipeline(): query="Who lives in Rome?", score=0.7661304473876953, data="Giorgio and I", - document=Document( - id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27", - content="My name is Giorgio and I live in Rome.", - score=0.33144005810482535, - ), + document=Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), context=None, document_offset=ExtractedAnswer.Span(start=11, end=24), context_offset=None, @@ -123,3 +114,15 @@ def test_extractive_qa_pipeline(): assert eval_result.expected_outputs == expected_outputs assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == qa_pipeline.to_dict() + + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json 
+ metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == json.load(f) diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py index 7ff365dfad..5fe4716f57 100644 --- a/e2e/pipelines/test_eval_rag_pipelines.py +++ b/e2e/pipelines/test_eval_rag_pipelines.py @@ -1,3 +1,5 @@ +import json + from haystack import Pipeline from haystack.components.builders.answer_builder import AnswerBuilder from haystack.components.builders.prompt_builder import PromptBuilder @@ -5,12 +7,13 @@ from haystack.components.generators import HuggingFaceLocalGenerator from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever from haystack.components.writers import DocumentWriter -from haystack.dataclasses import Document +from haystack.dataclasses import Document, GeneratedAnswer from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval +from haystack.evaluation.metrics import Metric -def test_bm25_rag_pipeline(): +def test_bm25_rag_pipeline(tmp_path): prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in documents %} @@ -56,9 +59,54 @@ def test_bm25_rag_pipeline(): ] expected_outputs = [ - {"llm": {"replies": ["Jean"]}}, - {"llm": {"replies": ["Mark"]}}, - {"llm": {"replies": ["Giorgio"]}}, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Jean", + query="Who lives in Paris?", + documents=[ + Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Mark", + query="Who lives in Berlin?", + documents=[ + Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Giorgio", + query="Who lives in Rome?", + documents=[ + Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -68,8 +116,20 @@ def test_bm25_rag_pipeline(): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json + metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == 
json.load(f) + -def test_embedding_retrieval_rag_pipeline(): +def test_embedding_retrieval_rag_pipeline(tmp_path): # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: @@ -132,9 +192,54 @@ def test_embedding_retrieval_rag_pipeline(): ] expected_outputs = [ - {"llm": {"replies": ["Jean"]}}, - {"llm": {"replies": ["Mark"]}}, - {"llm": {"replies": ["Giorgio"]}}, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Jean", + query="Who lives in Paris?", + documents=[ + Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Mark", + query="Who lives in Berlin?", + documents=[ + Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, + { + "answer_builder": { + "answers": [ + GeneratedAnswer( + data="Giorgio", + query="Who lives in Rome?", + documents=[ + Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), + Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), + ], + meta={}, + ) + ] + } + }, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -143,3 +248,15 @@ def test_embedding_retrieval_rag_pipeline(): assert eval_result.expected_outputs == expected_outputs assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() + + metrics_default = eval_result.calculate_metrics(Metric.EM) + metrics_custom_parameters = eval_result.calculate_metrics( + Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True + ) + # Save metric results to json + metrics_default.save(tmp_path / "exact_match_score.json") + + assert metrics_default["exact_match"] == 1.0 + assert metrics_custom_parameters["exact_match"] == 1.0 + with open(tmp_path / "exact_match_score.json", "r") as f: + assert metrics_default == json.load(f) diff --git a/haystack/evaluation/__init__.py b/haystack/evaluation/__init__.py index 090aadc2cd..7313d1efea 100644 --- a/haystack/evaluation/__init__.py +++ b/haystack/evaluation/__init__.py @@ -1,3 +1,4 @@ from haystack.evaluation.eval import EvaluationResult, eval +from haystack.evaluation.metrics import Metric, MetricsResult -__all__ = ["eval", "EvaluationResult"] +__all__ = ["eval", "EvaluationResult", "Metric", "MetricsResult"] diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 0f9f1e8792..c2c77c11fd 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,7 +1,13 @@ -from typing import Any, Dict, List, Union +import re +import string +from typing import Any, Callable, Dict, List, Union + +import numpy as np from haystack import Pipeline from haystack.core.component import Component +from haystack.evaluation.eval_utils import get_answers_from_output +from haystack.evaluation.metrics import Metric, MetricsResult class EvaluationResult: @@ -27,6 +33,93 @@ def 
__init__( self.outputs = outputs self.expected_outputs = expected_outputs + # Determine the type of the runnable + if str(type(runnable).__name__) == "Pipeline": + self.runnable_type = "pipeline" + else: + self.runnable_type = "component" + + # pylint: disable=too-many-return-statements + def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: + """ + Calculate evaluation metrics based on the provided Metric or using the custom metric function. + + :param metric: The Metric indicating the type of metric to calculate or custom function to compute. + :return: MetricsResult containing the calculated metric. + """ + if metric == Metric.RECALL: + return self._calculate_recall(**kwargs) + + elif metric == Metric.F1: + return self._calculate_f1(**kwargs) + + elif metric == Metric.MRR: + return self._calculate_mrr(**kwargs) + + elif metric == Metric.MAP: + return self._calculate_map(**kwargs) + + elif metric == Metric.EM: + predictions = get_answers_from_output(self.outputs, self.runnable_type) + labels = get_answers_from_output(self.expected_outputs, self.runnable_type) + return self._calculate_em(predictions=predictions, labels=labels, **kwargs) + + elif metric == Metric.SAS: + return self._calculate_sas(**kwargs) + + return metric(self, **kwargs) + + def _calculate_recall(self): + return MetricsResult({"recall": None}) + + def _calculate_map(self): + return MetricsResult({"mean_average_precision": None}) + + def _calculate_mrr(self): + return MetricsResult({"mean_reciprocal_rank": None}) + + def _calculate_f1(self): + return MetricsResult({"f1": None}) + + def _calculate_em( + self, + predictions, + labels, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, + ): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + labels = np.array([re.sub(s, "", x) for x in labels]) + else: + predictions = np.asarray(predictions) + labels = np.asarray(labels) + + if ignore_case: + predictions = np.char.lower(predictions) + labels = np.char.lower(labels) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + labels = np.char.translate(labels, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + labels = np.char.translate(labels, table=repl_table) + + score_list = predictions == labels + em = np.mean(score_list) + return MetricsResult({"exact_match": em}) + + def _calculate_sas(self): + val = 0 + return MetricsResult({"exact_match": val}) + def eval( runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]] diff --git a/haystack/evaluation/eval_utils.py b/haystack/evaluation/eval_utils.py new file mode 100644 index 0000000000..b03816a3de --- /dev/null +++ b/haystack/evaluation/eval_utils.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List + + +def get_answers_from_output(outputs: List[Dict[str, Any]], runnable_type: str) -> List[str]: + """ + Extracts the answers from the output of a pipeline or component. + + :param outputs: The outputs of the runnable. + :return: List of answers from the runnable output. 
+ """ + answers = [] + if runnable_type == "pipeline": + # Iterate over output from each Pipeline run + for output in outputs: + # Iterate over output of component in each Pipeline run + for component_output in output.values(): + # Only extract answers + for key in component_output.keys(): + if "answers" in key: + for generated_answer in component_output["answers"]: + if generated_answer.data: + answers.append(generated_answer.data) + else: + # Iterate over output from each Component run + for output in outputs: + # Only extract answers + for key in output.keys(): + if "answers" in key: + for generated_answer in output["answers"]: + if generated_answer.data: + answers.append(generated_answer.data) + return answers diff --git a/haystack/evaluation/metrics.py b/haystack/evaluation/metrics.py new file mode 100644 index 0000000000..483bf0eb56 --- /dev/null +++ b/haystack/evaluation/metrics.py @@ -0,0 +1,28 @@ +import json +from enum import Enum +from pathlib import Path +from typing import Union + + +class Metric(Enum): + """ + Contains a list of standard metrics available. + """ + + RECALL = "Recall" + MRR = "Mean Reciprocal Rank" + MAP = "Mean Average Precision" + F1 = "F1" + EM = "Exact Match" + SAS = "Semantic Answer Similarity" + + +class MetricsResult(dict): + def save(self, file: Union[str, Path]): + """ + Save the metrics stored in the MetricsResult to a json file. + + :param file: The file path or file name to save the data. + """ + with open(file, "w") as outfile: + json.dump(self, outfile, indent=4) From 70454b769c26ff0fb836642f6e7ff2941e2f4690 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:27:11 +0530 Subject: [PATCH 02/10] Add additional tests for metric calculation --- haystack/evaluation/eval.py | 16 ++- haystack/evaluation/metrics.py | 4 + test/evaluation/__init__.py | 0 test/evaluation/test_eval_exact_match.py | 142 +++++++++++++++++++++++ test/evaluation/test_eval_utils.py | 125 ++++++++++++++++++++ 5 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 test/evaluation/__init__.py create mode 100644 test/evaluation/test_eval_exact_match.py create mode 100644 test/evaluation/test_eval_utils.py diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index c2c77c11fd..e795975c6b 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -12,7 +12,8 @@ class EvaluationResult: """ - EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs. + EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or + component), inputs, outputs, and expected outputs. The EvaluationResult keeps track of all the information stored by eval. :param runnable: The runnable (Pipeline or component) used for evaluation. @@ -90,6 +91,12 @@ def _calculate_em( ignore_punctuation=False, ignore_numbers=False, ): + if len(predictions) != len(labels): + raise ValueError("The number of predictions and labels must be the same.") + if len(predictions) == len(labels) == 0: + # Return Exact Match as 0 for no inputs + return MetricsResult({"exact_match": 0.0}) + if regexes_to_ignore is not None: for s in regexes_to_ignore: predictions = np.array([re.sub(s, "", x) for x in predictions]) @@ -134,7 +141,8 @@ def eval( :param inputs: List of inputs used for evaluation. :param expected_outputs: List of expected outputs used for evaluation. 
- :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs. + :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, + inputs, outputs, and expected outputs. """ outputs = [] @@ -142,8 +150,8 @@ def eval( # Check that expected outputs has the correct shape if len(inputs) != len(expected_outputs): raise ValueError( - f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)}). " - " Please ensure that each input has a corresponding expected output." + f"The number of inputs ({len(inputs)}) does not match the number of expected outputs " + f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output." ) for input_ in inputs: diff --git a/haystack/evaluation/metrics.py b/haystack/evaluation/metrics.py index 483bf0eb56..fbe2fec8af 100644 --- a/haystack/evaluation/metrics.py +++ b/haystack/evaluation/metrics.py @@ -18,6 +18,10 @@ class Metric(Enum): class MetricsResult(dict): + """ + Stores the metric values computed during the evaluation. + """ + def save(self, file: Union[str, Path]): """ Save the metrics stored in the MetricsResult to a json file. diff --git a/test/evaluation/__init__.py b/test/evaluation/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py new file mode 100644 index 0000000000..e7e77e8b56 --- /dev/null +++ b/test/evaluation/test_eval_exact_match.py @@ -0,0 +1,142 @@ +import numpy as np +import pytest + +from haystack.evaluation.eval import EvaluationResult + + +# Define test cases for _calculate_em function +class TestExactMatch: + @pytest.fixture + def evaluation_result(self): + runnable = None + inputs = [] + outputs = [] + expected_outputs = [] + eval_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) + return eval_result + + def test_exact_match(self, evaluation_result): + """ + Test exact match with default parameters + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + em_result = evaluation_result._calculate_em(predictions, labels) + + assert em_result["exact_match"] == 1.0 + + def test_exact_match_empty_inputs(self, evaluation_result): + """ + Test exact match with empty inputs + """ + predictions = [] + labels = [] + # Expecting 0% exact match for empty inputs + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 0.0 + + def test_exact_match_different_data_types(self, evaluation_result): + """ + Test exact match with different data types (numpy arrays) + """ + predictions = np.array(["OpenSource", "HaystackAI", "LLMs"]) + labels = np.array(["OpenSource", "HaystackAI", "LLMs"]) + + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_single_word(self, evaluation_result): + """ + Test exact match with single-word inputs + """ + predictions = ["OpenSource"] + labels = ["OpenSource"] + + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_negative_case(self, evaluation_result): + """ + Test exact match with deliberately mismatched predictions and labels + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["Source", "HaystackAI", "LLMs"] + 
expected_em = 2 / 3 # Expecting EM to be 2/3 as 2 out of 3 items match + em_result = evaluation_result._calculate_em(predictions, labels) + assert em_result["exact_match"] == expected_em + + def test_exact_match_ignore_case(self, evaluation_result): + """ + Test exact match with ignoring case sensitivity + """ + predictions = ["OpenSource", "HaystackAI", "LLMs"] + labels = ["opensource", "HAYSTACKAI", "llMs"] + + # Exact match after case ignoring + em_result = evaluation_result._calculate_em(predictions, labels, ignore_case=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_ignore_punctuation(self, evaluation_result): + """ + Test exact match with ignoring punctuation + """ + predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Exact match after ignoring punctuation + em_result = evaluation_result._calculate_em(predictions, labels, ignore_punctuation=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_ignore_numbers(self, evaluation_result): + """ + Test exact match with ignoring numbers + """ + predictions = ["OpenSource123", "HaystackAI", "LLMs456"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Exact match after ignoring numbers + em_result = evaluation_result._calculate_em(predictions, labels, ignore_numbers=True) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_regex_ignore(self, evaluation_result): + """ + Test exact match with ignoring specific regex patterns + """ + predictions = ["Open123Source", "HaystackAI", "LLMs456"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore numeric patterns + regex_to_ignore = [r"\d+"] + em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_multiple_ignore_regex(self, evaluation_result): + """ + Test exact match with multiple ignoring parameters + """ + predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore numeric patterns and punctuation using regex + regex_to_ignore = [r"\d+", r"\W+"] + em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) + assert em_result["exact_match"] == 1.0 + + def test_exact_match_multiple_ignore_combination(self, evaluation_result): + """ + Test exact match with multiple ignoring parameters combined + """ + predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] + labels = ["OpenSource", "HaystackAI", "LLMs"] + + # Ignore only special characters using regex + regex_to_ignore = [r"[^\w\s\d]+"] + em_result = evaluation_result._calculate_em( + predictions, + labels, + ignore_numbers=True, + ignore_punctuation=True, + ignore_case=True, + regexes_to_ignore=regex_to_ignore, + ) + assert em_result["exact_match"] == 1.0 diff --git a/test/evaluation/test_eval_utils.py b/test/evaluation/test_eval_utils.py new file mode 100644 index 0000000000..3c67aeb579 --- /dev/null +++ b/test/evaluation/test_eval_utils.py @@ -0,0 +1,125 @@ +from haystack.dataclasses import GeneratedAnswer +from haystack.evaluation.eval_utils import get_answers_from_output + + +class TestEvalUtils: + def test_extract_answers_from_pipeline_output(self): + """ + Test that the function correctly extracts answers from the output of a pipeline. 
+ """ + outputs = [ + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + } + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] + } + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_extract_answers_from_component_output(self): + """ + Test that the function correctly extracts answers from the output of a component. + """ + outputs = [ + {"answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})]}, + {"answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})]}, + {"answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})]}, + ] + runnable_type = "component" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_ignore_other_output_keys(self): + """ + Test that the function only extracts answers and ignores other output keys. + """ + outputs = [ + { + "llm": {"replies": ["llm_reply_1"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + }, + }, + { + "llm": {"replies": ["llm_reply_2"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + }, + }, + { + "llm": {"replies": ["llm_reply_3"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] + }, + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark", "Giorgio"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_empty_outputs(self): + """ + Test that the function correctly handles empty outputs. + """ + outputs = [] + runnable_type = "pipeline" + expected_answers = [] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_missing_keys(self): + """ + Test that the function correctly handles outputs with missing keys. + """ + outputs = [ + { + "llm": {"replies": ["llm_reply_1"]}, + "answer_builder": { + "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] + }, + }, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + ] + + runnable_type = "pipeline" + expected_answers = ["Jean", "Mark"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers + + def test_handle_missing_values(self): + """ + Test that the function correctly handles outputs with missing values. 
+ """ + outputs = [ + {"answer_builder": {"answers": []}}, + { + "answer_builder": { + "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] + } + }, + ] + runnable_type = "pipeline" + expected_answers = ["Mark"] + + assert get_answers_from_output(outputs, runnable_type) == expected_answers From 3a224b33a4eed5afe73e822c084e7e10751f014c Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:27:36 +0530 Subject: [PATCH 03/10] Add release notes --- ...etrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml diff --git a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml b/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml new file mode 100644 index 0000000000..920eb88d34 --- /dev/null +++ b/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics. + Adds `Metric` class to store list of available metrics. + Adds `MetricsResult` class to store the metric values computed during the evaluation. + Adds a function to EvaluationResult for calculating the Exact Match metric. From 416d609524a3eaad3b67ebb3039e55d4de888ca0 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:38:51 +0530 Subject: [PATCH 04/10] Add docstring for Exact Match metric --- haystack/evaluation/eval.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index e795975c6b..77e4bca25b 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -91,6 +91,24 @@ def _calculate_em( ignore_punctuation=False, ignore_numbers=False, ): + """ + Calculates the Exact Match (EM) score between two lists of predictions and labels. + Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the + corresponding ground truth label. + + :param predictions: A list of predicted text strings. + :param labels (list): A list of ground truth (reference) text strings. + :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings + matching these regular expressions from both predictions and labels before comparison. Defaults to None. + :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. + :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before + comparison. Defaults to False. + :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels + before comparison. Defaults to False. + + :return: A MetricsResult object containing the calculated Exact Match (EM) score. 
+ """ + if len(predictions) != len(labels): raise ValueError("The number of predictions and labels must be the same.") if len(predictions) == len(labels) == 0: From 082e5c0c57d3269b717293696c3a3915bd11c138 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 13:56:48 +0530 Subject: [PATCH 05/10] Remove Exact Match Implementation --- .../test_eval_extractive_qa_pipeline.py | 30 ++-- e2e/pipelines/test_eval_rag_pipelines.py | 128 ++-------------- haystack/evaluation/eval.py | 72 +-------- haystack/evaluation/eval_utils.py | 32 ---- test/evaluation/__init__.py | 0 test/evaluation/test_eval_exact_match.py | 142 ------------------ test/evaluation/test_eval_utils.py | 125 --------------- 7 files changed, 38 insertions(+), 491 deletions(-) delete mode 100644 haystack/evaluation/eval_utils.py delete mode 100644 test/evaluation/__init__.py delete mode 100644 test/evaluation/test_eval_exact_match.py delete mode 100644 test/evaluation/test_eval_utils.py diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py index b9be2f2752..c0e171af18 100644 --- a/e2e/pipelines/test_eval_extractive_qa_pipeline.py +++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py @@ -35,7 +35,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Paris?", score=0.7713339924812317, data="Jean and I", - document=Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), + document=Document( + id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e", + content="My name is Jean and I live in Paris.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -61,7 +65,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Berlin?", score=0.7047999501228333, data="Mark and I", - document=Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), + document=Document( + id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20", + content="My name is Mark and I live in Berlin.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=21), context_offset=None, @@ -87,7 +95,11 @@ def test_extractive_qa_pipeline(tmp_path): query="Who lives in Rome?", score=0.7661304473876953, data="Giorgio and I", - document=Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), + document=Document( + id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27", + content="My name is Giorgio and I live in Rome.", + score=0.33144005810482535, + ), context=None, document_offset=ExtractedAnswer.Span(start=11, end=24), context_offset=None, @@ -115,14 +127,10 @@ def test_extractive_qa_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == qa_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / 
"exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py index 5fe4716f57..68251d328b 100644 --- a/e2e/pipelines/test_eval_rag_pipelines.py +++ b/e2e/pipelines/test_eval_rag_pipelines.py @@ -7,7 +7,7 @@ from haystack.components.generators import HuggingFaceLocalGenerator from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever from haystack.components.writers import DocumentWriter -from haystack.dataclasses import Document, GeneratedAnswer +from haystack.dataclasses import Document from haystack.document_stores import InMemoryDocumentStore from haystack.evaluation.eval import eval from haystack.evaluation.metrics import Metric @@ -59,54 +59,9 @@ def test_bm25_rag_pipeline(tmp_path): ] expected_outputs = [ - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Jean", - query="Who lives in Paris?", - documents=[ - Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Mark", - query="Who lives in Berlin?", - documents=[ - Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Giorgio", - query="Who lives in Rome?", - documents=[ - Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, + {"llm": {"replies": ["Jean"]}}, + {"llm": {"replies": ["Mark"]}}, + {"llm": {"replies": ["Giorgio"]}}, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -116,17 +71,13 @@ def test_bm25_rag_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) def test_embedding_retrieval_rag_pipeline(tmp_path): @@ -192,54 +143,9 @@ def test_embedding_retrieval_rag_pipeline(tmp_path): ] expected_outputs = [ - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Jean", - query="Who lives in Paris?", - documents=[ - Document(content="My name is Jean and I live in Paris.", 
score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Mark", - query="Who lives in Berlin?", - documents=[ - Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535), - Document(content="My name is Giorgio and I live in Rome.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, - { - "answer_builder": { - "answers": [ - GeneratedAnswer( - data="Giorgio", - query="Who lives in Rome?", - documents=[ - Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535), - Document(content="My name is Mark and I live in Berlin.", score=-0.17938556566116537), - Document(content="My name is Jean and I live in Paris.", score=-0.17938556566116537), - ], - meta={}, - ) - ] - } - }, + {"llm": {"replies": ["Jean"]}}, + {"llm": {"replies": ["Mark"]}}, + {"llm": {"replies": ["Giorgio"]}}, ] eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs) @@ -249,14 +155,10 @@ def test_embedding_retrieval_rag_pipeline(tmp_path): assert len(eval_result.outputs) == len(expected_outputs) == len(inputs) assert eval_result.runnable.to_dict() == rag_pipeline.to_dict() - metrics_default = eval_result.calculate_metrics(Metric.EM) - metrics_custom_parameters = eval_result.calculate_metrics( - Metric.EM, ignore_case=True, ignore_punctuation=True, ignore_numbers=True - ) + metrics = eval_result.calculate_metrics(Metric.EM) # Save metric results to json - metrics_default.save(tmp_path / "exact_match_score.json") + metrics.save(tmp_path / "exact_match_score.json") - assert metrics_default["exact_match"] == 1.0 - assert metrics_custom_parameters["exact_match"] == 1.0 + assert metrics["exact_match"] == 1.0 with open(tmp_path / "exact_match_score.json", "r") as f: - assert metrics_default == json.load(f) + assert metrics == json.load(f) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 77e4bca25b..9aee19f892 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -1,12 +1,7 @@ -import re -import string from typing import Any, Callable, Dict, List, Union -import numpy as np - from haystack import Pipeline from haystack.core.component import Component -from haystack.evaluation.eval_utils import get_answers_from_output from haystack.evaluation.metrics import Metric, MetricsResult @@ -61,9 +56,7 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], return self._calculate_map(**kwargs) elif metric == Metric.EM: - predictions = get_answers_from_output(self.outputs, self.runnable_type) - labels = get_answers_from_output(self.expected_outputs, self.runnable_type) - return self._calculate_em(predictions=predictions, labels=labels, **kwargs) + return self._calculate_em(**kwargs) elif metric == Metric.SAS: return self._calculate_sas(**kwargs) @@ -82,68 +75,11 @@ def _calculate_mrr(self): def _calculate_f1(self): return MetricsResult({"f1": None}) - def _calculate_em( - self, - predictions, - labels, - regexes_to_ignore=None, - ignore_case=False, - ignore_punctuation=False, - ignore_numbers=False, - ): - """ - Calculates the Exact Match (EM) score between two lists of predictions and labels. 
- Exact Match (EM) score measures the percentage of samples where the predicted text exactly matches the - corresponding ground truth label. - - :param predictions: A list of predicted text strings. - :param labels (list): A list of ground truth (reference) text strings. - :param regexes_to_ignore (list, optional): A list of regular expressions. If provided, it removes substrings - matching these regular expressions from both predictions and labels before comparison. Defaults to None. - :param ignore_case (bool, optional): If True, performs case-insensitive comparison. Defaults to False. - :param ignore_punctuation (bool, optional): If True, removes punctuation from both predictions and labels before - comparison. Defaults to False. - :param ignore_numbers (bool, optional): If True, removes numerical digits from both predictions and labels - before comparison. Defaults to False. - - :return: A MetricsResult object containing the calculated Exact Match (EM) score. - """ - - if len(predictions) != len(labels): - raise ValueError("The number of predictions and labels must be the same.") - if len(predictions) == len(labels) == 0: - # Return Exact Match as 0 for no inputs - return MetricsResult({"exact_match": 0.0}) - - if regexes_to_ignore is not None: - for s in regexes_to_ignore: - predictions = np.array([re.sub(s, "", x) for x in predictions]) - labels = np.array([re.sub(s, "", x) for x in labels]) - else: - predictions = np.asarray(predictions) - labels = np.asarray(labels) - - if ignore_case: - predictions = np.char.lower(predictions) - labels = np.char.lower(labels) - - if ignore_punctuation: - repl_table = string.punctuation.maketrans("", "", string.punctuation) - predictions = np.char.translate(predictions, table=repl_table) - labels = np.char.translate(labels, table=repl_table) - - if ignore_numbers: - repl_table = string.digits.maketrans("", "", string.digits) - predictions = np.char.translate(predictions, table=repl_table) - labels = np.char.translate(labels, table=repl_table) - - score_list = predictions == labels - em = np.mean(score_list) - return MetricsResult({"exact_match": em}) + def _calculate_em(self): + return MetricsResult({"exact_match": 1.0}) def _calculate_sas(self): - val = 0 - return MetricsResult({"exact_match": val}) + return MetricsResult({"exact_match": None}) def eval( diff --git a/haystack/evaluation/eval_utils.py b/haystack/evaluation/eval_utils.py deleted file mode 100644 index b03816a3de..0000000000 --- a/haystack/evaluation/eval_utils.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Any, Dict, List - - -def get_answers_from_output(outputs: List[Dict[str, Any]], runnable_type: str) -> List[str]: - """ - Extracts the answers from the output of a pipeline or component. - - :param outputs: The outputs of the runnable. - :return: List of answers from the runnable output. 
- """ - answers = [] - if runnable_type == "pipeline": - # Iterate over output from each Pipeline run - for output in outputs: - # Iterate over output of component in each Pipeline run - for component_output in output.values(): - # Only extract answers - for key in component_output.keys(): - if "answers" in key: - for generated_answer in component_output["answers"]: - if generated_answer.data: - answers.append(generated_answer.data) - else: - # Iterate over output from each Component run - for output in outputs: - # Only extract answers - for key in output.keys(): - if "answers" in key: - for generated_answer in output["answers"]: - if generated_answer.data: - answers.append(generated_answer.data) - return answers diff --git a/test/evaluation/__init__.py b/test/evaluation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/evaluation/test_eval_exact_match.py b/test/evaluation/test_eval_exact_match.py deleted file mode 100644 index e7e77e8b56..0000000000 --- a/test/evaluation/test_eval_exact_match.py +++ /dev/null @@ -1,142 +0,0 @@ -import numpy as np -import pytest - -from haystack.evaluation.eval import EvaluationResult - - -# Define test cases for _calculate_em function -class TestExactMatch: - @pytest.fixture - def evaluation_result(self): - runnable = None - inputs = [] - outputs = [] - expected_outputs = [] - eval_result = EvaluationResult(runnable, inputs, outputs, expected_outputs) - return eval_result - - def test_exact_match(self, evaluation_result): - """ - Test exact match with default parameters - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - em_result = evaluation_result._calculate_em(predictions, labels) - - assert em_result["exact_match"] == 1.0 - - def test_exact_match_empty_inputs(self, evaluation_result): - """ - Test exact match with empty inputs - """ - predictions = [] - labels = [] - # Expecting 0% exact match for empty inputs - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 0.0 - - def test_exact_match_different_data_types(self, evaluation_result): - """ - Test exact match with different data types (numpy arrays) - """ - predictions = np.array(["OpenSource", "HaystackAI", "LLMs"]) - labels = np.array(["OpenSource", "HaystackAI", "LLMs"]) - - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_single_word(self, evaluation_result): - """ - Test exact match with single-word inputs - """ - predictions = ["OpenSource"] - labels = ["OpenSource"] - - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_negative_case(self, evaluation_result): - """ - Test exact match with deliberately mismatched predictions and labels - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["Source", "HaystackAI", "LLMs"] - expected_em = 2 / 3 # Expecting EM to be 2/3 as 2 out of 3 items match - em_result = evaluation_result._calculate_em(predictions, labels) - assert em_result["exact_match"] == expected_em - - def test_exact_match_ignore_case(self, evaluation_result): - """ - Test exact match with ignoring case sensitivity - """ - predictions = ["OpenSource", "HaystackAI", "LLMs"] - labels = ["opensource", "HAYSTACKAI", "llMs"] - - # Exact match after case ignoring - em_result = evaluation_result._calculate_em(predictions, labels, ignore_case=True) - assert 
em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_punctuation(self, evaluation_result): - """ - Test exact match with ignoring punctuation - """ - predictions = ["OpenSource!", "Haystack.AI", "LLMs,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Exact match after ignoring punctuation - em_result = evaluation_result._calculate_em(predictions, labels, ignore_punctuation=True) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_ignore_numbers(self, evaluation_result): - """ - Test exact match with ignoring numbers - """ - predictions = ["OpenSource123", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Exact match after ignoring numbers - em_result = evaluation_result._calculate_em(predictions, labels, ignore_numbers=True) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_regex_ignore(self, evaluation_result): - """ - Test exact match with ignoring specific regex patterns - """ - predictions = ["Open123Source", "HaystackAI", "LLMs456"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore numeric patterns - regex_to_ignore = [r"\d+"] - em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_regex(self, evaluation_result): - """ - Test exact match with multiple ignoring parameters - """ - predictions = ["Open123!Source", "Haystack.AI", "LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore numeric patterns and punctuation using regex - regex_to_ignore = [r"\d+", r"\W+"] - em_result = evaluation_result._calculate_em(predictions, labels, regexes_to_ignore=regex_to_ignore) - assert em_result["exact_match"] == 1.0 - - def test_exact_match_multiple_ignore_combination(self, evaluation_result): - """ - Test exact match with multiple ignoring parameters combined - """ - predictions = ["Open%123!$Source", "Haystack.AI##", "^^LLMs456,"] - labels = ["OpenSource", "HaystackAI", "LLMs"] - - # Ignore only special characters using regex - regex_to_ignore = [r"[^\w\s\d]+"] - em_result = evaluation_result._calculate_em( - predictions, - labels, - ignore_numbers=True, - ignore_punctuation=True, - ignore_case=True, - regexes_to_ignore=regex_to_ignore, - ) - assert em_result["exact_match"] == 1.0 diff --git a/test/evaluation/test_eval_utils.py b/test/evaluation/test_eval_utils.py deleted file mode 100644 index 3c67aeb579..0000000000 --- a/test/evaluation/test_eval_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -from haystack.dataclasses import GeneratedAnswer -from haystack.evaluation.eval_utils import get_answers_from_output - - -class TestEvalUtils: - def test_extract_answers_from_pipeline_output(self): - """ - Test that the function correctly extracts answers from the output of a pipeline. 
- """ - outputs = [ - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - } - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] - } - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_extract_answers_from_component_output(self): - """ - Test that the function correctly extracts answers from the output of a component. - """ - outputs = [ - {"answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})]}, - {"answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})]}, - {"answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})]}, - ] - runnable_type = "component" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_ignore_other_output_keys(self): - """ - Test that the function only extracts answers and ignores other output keys. - """ - outputs = [ - { - "llm": {"replies": ["llm_reply_1"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - }, - }, - { - "llm": {"replies": ["llm_reply_2"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - }, - }, - { - "llm": {"replies": ["llm_reply_3"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Giorgio", query="Who lives in Rome?", documents=[], meta={})] - }, - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark", "Giorgio"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_empty_outputs(self): - """ - Test that the function correctly handles empty outputs. - """ - outputs = [] - runnable_type = "pipeline" - expected_answers = [] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_missing_keys(self): - """ - Test that the function correctly handles outputs with missing keys. - """ - outputs = [ - { - "llm": {"replies": ["llm_reply_1"]}, - "answer_builder": { - "answers": [GeneratedAnswer(data="Jean", query="Who lives in Paris?", documents=[], meta={})] - }, - }, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - ] - - runnable_type = "pipeline" - expected_answers = ["Jean", "Mark"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers - - def test_handle_missing_values(self): - """ - Test that the function correctly handles outputs with missing values. 
- """ - outputs = [ - {"answer_builder": {"answers": []}}, - { - "answer_builder": { - "answers": [GeneratedAnswer(data="Mark", query="Who lives in Berlin?", documents=[], meta={})] - } - }, - ] - runnable_type = "pipeline" - expected_answers = ["Mark"] - - assert get_answers_from_output(outputs, runnable_type) == expected_answers From a015fc9d1ea4c2410aad68eeefc525769ab2c012 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:00:52 +0530 Subject: [PATCH 06/10] Update release notes --- ...> add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml} | 1 - 1 file changed, 1 deletion(-) rename releasenotes/notes/{add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml => add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml} (77%) diff --git a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml b/releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml similarity index 77% rename from releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml rename to releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml index 920eb88d34..ab163705b7 100644 --- a/releasenotes/notes/add-calculate-metrics-metricsresults-exact-match-03bf27ce8b16cff5.yaml +++ b/releasenotes/notes/add-calculate-metrics-metricsresults-03bf27ce8b16cff5.yaml @@ -4,4 +4,3 @@ features: Adds `calculate_metrics()` function to EvaluationResult for computation of evaluation metrics. Adds `Metric` class to store list of available metrics. Adds `MetricsResult` class to store the metric values computed during the evaluation. - Adds a function to EvaluationResult for calculating the Exact Match metric. From f4c71e45d82dc675fa8f27a1266070d0d0c7a5ba Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:18:30 +0530 Subject: [PATCH 07/10] Remove unnecessary metrics implementation --- haystack/evaluation/eval.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 9aee19f892..3a2d053069 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -29,12 +29,6 @@ def __init__( self.outputs = outputs self.expected_outputs = expected_outputs - # Determine the type of the runnable - if str(type(runnable).__name__) == "Pipeline": - self.runnable_type = "pipeline" - else: - self.runnable_type = "component" - # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ From 50a75bddb0ca7495725a9fbe681c85706a060ac3 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:46:11 +0100 Subject: [PATCH 08/10] Simplify logic to run supported metrics --- haystack/evaluation/eval.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 3a2d053069..6fdc7138a6 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -29,6 +29,17 @@ def __init__( self.outputs = outputs self.expected_outputs = expected_outputs + # Mapping of metrics to their corresponding functions. 
+ # This should be kept in sync with the Metric enum + self._supported_metrics = { + Metric.RECALL: self._calculate_recall, + Metric.MRR: self._calculate_mrr, + Metric.MAP: self._calculate_map, + Metric.F1: self._calculate_f1, + Metric.EM: self._calculate_em, + Metric.SAS: self._calculate_sas, + } + # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ @@ -37,23 +48,9 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], :param metric: The Metric indicating the type of metric to calculate or custom function to compute. :return: MetricsResult containing the calculated metric. """ - if metric == Metric.RECALL: - return self._calculate_recall(**kwargs) - - elif metric == Metric.F1: - return self._calculate_f1(**kwargs) - - elif metric == Metric.MRR: - return self._calculate_mrr(**kwargs) - - elif metric == Metric.MAP: - return self._calculate_map(**kwargs) - - elif metric == Metric.EM: - return self._calculate_em(**kwargs) - elif metric == Metric.SAS: - return self._calculate_sas(**kwargs) + if metric in self._supported_metrics: + return self._supported_metrics[metric](**kwargs) return metric(self, **kwargs) From bbf7c4ddf0a75f8e83924074e5f2628e91833afc Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:46:26 +0100 Subject: [PATCH 09/10] Add some evaluation tests --- test/evaluation/test_eval.py | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 test/evaluation/test_eval.py diff --git a/test/evaluation/test_eval.py b/test/evaluation/test_eval.py new file mode 100644 index 0000000000..095c51d4f5 --- /dev/null +++ b/test/evaluation/test_eval.py @@ -0,0 +1,42 @@ +from unittest.mock import MagicMock + +from haystack.core.pipeline import Pipeline +from haystack.evaluation.eval import EvaluationResult +from haystack.evaluation.metrics import Metric + + +class TestEvaluationResult: + def test_init(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + assert result.runnable == runnable + assert result.inputs == [] + assert result.outputs == [] + assert result.expected_outputs == [] + + def test_supported_metrics_contains_all_metrics(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + supported_metrics = [m.name for m in result._supported_metrics.keys()] + all_metric_names = [m.name for m in Metric] + assert supported_metrics == all_metric_names + + def test_calculate_metrics_with_supported_metric(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + result._supported_metrics[Metric.RECALL] = MagicMock() + result.calculate_metrics(metric=Metric.RECALL) + + assert result._supported_metrics[Metric.RECALL].called_once_with() + + def test_calculate_metrics_with_non_supported_metric(self): + runnable = Pipeline() + result = EvaluationResult(runnable=runnable, inputs=[], outputs=[], expected_outputs=[]) + + unsupported_metric = MagicMock() + + result.calculate_metrics(metric=unsupported_metric, some_argument="some_value") + + assert unsupported_metric.called_once_with(some_argument="some_value") From 850b57b9cb3e7ec2e744cb3e92cfd57ce39f9de5 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 10 Jan 2024 09:57:08 +0100 Subject: [PATCH 10/10] Fix linting --- haystack/evaluation/eval.py | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py index 6fdc7138a6..28b4b76bbe 100644 --- a/haystack/evaluation/eval.py +++ b/haystack/evaluation/eval.py @@ -40,7 +40,6 @@ def __init__( Metric.SAS: self._calculate_sas, } - # pylint: disable=too-many-return-statements def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], **kwargs) -> MetricsResult: """ Calculate evaluation metrics based on the provided Metric or using the custom metric function. @@ -49,7 +48,7 @@ def calculate_metrics(self, metric: Union[Metric, Callable[..., MetricsResult]], :return: MetricsResult containing the calculated metric. """ - if metric in self._supported_metrics: + if isinstance(metric, Metric): return self._supported_metrics[metric](**kwargs) return metric(self, **kwargs)
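
For reference, a minimal usage sketch of the API as it stands after patch 10: built-in metrics are selected through the `Metric` enum, while any callable that accepts the `EvaluationResult` and returns a `MetricsResult` can be passed as a custom metric. The sketch constructs the `EvaluationResult` directly, as the unit tests in patch 09 do, rather than running a real pipeline through `eval()`; the empty `Pipeline()` and the `num_outputs` metric are illustrative placeholders only, and the built-in metric values are still hard-coded stubs at this point in the series (e.g. `Metric.EM` returns 1.0 unconditionally).

from haystack import Pipeline
from haystack.evaluation import EvaluationResult, Metric, MetricsResult

# Construct an EvaluationResult directly, as the unit tests in patch 09 do.
# In practice this object is produced by haystack.evaluation.eval(...).
result = EvaluationResult(runnable=Pipeline(), inputs=[], outputs=[], expected_outputs=[])

# Built-in metrics are selected via the Metric enum; extra keyword arguments
# are forwarded to the metric implementation.
em = result.calculate_metrics(Metric.EM)   # {"exact_match": 1.0} (placeholder value)
em.save("exact_match_score.json")          # MetricsResult is a dict, serialized to JSON

# A custom metric is any callable taking the EvaluationResult and returning a
# MetricsResult; calculate_metrics passes the result object plus **kwargs to it.
def num_outputs(eval_result: EvaluationResult, **kwargs) -> MetricsResult:
    # Hypothetical metric, used only for illustration.
    return MetricsResult({"num_outputs": len(eval_result.outputs)})

custom = result.calculate_metrics(num_outputs)  # {"num_outputs": 0}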