add genai metrics
Signed-off-by: Kartik Choudhary <[email protected]>
kartik727 authored and imatiach-msft committed Jan 23, 2024
1 parent db46fa9 commit 6f14f90
Showing 5 changed files with 412 additions and 0 deletions.
@@ -0,0 +1,81 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The coherence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together and sound natural as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of the answer from one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
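A minimal usage sketch for the metric above, assuming the file is saved locally as coherence.py and that wrapper_model can be any object whose predict method accepts a DataFrame with 'questions' and 'sys_prompt' columns and returns one rating string per row. The MockWrapperModel below is purely illustrative; in practice the wrapper would call an LLM.

import evaluate
import pandas as pd


class MockWrapperModel:
    """Illustrative stand-in for the wrapper model expected by _compute."""

    def predict(self, inp: pd.DataFrame):
        # Always answer "5"; a real wrapper would send each prompt to a model.
        return ["5"] * len(inp)


coherence = evaluate.load("./coherence.py")  # assumed local path to the script above
result = coherence.compute(
    predictions=["Paris is the capital of France."],
    references=["What is the capital of France?"],
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}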

@@ -0,0 +1,86 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The equivalence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer are similar or equivalent to the correct answer, then the value of the Equivalence metric should be high; otherwise it should be low. Given the question, correct answer, and predicted answer, determine the value of the Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
CORRECT ANSWER:
{answer}
PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
"answers": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, correct answer, predicted answer) triple.
        answers = kwargs['answers']
        for p, r, a in zip(predictions, references, answers):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p, answer=a))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
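Equivalence declares a third input column, answers, holding the correct answers; evaluate forwards it to _compute alongside predictions and references, where it is read back out of kwargs. A sketch under the same assumptions as above, with the file saved as equivalence.py and the illustrative MockWrapperModel reused.

equivalence = evaluate.load("./equivalence.py")  # assumed local path to the script above
result = equivalence.compute(
    predictions=["Paris"],
    references=["What is the capital of France?"],
    answers=["The capital of France is Paris."],
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}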

@@ -0,0 +1,81 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The fluency metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer from one to five stars using the following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
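Fluency follows the same call pattern as Coherence. One behavior shared by all four metrics: responses that cannot be parsed as integers are recorded as 0, so callers may want to drop those sentinel scores before aggregating. A hedged sketch, assuming the file above is saved as fluency.py and reusing the illustrative MockWrapperModel.

fluency = evaluate.load("./fluency.py")  # assumed local path to the script above
result = fluency.compute(
    predictions=["The answer reads smoothly and is grammatically correct."],
    references=["Is the answer well written?"],
    wrapper_model=MockWrapperModel(),
)
# Exclude the 0 sentinels produced by unparseable model responses before averaging.
valid = [s for s in result["scores"] if s != 0]
mean_fluency = sum(valid) / len(valid) if valid else None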

@@ -0,0 +1,78 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The groundedness metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information.
Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails.
Note that the ANSWER is generated by a computer system; it can contain certain symbols, which should not be a negative factor in the evaluation.
CONTEXT:
{context}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Groundedness(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (context, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(context=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
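For Groundedness, the references column is repurposed to carry the grounding CONTEXT rather than a question, so the rendered prompt pairs a context passage with the generated answer. A sketch under the same assumptions as the earlier examples, with the file saved as groundedness.py.

groundedness = evaluate.load("./groundedness.py")  # assumed local path to the script above
result = groundedness.compute(
    predictions=["The Eiffel Tower is located in Paris."],
    references=["The Eiffel Tower is a wrought-iron tower in Paris, France."],  # context, not a question
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}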
