From 6f14f90cad0d74929d0cc77935c0f02007d99f66 Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Wed, 17 Jan 2024 10:53:53 -0500
Subject: [PATCH] add genai metrics

Signed-off-by: Kartik Choudhary
---
 .../rai_text_insights/metrics/coherence.py    | 84 ++++++++++++++++++
 .../rai_text_insights/metrics/equivalence.py  | 89 +++++++++++++++++++
 .../rai_text_insights/metrics/fluency.py      | 84 ++++++++++++++++++
 .../rai_text_insights/metrics/groundedness.py | 82 ++++++++++++++++++
 .../rai_text_insights/metrics/relevance.py    | 89 +++++++++++++++++++
 5 files changed, 428 insertions(+)
 create mode 100644 responsibleai_text/responsibleai_text/rai_text_insights/metrics/coherence.py
 create mode 100644 responsibleai_text/responsibleai_text/rai_text_insights/metrics/equivalence.py
 create mode 100644 responsibleai_text/responsibleai_text/rai_text_insights/metrics/fluency.py
 create mode 100644 responsibleai_text/responsibleai_text/rai_text_insights/metrics/groundedness.py
 create mode 100644 responsibleai_text/responsibleai_text/rai_text_insights/metrics/relevance.py

diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/metrics/coherence.py b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/coherence.py
new file mode 100644
index 0000000000..e27d9f3eb4
--- /dev/null
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/coherence.py
@@ -0,0 +1,84 @@
+"""Coherence metric."""
+
+import datasets
+import evaluate
+import pandas as pd
+
+logger = evaluate.logging.get_logger(__name__)
+
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """The coherence metric.
+"""
+
+_KWARGS_DESCRIPTION = """
+Scores the coherence of each predicted answer (``predictions``) to its
+question (``references``) on a 1-5 scale, using the model passed through the
+``wrapper_model`` keyword argument; responses that cannot be parsed score 0.
+"""
+
+_SYS_PROMPT = """
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
+""".strip()
+
+_TEMPLATE = """
+Coherence of an answer is measured by how well all the sentences fit together and sound natural as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of the answer from one to five stars using the following rating scale:
+One star: the answer completely lacks coherence
+Two stars: the answer mostly lacks coherence
+Three stars: the answer is partially coherent
+Four stars: the answer is mostly coherent
+Five stars: the answer has perfect coherency
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+QUESTION:
+{question}
+
+ANSWER:
+{prediction}
+""".strip()
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Coherence(evaluate.Metric):
+    def _info(self):
+
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string", id="sequence"),
+                    "references": datasets.Value("string", id="sequence")
+                }
+            ),
+        )
+
+    def _compute(self, *, predictions=None, references=None, **kwargs):
+        m = []
+        templated_ques = []
+
+        # Fill the evaluation prompt with each (question, answer) pair.
+        for p, r in zip(predictions, references):
+            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))
+
+        model = kwargs['wrapper_model']
+
+        inp = pd.DataFrame({
+            'questions': templated_ques,
+            'sys_prompt': _SYS_PROMPT})
+
+        responses = model.predict(inp)
+
+        # The model is asked for an integer from 1 to 5; unparsable replies score 0.
+        for r in responses:
+            try:
+                m.append(int(r))
+            except (ValueError, TypeError) as e:
+                logger.warning('Failed to parse metric `%s`: %s', r, e)
+                m.append(0)
+        return {'scores': m}
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/metrics/equivalence.py b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/equivalence.py
new file mode 100644
index 0000000000..e2acba7eb6
--- /dev/null
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/equivalence.py
@@ -0,0 +1,89 @@
+"""Equivalence metric."""
+
+import datasets
+import evaluate
+import pandas as pd
+
+logger = evaluate.logging.get_logger(__name__)
+
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """The equivalence metric.
+"""
+
+_KWARGS_DESCRIPTION = """
+Scores how equivalent each predicted answer (``predictions``) is to the
+correct answer (``answers``) for its question (``references``) on a 1-5
+scale, using the ``wrapper_model`` keyword argument; unparsable responses score 0.
+"""
+
+_SYS_PROMPT = """
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
+""".strip()
+
+_TEMPLATE = """
+Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer are similar or equivalent to the correct answer, then the value of the Equivalence metric should be high; otherwise it should be low. Given the question, the correct answer, and the predicted answer, determine the value of the Equivalence metric using the following rating scale:
+One star: the predicted answer is not at all similar to the correct answer
+Two stars: the predicted answer is mostly not similar to the correct answer
+Three stars: the predicted answer is somewhat similar to the correct answer
+Four stars: the predicted answer is mostly similar to the correct answer
+Five stars: the predicted answer is completely similar to the correct answer
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+QUESTION:
+{question}
+
+CORRECT ANSWER:
+{answer}
+
+PREDICTED ANSWER:
+{prediction}
+""".strip()
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Equivalence(evaluate.Metric):
+    def _info(self):
+
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string", id="sequence"),
+                    "references": datasets.Value("string", id="sequence"),
+                    "answers": datasets.Value("string", id="sequence")
+                }
+            ),
+        )
+
+    def _compute(self, *, predictions=None, references=None, **kwargs):
+        m = []
+        templated_ques = []
+
+        answers = kwargs['answers']
+        # Fill the evaluation prompt with each (question, correct answer, prediction) triple.
+        for p, r, a in zip(predictions, references, answers):
+            templated_ques.append(_TEMPLATE.format(question=r, prediction=p, answer=a))
+
+        model = kwargs['wrapper_model']
+
+        inp = pd.DataFrame({
+            'questions': templated_ques,
+            'sys_prompt': _SYS_PROMPT})
+
+        responses = model.predict(inp)
+
+        # The model is asked for an integer from 1 to 5; unparsable replies score 0.
+        for r in responses:
+            try:
+                m.append(int(r))
+            except (ValueError, TypeError) as e:
+                logger.warning('Failed to parse metric `%s`: %s', r, e)
+                m.append(0)
+        return {'scores': m}
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/metrics/fluency.py b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/fluency.py
new file mode 100644
index 0000000000..169a88bec9
--- /dev/null
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/fluency.py
@@ -0,0 +1,84 @@
+"""Fluency metric."""
+
+import datasets
+import evaluate
+import pandas as pd
+
+logger = evaluate.logging.get_logger(__name__)
+
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """The fluency metric.
+"""
+
+_KWARGS_DESCRIPTION = """
+Scores the fluency of each predicted answer (``predictions``) to its
+question (``references``) on a 1-5 scale, using the model passed through the
+``wrapper_model`` keyword argument; responses that cannot be parsed score 0.
+"""
+
+_SYS_PROMPT = """
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
+""".strip()
+
+_TEMPLATE = """
+Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer from one to five stars using the following rating scale:
+One star: the answer completely lacks fluency
+Two stars: the answer mostly lacks fluency
+Three stars: the answer is partially fluent
+Four stars: the answer is mostly fluent
+Five stars: the answer has perfect fluency
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+QUESTION:
+{question}
+
+ANSWER:
+{prediction}
+""".strip()
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Fluency(evaluate.Metric):
+    def _info(self):
+
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string", id="sequence"),
+                    "references": datasets.Value("string", id="sequence")
+                }
+            ),
+        )
+
+    def _compute(self, *, predictions=None, references=None, **kwargs):
+        m = []
+        templated_ques = []
+
+        # Fill the evaluation prompt with each (question, answer) pair.
+        for p, r in zip(predictions, references):
+            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))
+
+        model = kwargs['wrapper_model']
+
+        inp = pd.DataFrame({
+            'questions': templated_ques,
+            'sys_prompt': _SYS_PROMPT})
+
+        responses = model.predict(inp)
+
+        # The model is asked for an integer from 1 to 5; unparsable replies score 0.
+        for r in responses:
+            try:
+                m.append(int(r))
+            except (ValueError, TypeError) as e:
+                logger.warning('Failed to parse metric `%s`: %s', r, e)
+                m.append(0)
+        return {'scores': m}
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/metrics/groundedness.py b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/groundedness.py
new file mode 100644
index 0000000000..8d4f42bc16
--- /dev/null
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/groundedness.py
@@ -0,0 +1,82 @@
+"""Groundedness metric."""
+
+import datasets
+import evaluate
+import pandas as pd
+
+logger = evaluate.logging.get_logger(__name__)
+
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """The groundedness metric.
+"""
+
+_KWARGS_DESCRIPTION = """
+Scores how well each predicted answer (``predictions``) is grounded in its
+context (``references``) on a 1-5 scale, using the model passed through the
+``wrapper_model`` keyword argument; responses that cannot be parsed score 0.
+"""
+
+_SYS_PROMPT = """
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
+""".strip()
+
+_TEMPLATE = """
+You will be presented with a CONTEXT and an ANSWER about that CONTEXT. Decide whether the ANSWER is entailed by the CONTEXT by selecting one of the following three labels:
+1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
+2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
+3. An integer score between 1 and 5, and if such an integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information.
+Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails.
+Note that the ANSWER is generated by a computer system; it can contain certain symbols, which should not be a negative factor in the evaluation.
+
+CONTEXT:
+{context}
+
+ANSWER:
+{prediction}
+""".strip()
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Groundedness(evaluate.Metric):
+    def _info(self):
+
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string", id="sequence"),
+                    "references": datasets.Value("string", id="sequence")
+                }
+            ),
+        )
+
+    def _compute(self, *, predictions=None, references=None, **kwargs):
+        m = []
+        templated_ques = []
+
+        # Fill the evaluation prompt with each (context, answer) pair.
+        for p, r in zip(predictions, references):
+            templated_ques.append(_TEMPLATE.format(context=r, prediction=p))
+
+        model = kwargs['wrapper_model']
+
+        inp = pd.DataFrame({
+            'questions': templated_ques,
+            'sys_prompt': _SYS_PROMPT})
+
+        responses = model.predict(inp)
+
+        # The model is asked for an integer from 1 to 5; unparsable replies score 0.
+        for r in responses:
+            try:
+                m.append(int(r))
+            except (ValueError, TypeError) as e:
+                logger.warning('Failed to parse metric `%s`: %s', r, e)
+                m.append(0)
+        return {'scores': m}
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/metrics/relevance.py b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/relevance.py
new file mode 100644
index 0000000000..b2d736d220
--- /dev/null
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/metrics/relevance.py
@@ -0,0 +1,89 @@
+"""Relevance metric."""
+
+import datasets
+import evaluate
+import pandas as pd
+
+logger = evaluate.logging.get_logger(__name__)
+
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """The relevance metric.
+"""
+
+_KWARGS_DESCRIPTION = """
+Scores the relevance of each predicted answer (``predictions``) to its
+question (``questions``), given the context (``references``), on a 1-5 scale
+using the ``wrapper_model`` keyword argument; unparsable responses score 0.
+"""
+
+_SYS_PROMPT = """
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
+Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
+""".strip()
+
+_TEMPLATE = """
+Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer from one to five stars using the following rating scale:
+One star: the answer completely lacks relevance
+Two stars: the answer mostly lacks relevance
+Three stars: the answer is partially relevant
+Four stars: the answer is mostly relevant
+Five stars: the answer has perfect relevance
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+CONTEXT:
+{context}
+
+QUESTION:
+{question}
+
+ANSWER:
+{prediction}
+""".strip()
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Relevance(evaluate.Metric):
+    def _info(self):
+
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string", id="sequence"),
+                    "references": datasets.Value("string", id="sequence"),
+                    "questions": datasets.Value("string", id="sequence")
+                }
+            ),
+        )
+
+    def _compute(self, *, predictions=None, references=None, **kwargs):
+        m = []
+        templated_ques = []
+
+        questions = kwargs['questions']
+        # Fill the evaluation prompt with each (context, question, answer) triple.
+        for p, r, q in zip(predictions, references, questions):
+            templated_ques.append(_TEMPLATE.format(context=r, question=q, prediction=p))
+
+        model = kwargs['wrapper_model']
+
+        inp = pd.DataFrame({
+            'questions': templated_ques,
+            'sys_prompt': _SYS_PROMPT})
+
+        responses = model.predict(inp)
+
+        # The model is asked for an integer from 1 to 5; unparsable replies score 0.
+        for r in responses:
+            try:
+                m.append(int(r))
+            except (ValueError, TypeError) as e:
+                logger.warning('Failed to parse metric `%s`: %s', r, e)
+                m.append(0)
+        return {'scores': m}
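
-- 
Usage note: a minimal sketch of how one of these metrics might be invoked, assuming a
hypothetical wrapper model whose ``predict`` method accepts the pandas DataFrame built
in ``_compute`` (columns ``questions`` and ``sys_prompt``) and returns one "1"-"5"
string per row:

    import evaluate
    import pandas as pd

    class FixedScoreWrapper:
        # Hypothetical stand-in for an LLM wrapper, used only for illustration.
        def predict(self, inp: pd.DataFrame):
            # One rating per templated question; a real wrapper would query an LLM.
            return ["5"] * len(inp)

    # Load the local metric script added by this patch (path relative to the repo root).
    coherence = evaluate.load(
        "responsibleai_text/responsibleai_text/"
        "rai_text_insights/metrics/coherence.py")
    result = coherence.compute(
        predictions=["Paris is the capital of France."],
        references=["What is the capital of France?"],
        wrapper_model=FixedScoreWrapper())
    print(result)  # {'scores': [5]}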