-
Notifications
You must be signed in to change notification settings - Fork 360
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Kartik Choudhary <[email protected]>
- Loading branch information
1 parent
db46fa9
commit 6f14f90
Showing
5 changed files
with
412 additions
and
0 deletions.
There are no files selected for viewing
81 changes: 81 additions & 0 deletions
81
responsibleai_text/responsibleai_text/rai_text_insights/metrics/coherence.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
"""Groundedness metric.""" | ||
|
||
import datasets | ||
import evaluate | ||
import pandas as pd | ||
|
||
logger = evaluate.logging.get_logger(__name__) | ||
|
||
|
||
_CITATION = """ | ||
""" | ||
|
||
_DESCRIPTION = """The coherence metric. | ||
""" | ||
|
||
_KWARGS_DESCRIPTION = """ | ||
**SOME DESCRIPTION** | ||
""" | ||
|
||
_SYS_PROMPT = """ | ||
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. | ||
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. | ||
""".strip() | ||
|
||
_TEMPLATE = """ | ||
Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: | ||
One star: the answer completely lacks coherence | ||
Two stars: the answer mostly lacks coherence | ||
Three stars: the answer is partially coherent | ||
Four stars: the answer is mostly coherent | ||
Five stars: the answer has perfect coherency | ||
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. | ||
QUESTION: | ||
{question} | ||
ANSWER: | ||
{prediction} | ||
""".strip() | ||
|
||
|
||
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    """LLM-judged coherence metric: scores each answer from 1 to 5."""

    def _info(self):
        """Return metric metadata; both inputs are plain strings."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Score the coherence of each prediction with a judge model.

        :param predictions: generated answers to score.
        :param references: the questions the answers respond to.
        :param kwargs: must contain ``wrapper_model``, an object whose
            ``predict(DataFrame)`` returns one string response per row.
        :return: dict with key ``'scores'``: list of ints, one per example;
            0 is used as a sentinel for unparseable judge output.
        """
        templated_ques = [
            _TEMPLATE.format(question=r, prediction=p)
            for p, r in zip(predictions, references)
        ]

        # pd.DataFrame raises "If using all scalar values, you must pass an
        # index" when the list column is empty and sys_prompt is a scalar,
        # so short-circuit on empty input.
        if not templated_ques:
            return {'scores': []}

        model = kwargs['wrapper_model']

        # The scalar sys_prompt is broadcast by pandas to every row.
        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
|
86 changes: 86 additions & 0 deletions
86
responsibleai_text/responsibleai_text/rai_text_insights/metrics/equivalence.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
"""Groundedness metric.""" | ||
|
||
import datasets | ||
import evaluate | ||
import pandas as pd | ||
|
||
logger = evaluate.logging.get_logger(__name__) | ||
|
||
|
||
_CITATION = """ | ||
""" | ||
|
||
_DESCRIPTION = """The equivalence metric. | ||
""" | ||
|
||
_KWARGS_DESCRIPTION = """ | ||
**SOME DESCRIPTION** | ||
""" | ||
|
||
_SYS_PROMPT = """ | ||
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. | ||
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. | ||
""".strip() | ||
|
||
_TEMPLATE = """ | ||
Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: | ||
One star: the predicted answer is not at all similar to the correct answer | ||
Two stars: the predicted answer is mostly not similar to the correct answer | ||
Three stars: the predicted answer is somewhat similar to the correct answer | ||
Four stars: the predicted answer is mostly similar to the correct answer | ||
Five stars: the predicted answer is completely similar to the correct answer | ||
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. | ||
QUESTION: | ||
{question} | ||
CORRECT ANSWER: | ||
{answer} | ||
PREDICTED ANSWER: | ||
{prediction} | ||
""".strip() | ||
|
||
|
||
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    """LLM-judged equivalence metric: scores prediction vs. gold answer 1-5."""

    def _info(self):
        """Return metric metadata; all three inputs are plain strings."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                    "answers": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Score the equivalence of each prediction with a judge model.

        :param predictions: generated answers to score.
        :param references: the questions the answers respond to.
        :param kwargs: must contain ``answers`` (gold answers, same length
            as ``predictions``) and ``wrapper_model``, an object whose
            ``predict(DataFrame)`` returns one string response per row.
        :raises KeyError: if ``answers`` or ``wrapper_model`` is missing.
        :return: dict with key ``'scores'``: list of ints, one per example;
            0 is used as a sentinel for unparseable judge output.
        """
        answers = kwargs['answers']
        templated_ques = [
            _TEMPLATE.format(question=r, prediction=p, answer=a)
            for p, r, a in zip(predictions, references, answers)
        ]

        # pd.DataFrame raises "If using all scalar values, you must pass an
        # index" when the list column is empty and sys_prompt is a scalar,
        # so short-circuit on empty input.
        if not templated_ques:
            return {'scores': []}

        model = kwargs['wrapper_model']

        # The scalar sys_prompt is broadcast by pandas to every row.
        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
|
81 changes: 81 additions & 0 deletions
81
responsibleai_text/responsibleai_text/rai_text_insights/metrics/fluency.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
"""Groundedness metric.""" | ||
|
||
import datasets | ||
import evaluate | ||
import pandas as pd | ||
|
||
logger = evaluate.logging.get_logger(__name__) | ||
|
||
|
||
_CITATION = """ | ||
""" | ||
|
||
_DESCRIPTION = """The fluency metric. | ||
""" | ||
|
||
_KWARGS_DESCRIPTION = """ | ||
**SOME DESCRIPTION** | ||
""" | ||
|
||
_SYS_PROMPT = """ | ||
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. | ||
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. | ||
""".strip() | ||
|
||
_TEMPLATE = """ | ||
Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: | ||
One star: the answer completely lacks fluency | ||
Two stars: the answer mostly lacks fluency | ||
Three stars: the answer is partially fluent | ||
Four stars: the answer is mostly fluent | ||
Five stars: the answer has perfect fluency | ||
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. | ||
QUESTION: | ||
{question} | ||
ANSWER: | ||
{prediction} | ||
""".strip() | ||
|
||
|
||
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    """LLM-judged fluency metric: scores each answer from 1 to 5."""

    def _info(self):
        """Return metric metadata; both inputs are plain strings."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Score the fluency of each prediction with a judge model.

        :param predictions: generated answers to score.
        :param references: the questions the answers respond to.
        :param kwargs: must contain ``wrapper_model``, an object whose
            ``predict(DataFrame)`` returns one string response per row.
        :return: dict with key ``'scores'``: list of ints, one per example;
            0 is used as a sentinel for unparseable judge output.
        """
        templated_ques = [
            _TEMPLATE.format(question=r, prediction=p)
            for p, r in zip(predictions, references)
        ]

        # pd.DataFrame raises "If using all scalar values, you must pass an
        # index" when the list column is empty and sys_prompt is a scalar,
        # so short-circuit on empty input.
        if not templated_ques:
            return {'scores': []}

        model = kwargs['wrapper_model']

        # The scalar sys_prompt is broadcast by pandas to every row.
        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
|
78 changes: 78 additions & 0 deletions
78
responsibleai_text/responsibleai_text/rai_text_insights/metrics/groundedness.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
"""Groundedness metric.""" | ||
|
||
import datasets | ||
import evaluate | ||
import pandas as pd | ||
|
||
logger = evaluate.logging.get_logger(__name__) | ||
|
||
|
||
_CITATION = """ | ||
""" | ||
|
||
_DESCRIPTION = """The groundedness metric. | ||
""" | ||
|
||
_KWARGS_DESCRIPTION = """ | ||
**SOME DESCRIPTION** | ||
""" | ||
|
||
_SYS_PROMPT = """ | ||
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. | ||
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. | ||
""".strip() | ||
|
||
_TEMPLATE = """ | ||
1. 5: The ANSWER follows logically from the information contained in the CONTEXT. | ||
2. 1: The ANSWER is logically false from the information contained in the CONTEXT. | ||
3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. | ||
Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. | ||
Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. | ||
CONTEXT: | ||
{context} | ||
ANSWER: | ||
{prediction} | ||
""".strip() | ||
|
||
|
||
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Groundedness(evaluate.Metric):
    """LLM-judged groundedness metric: scores answer-vs-context support 1-5."""

    def _info(self):
        """Return metric metadata; both inputs are plain strings."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence")
                }
            ),
        )

    def _compute(self, *, predictions=None, references=None, **kwargs):
        """Score the groundedness of each prediction with a judge model.

        :param predictions: generated answers to score.
        :param references: the source contexts the answers must be
            grounded in.
        :param kwargs: must contain ``wrapper_model``, an object whose
            ``predict(DataFrame)`` returns one string response per row.
        :return: dict with key ``'scores'``: list of ints, one per example;
            0 is used as a sentinel for unparseable judge output.
        """
        templated_ques = [
            _TEMPLATE.format(context=r, prediction=p)
            for p, r in zip(predictions, references)
        ]

        # pd.DataFrame raises "If using all scalar values, you must pass an
        # index" when the list column is empty and sys_prompt is a scalar,
        # so short-circuit on empty input.
        if not templated_ques:
            return {'scores': []}

        model = kwargs['wrapper_model']

        # The scalar sys_prompt is broadcast by pandas to every row.
        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        scores = []
        for response in responses:
            try:
                scores.append(int(response))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', response, e)
                scores.append(0)
        return {'scores': scores}
|
Oops, something went wrong.