add genai metrics
Signed-off-by: Kartik Choudhary <[email protected]>
kartik727 authored and imatiach-msft committed Jan 23, 2024
1 parent db46fa9 commit 6f14f90
Showing 5 changed files with 412 additions and 0 deletions.
@@ -0,0 +1,81 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The coherence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together and sound natural as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of the answer from one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
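A minimal usage sketch for the metric above, assuming the file is saved locally as coherence.py and that wrapper_model can be any object whose predict method accepts a DataFrame with 'questions' and 'sys_prompt' columns and returns one rating string per row. The MockWrapperModel below is purely illustrative; in practice the wrapper would call an LLM.

import evaluate
import pandas as pd


class MockWrapperModel:
    """Illustrative stand-in for the wrapper model expected by _compute."""

    def predict(self, inp: pd.DataFrame):
        # Always answer "5"; a real wrapper would send each prompt to a model.
        return ["5"] * len(inp)


coherence = evaluate.load("./coherence.py")  # assumed local path to the script above
result = coherence.compute(
    predictions=["Paris is the capital of France."],
    references=["What is the capital of France?"],
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}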

@@ -0,0 +1,86 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The equivalence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer are similar or equivalent to the correct answer, then the value of the Equivalence metric should be high; otherwise it should be low. Given the question, correct answer, and predicted answer, determine the value of the Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
CORRECT ANSWER:
{answer}
PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
"answers": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, correct answer, predicted answer) triple.
        answers = kwargs['answers']
        for p, r, a in zip(predictions, references, answers):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p, answer=a))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
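Equivalence declares a third input column, answers, holding the correct answers; evaluate forwards it to _compute alongside predictions and references, where it is read back out of kwargs. A sketch under the same assumptions as above, with the file saved as equivalence.py and the illustrative MockWrapperModel reused.

equivalence = evaluate.load("./equivalence.py")  # assumed local path to the script above
result = equivalence.compute(
    predictions=["Paris"],
    references=["What is the capital of France?"],
    answers=["The capital of France is Paris."],
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}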

@@ -0,0 +1,81 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The fluency metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer from one to five stars using the following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (question, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(question=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
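Fluency follows the same call pattern as Coherence. One behavior shared by all four metrics: responses that cannot be parsed as integers are recorded as 0, so callers may want to drop those sentinel scores before aggregating. A hedged sketch, assuming the file above is saved as fluency.py and reusing the illustrative MockWrapperModel.

fluency = evaluate.load("./fluency.py")  # assumed local path to the script above
result = fluency.compute(
    predictions=["The answer reads smoothly and is grammatically correct."],
    references=["Is the answer well written?"],
    wrapper_model=MockWrapperModel(),
)
# Exclude the 0 sentinels produced by unparseable model responses before averaging.
valid = [s for s in result["scores"] if s != 0]
mean_fluency = sum(valid) / len(valid) if valid else None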

@@ -0,0 +1,78 @@
"""Groundedness metric."""

import datasets
import evaluate
import pandas as pd

logger = evaluate.logging.get_logger(__name__)


_CITATION = """
"""

_DESCRIPTION = """The groundedness metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else.
""".strip()

_TEMPLATE = """
1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information.
Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails.
Note that the ANSWER is generated by a computer system; it can contain certain symbols, which should not be a negative factor in the evaluation.
CONTEXT:
{context}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Groundedness(evaluate.Metric):
def _info(self):

return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence")
}
),
)

    def _compute(self, *, predictions=None, references=None, **kwargs):
        scores = []
        templated_ques = []

        # Render one evaluation prompt per (context, answer) pair.
        for p, r in zip(predictions, references):
            templated_ques.append(_TEMPLATE.format(context=r, prediction=p))

        model = kwargs['wrapper_model']

        inp = pd.DataFrame({
            'questions': templated_ques,
            'sys_prompt': _SYS_PROMPT})

        responses = model.predict(inp)

        # Each response should be a single integer rating from 1 to 5;
        # responses that cannot be parsed are recorded as 0.
        for r in responses:
            try:
                scores.append(int(r))
            except ValueError as e:
                logger.warning('Failed to parse metric `%s`: %s', r, e)
                scores.append(0)
        return {'scores': scores}
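For Groundedness, the references column is repurposed to carry the grounding CONTEXT rather than a question, so the rendered prompt pairs a context passage with the generated answer. A sketch under the same assumptions as the earlier examples, with the file saved as groundedness.py.

groundedness = evaluate.load("./groundedness.py")  # assumed local path to the script above
result = groundedness.compute(
    predictions=["The Eiffel Tower is located in Paris."],
    references=["The Eiffel Tower is a wrought-iron tower in Paris, France."],  # context, not a question
    wrapper_model=MockWrapperModel(),
)
print(result)  # {'scores': [5]}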
