Minimize requirements for user data for OPEA ragas #136

Merged
merged 10 commits on Sep 24, 2024
68 changes: 36 additions & 32 deletions evals/metrics/ragas/ragas.py
@@ -2,7 +2,6 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

#
import os
from typing import Dict, Optional, Union
@@ -26,7 +25,6 @@ def __init__(
embeddings: Optional[Embeddings] = None,
metrics: Optional[list[str]] = None,
):

self.threshold = threshold
self.model = model
self.embeddings = embeddings
@@ -42,7 +40,13 @@ def __init__(
"reference_free_rubrics_score",
]

async def a_measure(self, test_case: Dict):
return self.measure(test_case)

def measure(self, test_case: Dict):
# sends to server
try:
from ragas import evaluate
from ragas.metrics import (
answer_correctness,
answer_relevancy,
@@ -55,12 +59,10 @@ def __init__(
)
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")

try:
from datasets import Dataset
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install dataset")

self.metrics_instance = {
"answer_correctness": answer_correctness,
"answer_relevancy": answer_relevancy,
@@ -71,7 +73,6 @@ def __init__(
"context_utilization": context_utilization,
"reference_free_rubrics_score": reference_free_rubrics_score,
}

# Set LLM model
openai_key = os.getenv("OPENAI_API_KEY", None)
if openai_key is not None:
@@ -81,14 +82,13 @@ def __init__(
print("LLM endpoint: ", self.model)
self.chat_model = HuggingFaceEndpoint(
endpoint_url=self.model,
task="text-generation",
max_new_tokens=1024,
do_sample=False,
timeout=600,
)
else:
print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.")
self.chat_model = self.model

# initialize metrics
# Create a dataset from the test case
# Convert the Dict to a format compatible with Dataset
if self.metrics is not None:
tmp_metrics = []
# check supported list
@@ -106,10 +106,8 @@ def __init__(
if metric == "answer_relevancy" and self.embeddings is None:
raise ValueError("answer_relevancy metric need provide embeddings model.")
tmp_metrics.append(self.metrics_instance[metric])

self.metrics = tmp_metrics

else: # default metrics
else:
self.metrics = [
answer_relevancy,
faithfulness,
@@ -118,28 +116,34 @@ def __init__(
context_precision,
context_recall,
]

async def a_measure(self, test_case: Dict):
return self.measure(test_case)

def measure(self, test_case: Dict):
from ragas import evaluate

try:
from datasets import Dataset
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install dataset")

# Create a dataset from the test case
# Convert the Dict to a format compatible with Dataset
data = {
"question": test_case["question"],
"contexts": test_case["contexts"],
"answer": test_case["answer"],
"ground_truth": test_case["ground_truth"],
# Find necessary input fields using the given metrics
_required_columns = set()
is_latest = faithfulness
column_map = {  # maps the new ragas column names to the old names used in test_case
"user_input": "question",
"response": "answer",
"reference": "ground_truth",
"retrieved_contexts": "contexts",
}
for metric in self.metrics:
if hasattr(metric, "_required_columns"):
for column in list(metric._required_columns.values())[0]:
_required_columns.add(column_map[column])
elif hasattr(metric, "evaluation_mode"):
from ragas.metrics.base import get_required_columns

for column in get_required_columns(metric.evaluation_mode):
_required_columns.add(column)
else:
print("metric has no attribute denoting required columns")

print("Required columns for given list of metrics are = {}".format(_required_columns))

# get only necessary columns from test case
data = {column: test_case[column] for column in _required_columns}
dataset = Dataset.from_dict(data)

# evaluate
self.score = evaluate(
dataset,
metrics=self.metrics,
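Note: the sketch below is not part of this PR; it just mirrors the required-column logic from the hunk above so it can be tried standalone. It assumes `ragas` is installed and that metric objects expose either `_required_columns` (newer releases) or `evaluation_mode` (older releases), exactly as the merged code checks.

```python
# Standalone sketch mirroring the PR's column derivation (illustrative only).
from ragas.metrics import answer_relevancy, faithfulness

# Same mapping as in the diff: new ragas column names -> old test_case keys.
COLUMN_MAP = {
    "user_input": "question",
    "response": "answer",
    "reference": "ground_truth",
    "retrieved_contexts": "contexts",
}


def required_user_fields(metrics):
    """Return the minimal set of test_case keys needed by the given metrics."""
    required = set()
    for metric in metrics:
        if hasattr(metric, "_required_columns"):
            # Newer ragas: dict of {metric type: column names}; take the first entry,
            # as the merged code does.
            for column in list(metric._required_columns.values())[0]:
                required.add(COLUMN_MAP[column])
        elif hasattr(metric, "evaluation_mode"):
            # Older ragas: resolve the columns from the metric's evaluation mode.
            from ragas.metrics.base import get_required_columns

            required.update(get_required_columns(metric.evaluation_mode))
    return required


print(required_user_fields([faithfulness, answer_relevancy]))
# e.g. {'question', 'answer', 'contexts'} -- no ground_truth required
```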
1 change: 1 addition & 0 deletions tests/requirements.txt
@@ -1,4 +1,5 @@
bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118
jieba
langchain_community
langchain_huggingface
lm-eval==0.4.3
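With the minimized input requirements, a caller only has to pass the columns that the chosen metrics actually read. Below is a hypothetical usage sketch: the class name `RagasMetric`, the import path, the endpoint URL, and the sample strings are illustrative assumptions rather than part of this diff.

```python
# Hypothetical usage sketch; class name, import path, and endpoint are placeholders.
from evals.metrics.ragas.ragas import RagasMetric  # assumed location of the metric class

metric = RagasMetric(
    threshold=0.5,                  # pass/fail threshold stored by the constructor
    model="http://localhost:8080",  # HuggingFace TGI endpoint URL (assumed)
    metrics=["faithfulness"],       # a reference-free metric from the supported list
)

# faithfulness only needs question, answer, and contexts -- no ground_truth.
test_case = {
    "question": ["What does OPEA stand for?"],
    "answer": ["OPEA stands for Open Platform for Enterprise AI."],
    "contexts": [["OPEA is the Open Platform for Enterprise AI project."]],
}

metric.measure(test_case)  # runs ragas.evaluate() over just these columns
```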