Update lm-eval to 0.4.3 (#45)
* update lm-eval to 0.4.3

Signed-off-by: changwangss <[email protected]>

* add higher_is_better metrics info

Signed-off-by: changwangss <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: changwangss <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
changwangss and pre-commit-ci[bot] authored Jul 15, 2024
1 parent f9ca6df commit 89c8255
Showing 4 changed files with 42 additions and 9 deletions.
2 changes: 1 addition & 1 deletion evals/evaluation/lm_evaluation_harness/accuracy.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 from lm_eval import utils
-from lm_eval.logging_utils import WandbLogger
+from lm_eval.loggers import WandbLogger
 from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
 
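In lm-eval 0.4.3 the logging helpers moved from the lm_eval.logging_utils module to lm_eval.loggers, which is what this import change (and the matching one in evaluator.py below) tracks. For code that has to tolerate either pinned version, a small fallback import along these lines could work; this is a hedged sketch, not part of the commit:

# Compatibility sketch (not in this commit): prefer the 0.4.3 module path,
# fall back to the 0.4.2 one if it is missing.
try:
    from lm_eval.loggers import WandbLogger  # lm-eval >= 0.4.3
    from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
except ImportError:
    from lm_eval.logging_utils import (  # lm-eval == 0.4.2
        WandbLogger,
        add_env_info,
        get_git_commit_hash,
    )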
45 changes: 39 additions & 6 deletions evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
@@ -28,7 +28,7 @@
     print_writeout,
     run_task_tests,
 )
-from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string

@@ -472,7 +472,14 @@ def evaluate(
         # aggregate results ; run bootstrap CIs
         for task_output in eval_tasks:
             task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-        results, samples, configs, versions, num_fewshot = consolidate_results(eval_tasks)
+        (
+            results,
+            samples,
+            configs,
+            versions,
+            num_fewshot,
+            higher_is_better,
+        ) = consolidate_results(eval_tasks)
 
         ### Calculate group metrics ###
         if bool(results):
@@ -483,6 +490,23 @@ def evaluate(
                     # or `task_name: []`.
                     # we only want to operate on groups here.
                     continue
+
+                # collect all higher_is_better values for metrics
+                # in the group's subtasks.
+                # TODO: clean this up ; unify with the below metric_list loop?
+                _higher_is_better = {}
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better.keys():
+                            _higher_is_better[m] = h
+                        if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better
+
+                # collect all metric keys used by a subtask in the group.
                 metric_list = list(
                     {
                         key
@@ -507,10 +531,8 @@ def evaluate(
                     else:
                         results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                     # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                    # To use the old (likely incorrect) variance formula,
-                    # comment out the above and uncomment this line:
-                    # results[group][stderr] = \
-                    #     lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+                    # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                    # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
 
                 results[group]["samples"] = sum(sizes)
 
@@ -540,6 +562,17 @@ def evaluate(
             "configs": dict(sorted(configs.items())),
             "versions": dict(sorted(versions.items())),
             "n-shot": dict(sorted(num_fewshot.items())),
+            "higher_is_better": dict(sorted(higher_is_better.items())),
+            "n-samples": {
+                task_output.task_name: {
+                    "original": len(task_output.task.eval_docs),
+                    "effective": min(
+                        limit if limit else len(task_output.task.eval_docs),
+                        len(task_output.task.eval_docs),
+                    ),
+                }
+                for task_output in eval_tasks
+            },
         }
         if log_samples:
             results_dict["samples"] = dict(samples)
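The substantive change above is the higher_is_better bookkeeping: consolidate_results now also reports, per task, whether each metric improves as it goes up, and the group loop merges those per-subtask directions into one mapping per group, warning and falling back to None when subtasks disagree. The sketch below mirrors that merge as a standalone helper with a made-up name and example data, purely for illustration:

# Hypothetical standalone version of the merge added above (names and data are illustrative).
import logging
from typing import Dict, Optional

logger = logging.getLogger(__name__)


def merge_higher_is_better(per_task: Dict[str, Dict[str, Optional[bool]]], group: str) -> Dict[str, Optional[bool]]:
    """Merge per-subtask metric directions into a single mapping for the group."""
    merged: Dict[str, Optional[bool]] = {}
    for task, metrics in per_task.items():
        for metric, direction in metrics.items():
            if metric not in merged:
                merged[metric] = direction
            elif merged[metric] is not None and merged[metric] != direction:
                # Subtasks disagree on whether higher is better; give up on a direction.
                logger.warning("Higher_is_better values for metric %s in group %s are not consistent. Defaulting to None.", metric, group)
                merged[metric] = None
    return merged


# Two subtasks agree on "acc" but disagree on "loss":
print(merge_higher_is_better(
    {"taskA": {"acc": True, "loss": False}, "taskB": {"acc": True, "loss": True}},
    group="demo_group",
))  # -> {'acc': True, 'loss': None}

The new n-samples entry in results_dict is simpler bookkeeping: for each task it records the original document count and the effective count, which is the same count capped by limit when a limit was passed.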
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
