Update lm-eval to 0.4.3 (#45)
* update lm-eval to 0.4.3

Signed-off-by: changwangss <[email protected]>

* add higher_is_better metrics info

Signed-off-by: changwangss <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: changwangss <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
changwangss and pre-commit-ci[bot] authored Jul 15, 2024
1 parent f9ca6df commit 89c8255
Showing 4 changed files with 42 additions and 9 deletions.
2 changes: 1 addition & 1 deletion evals/evaluation/lm_evaluation_harness/accuracy.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 from lm_eval import utils
-from lm_eval.logging_utils import WandbLogger
+from lm_eval.loggers import WandbLogger
 from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
 
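In lm-eval 0.4.3 the logging helpers moved from the lm_eval.logging_utils module to lm_eval.loggers, which is what this import change (and the matching one in evaluator.py below) tracks. For code that has to tolerate either pinned version, a small fallback import along these lines could work; this is a hedged sketch, not part of the commit:

# Compatibility sketch (not in this commit): prefer the 0.4.3 module path,
# fall back to the 0.4.2 one if it is missing.
try:
    from lm_eval.loggers import WandbLogger  # lm-eval >= 0.4.3
    from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
except ImportError:
    from lm_eval.logging_utils import (  # lm-eval == 0.4.2
        WandbLogger,
        add_env_info,
        get_git_commit_hash,
    )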
45 changes: 39 additions & 6 deletions evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
@@ -28,7 +28,7 @@
     print_writeout,
     run_task_tests,
 )
-from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string

@@ -472,7 +472,14 @@ def evaluate(
         # aggregate results ; run bootstrap CIs
         for task_output in eval_tasks:
             task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-        results, samples, configs, versions, num_fewshot = consolidate_results(eval_tasks)
+        (
+            results,
+            samples,
+            configs,
+            versions,
+            num_fewshot,
+            higher_is_better,
+        ) = consolidate_results(eval_tasks)
 
         ### Calculate group metrics ###
         if bool(results):
@@ -483,6 +490,23 @@ def evaluate(
                     # or `task_name: []`.
                     # we only want to operate on groups here.
                     continue
+
+                # collect all higher_is_better values for metrics
+                # in the group's subtasks.
+                # TODO: clean this up ; unify with the below metric_list loop?
+                _higher_is_better = {}
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better.keys():
+                            _higher_is_better[m] = h
+                        if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better
+
+                # collect all metric keys used by a subtask in the group.
                 metric_list = list(
                     {
                         key
@@ -507,10 +531,8 @@ def evaluate(
                     else:
                         results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                     # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                    # To use the old (likely incorrect) variance formula,
-                    # comment out the above and uncomment this line:
-                    # results[group][stderr] = \
-                    #     lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+                    # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                    # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
 
                 results[group]["samples"] = sum(sizes)
 
@@ -540,6 +562,17 @@ def evaluate(
             "configs": dict(sorted(configs.items())),
             "versions": dict(sorted(versions.items())),
             "n-shot": dict(sorted(num_fewshot.items())),
+            "higher_is_better": dict(sorted(higher_is_better.items())),
+            "n-samples": {
+                task_output.task_name: {
+                    "original": len(task_output.task.eval_docs),
+                    "effective": min(
+                        limit if limit else len(task_output.task.eval_docs),
+                        len(task_output.task.eval_docs),
+                    ),
+                }
+                for task_output in eval_tasks
+            },
         }
         if log_samples:
             results_dict["samples"] = dict(samples)
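The substantive change above is the higher_is_better bookkeeping: consolidate_results now also reports, per task, whether each metric improves as it goes up, and the group loop merges those per-subtask directions into one mapping per group, warning and falling back to None when subtasks disagree. The sketch below mirrors that merge as a standalone helper with a made-up name and example data, purely for illustration:

# Hypothetical standalone version of the merge added above (names and data are illustrative).
import logging
from typing import Dict, Optional

logger = logging.getLogger(__name__)


def merge_higher_is_better(per_task: Dict[str, Dict[str, Optional[bool]]], group: str) -> Dict[str, Optional[bool]]:
    """Merge per-subtask metric directions into a single mapping for the group."""
    merged: Dict[str, Optional[bool]] = {}
    for task, metrics in per_task.items():
        for metric, direction in metrics.items():
            if metric not in merged:
                merged[metric] = direction
            elif merged[metric] is not None and merged[metric] != direction:
                # Subtasks disagree on whether higher is better; give up on a direction.
                logger.warning("Higher_is_better values for metric %s in group %s are not consistent. Defaulting to None.", metric, group)
                merged[metric] = None
    return merged


# Two subtasks agree on "acc" but disagree on "loss":
print(merge_higher_is_better(
    {"taskA": {"acc": True, "loss": False}, "taskB": {"acc": True, "loss": True}},
    group="demo_group",
))  # -> {'acc': True, 'loss': None}

The new n-samples entry in results_dict is simpler bookkeeping: for each task it records the original document count and the effective count, which is the same count capped by limit when a limit was passed.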
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -1,5 +1,5 @@
 bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 langchain_community
 langchain_huggingface
-lm-eval==0.4.2
+lm-eval==0.4.3
 ragas
